Close

Node.js Web Scraping (Step-By-Step Tutorial)

By Oxylabs on YouTube

Now go inside this folder

				
					cd Webs
				
			

Initialize Node.js

This will create a package.json file in the directory. The file will contain information about the packages that are installed in the folder.

				
					npm init -y
				
			

Wrote to /Users/codeRECODE/scrape/package.json

Install Libraries

				
					npm install cheerio
				
			
				
					npm install axios
				
			
				
					npm install axios cheerio json2csv
				
			

Fictional Book Store

				
					https://books.toscrape.com/
				
			

Google Chrome Extension

				
					Selector Gadget Extension
				
			

Write the code

Save this code as genre.js and run this script

				
					const cheerio = require("cheerio");
const axios = require("axios");

const url="https://books.toscrape.com/catalogue/category/books/mystery_3/index.html";

/**
 * Download the page at the module-level `url` and print the text of its
 * <h1> element to the console.
 *
 * Any failure while fetching or parsing is reported with console.error
 * instead of being thrown to the caller.
 *
 * @returns {Promise<void>}
 */
async function getGenre() {
  try {
    const { data: html } = await axios.get(url);
    const heading = cheerio.load(html)("h1").text();
    console.log(heading);
  } catch (err) {
    console.error(err);
  }
}

getGenre();
				
			
				
					node genre.js
				
			

Now it is time to extract every book's name, price, and availability from the page.

Open a new file books.js

				
					const cheerio = require("cheerio");
const axios = require("axios");

const mystery="https://books.toscrape.com/catalogue/category/books/mystery_3/index.html";

const book_data = []

/**
 * Scrape a single listing page and append one `{ title, price, stock }`
 * record per <article> element to the module-level `book_data` array, then
 * print the array.
 *
 * @param {string} [url=mystery] - Page to fetch. Defaults to the module-level
 *   `mystery` URL, so calling the function without arguments still works.
 * @returns {Promise<void>} Settles once the page has been processed. Fetch or
 *   parse failures are logged with console.error rather than rejected.
 */
async function getBooks(url = mystery) {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    $("article").each(function () {
      // `this` is the current <article>. Every value is declared locally so
      // nothing leaks into the global scope (which also throws in strict mode).
      const book = $(this);
      const title = book.find("h3 a").text();
      const price = book.find(".price_color").text();
      const stock = book.find(".availability").text().trim();

      book_data.push({ title, price, stock });
    });

    console.log(book_data);
  } catch (error) {
    console.error(error);
  }
}

getBooks(mystery);
				
			

Run this script

				
					node books.js
				
			

This will print the array of books to the console. The only limitation of this JavaScript code is that it scrapes only one page.

The next section will cover how pagination can be handled.

Listings are usually spread over multiple pages.

While every site may have its own way of paginating, a common approach is to have a next button on every page.

The exception would be the last page, which would not have a next page link.

				
					node books.js
				
			

Open a new file booksAllPages.js

				
					const cheerio = require("cheerio");
const axios = require("axios");

const mystery="https://books.toscrape.com/catalogue/category/books/mystery_3/index.html";
const baseUrl="https://books.toscrape.com/catalogue/category/books/mystery_3/";


const book_data = []

/**
 * Scrape a listing page and every page reachable through its "next" link,
 * appending one `{ title, price, stock }` record per <article> element to the
 * module-level `book_data` array. The collected array is printed once, after
 * the last page (the one without a `.next a` link) has been processed.
 *
 * @param {string} [url=baseUrl] - Page to start from. Defaults to the
 *   module-level `baseUrl`, so a call without arguments still works.
 * @returns {Promise<void>} Settles after the whole chain of pages has been
 *   handled. Fetch or parse failures are logged with console.error rather
 *   than rejected.
 */
async function getBooks(url = baseUrl) {
  try {
    // Fetch the page that was asked for; always requesting the first page
    // would make the "next" recursion below loop forever.
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    $("article").each(function () {
      // Locals are declared so nothing leaks into the global scope.
      const book = $(this);
      const title = book.find("h3 a").text();
      const price = book.find(".price_color").text();
      const stock = book.find(".availability").text().trim();

      book_data.push({ title, price, stock });
    });

    // Note the leading dot: the link lives inside the element with class "next".
    const nextHref = $(".next a").attr("href");
    if (nextHref) {
      // The href is relative, so resolve it against the page it came from.
      // Awaiting keeps the returned promise pending until the last page is done.
      await getBooks(new URL(nextHref, url).href);
    } else {
      console.log(book_data);
    }
  } catch (error) {
    console.error(error);
  }
}

getBooks(baseUrl);
				
			

Run this script

				
					node booksAllPages.js
				
			

Saving the data with two packages: fs (built into Node.js) and json2csv

				
					npm install json2csv
				
			

Add json2csv to the books.js code and save the result as BooksToCsv.js

				
					const fs = require("fs");
const j2cp = require("json2csv").Parser;
const axios = require("axios");
const cheerio = require("cheerio");
 
const mystery = "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html";
 
const books_data = [];
 
/**
 * Scrape the listing page at `url` and every page reachable through its
 * "next" link, appending one `{ title, price, stock }` record per <article>
 * element to the module-level `books_data` array. Once a page without a
 * `.next a` link is reached, the collected records are converted to CSV and
 * written to ./books.csv.
 *
 * @param {string} url - Absolute URL of the page to start from.
 * @returns {Promise<void>} Settles after the last page has been handled and
 *   the file has been written. Fetch, parse or write failures are logged with
 *   console.error rather than rejected.
 */
async function getBooks(url) {
  try {
    const response = await axios.get(url);
    const $ = cheerio.load(response.data);

    $("article").each(function () {
      // Locals are declared so nothing leaks into the global scope.
      const book = $(this);
      const title = book.find("h3 a").text();
      const price = book.find(".price_color").text();
      const stock = book.find(".availability").text().trim();
      books_data.push({ title, price, stock });
    });

    const nextHref = $(".next a").attr("href");
    if (nextHref) {
      // The href is relative; resolving it against the current page avoids a
      // hard-coded category base URL. Awaiting keeps the promise chain intact.
      await getBooks(new URL(nextHref, url).href);
      return;
    }

    // Last page: every record has been collected, so export them.
    const parser = new j2cp();
    const csv = parser.parse(books_data);
    fs.writeFileSync("./books.csv", csv);
  } catch (err) {
    console.error(err);
  }
}
 
getBooks(mystery);
				
			

Run this script

				
					node BooksToCsv.js
				
			

Read the file with Excel or:

				
					cat books.csv
				
			
				
					"title","price","stock"
"Sharp Objects","£47.82","In stock"
"In a Dark, Dark ...","£19.63","In stock"
"The Past Never Ends","£56.50","In stock"
"A Murder in Time","£16.64","In stock"
"The Murder of Roger ...","£44.10","In stock"
"The Last Mile (Amos ...","£54.21","In stock"
"That Darkness (Gardiner and ...","£13.92","In stock"
"Tastes Like Fear (DI ...","£10.69","In stock"
"A Time of Torment ...","£48.35","In stock"
"A Study in Scarlet ...","£16.73","In stock"
"Poisonous (Max Revere Novels ...","£26.80","In stock"
"Murder at the 42nd ...","£54.36","In stock"
"Most Wanted","£35.28","In stock"
"Hide Away (Eve Duncan ...","£11.84","In stock"
"Boar Island (Anna Pigeon ...","£59.48","In stock"
"The Widow","£27.26","In stock"
"Playing with Fire","£13.71","In stock"
"What Happened on Beale ...","£25.37","In stock"
"The Bachelor Girl's Guide ...","£52.30","In stock"
"Delivering the Truth (Quaker ...","£20.89","In stock"
"The Mysterious Affair at ...","£24.80","In stock"
"In the Woods (Dublin ...","£38.38","In stock"
"The Silkworm (Cormoran Strike ...","£23.05","In stock"
"The Exiled","£43.45","In stock"
"The Cuckoo's Calling (Cormoran ...","£19.21","In stock"
"Extreme Prey (Lucas Davenport ...","£25.40","In stock"
"Career of Evil (Cormoran ...","£24.72","In stock"
"The No. 1 Ladies' ...","£57.70","In stock"
"The Girl You Lost","£12.29","In stock"
"The Girl In The ...","£15.85","In stock"
"Blood Defense (Samantha Brinkman ...","£20.30","In stock"
"1st to Die (Women's ...","£53.98","In stock"