Simple crawler using Puppeteer and Chrome Headless
The code below is a simple snippet showing how to use Puppeteer and headless Chrome to retrieve a list of proxies together with additional information about each one. It loops through the pages of the website that lists the proxies and then saves everything to a CSV file for later use.
Dependencies
This code depends on puppeteer and csv-writer.
Run the following commands to install them:
$ npm i --save puppeteer
$ npm i --save csv-writer
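Once both installs finish, a quick way to confirm that Node can resolve the packages is a tiny throwaway script (the file name here is just an example):

// check-deps.js — sanity check that the dependencies are installed
const puppeteer = require('puppeteer');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;
console.log('puppeteer and csv-writer loaded OK');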
The code
The selectors you can see at the top of the code can be retrieved by inspecting an element in any browser's developer tools and clicking “Copy selector”.
// Template for one table cell: INDEX is the row number, INDICE the column number
const IP_SELECTOR = '#proxylisttable > tbody > tr:nth-child(INDEX) > td:nth-child(INDICE)';
// Page that lists the free proxies
const URL = 'https://free-proxy-list.net';
// Pagination entry whose text holds the total number of pages
const NUM_PAGES = 'li.fg-button:nth-child(9) > a:nth-child(1)';
// "Next" button of the paginated table
const NEXT_PAGE = '#proxylisttable_next > a:nth-child(1)';
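For example, substituting row 3 and column 2 (the PORT column) into the template gives back a concrete selector; the row and column values here are just an illustration:

// Resolving the template for row 3, column 2 (PORT)
const example = IP_SELECTOR
    .replace('INDEX', 3)
    .replace('INDICE', 2);
// example === '#proxylisttable > tbody > tr:nth-child(3) > td:nth-child(2)'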
const puppeteer = require('puppeteer');
const createCsvWriter = require('csv-writer').createObjectCsvWriter;

// Turns the scraped pipe-delimited strings into record objects
// and writes them to proxies.csv.
function save_to_csv(list_infos) {
    const csvWriter = createCsvWriter({
        path: './proxies.csv',
        header: [
            {id: 'ip', title: 'IP'},
            {id: 'port', title: 'PORT'},
            {id: 'code', title: 'CODE'},
            {id: 'country', title: 'COUNTRY'},
            {id: 'https', title: 'HTTPS'}
        ]
    });
    const records = [];
    for (let i = 0; i < list_infos.length; i++) {
        const [ip, port, code, country, https] = list_infos[i].split('|');
        records.push({ip, port, code, country, https});
    }
    // writeRecords returns a promise, so failures have to be caught on the
    // promise itself; a plain try/catch would miss an asynchronous rejection.
    csvWriter.writeRecords(records)
        .then(() => console.log('...File created!'))
        .catch((error) => console.error(error));
}
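For reference, each entry handed to save_to_csv is expected to be a pipe-delimited string in the order IP, port, code, country, HTTPS; the address below is a made-up placeholder:

save_to_csv(['10.0.0.1|8080|US|United States|yes']);
// proxies.csv now contains:
// IP,PORT,CODE,COUNTRY,HTTPS
// 10.0.0.1,8080,US,United States,yes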
async function run() {
    // headless: true matches the headless Chrome setup described above;
    // set it to false to watch the crawl in a visible browser window.
    const browser = await puppeteer.launch({
        headless: true
    });
    const page = await browser.newPage();
    await page.goto(URL);

    // Read the total number of pages from the pagination widget.
    const num_pages = await page.evaluate((sel) => {
        const element = document.querySelector(sel);
        return element ? element.innerHTML : null;
    }, NUM_PAGES);

    const arrayInfos = [];
    for (let pages = 0; pages < parseInt(num_pages); pages++) {
        if (pages !== 0) {
            // The table is paginated client-side, so clicking "Next"
            // swaps the visible rows in place.
            await page.click(NEXT_PAGE);
        }
        // Each page of the table shows 20 rows.
        for (let i = 1; i <= 20; i++) {
            let infos = '';
            // Columns of interest: 1 = IP, 2 = PORT, 3 = CODE,
            // 4 = COUNTRY and 7 = HTTPS.
            for (const j of [1, 2, 3, 4, 7]) {
                const mapObj = {
                    INDEX: i,
                    INDICE: j
                };
                const selector = IP_SELECTOR.replace(/INDEX|INDICE/g, (matched) => mapObj[matched]);
                const result = await page.evaluate((sel) => {
                    const element = document.querySelector(sel);
                    return element ? element.innerHTML : null;
                }, selector);
                if (!result) {
                    continue;
                }
                infos += (j === 1) ? result : '|' + result;
            }
            arrayInfos.push(infos);
        }
    }
    await browser.close();
    save_to_csv(arrayInfos);
}

run();
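Save the script (for example as crawler.js; the name is up to you) and run it with Node:

$ node crawler.js

When it finishes you should find a proxies.csv file next to the script, starting with the header row IP,PORT,CODE,COUNTRY,HTTPS.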