Enhancing the efficiency of your Puppeteer web scrapers is crucial for faster data retrieval and processing. One effective way to achieve this is by leveraging Puppeteer’s request interception feature to block unnecessary resources, such as images, fonts, and media files, as well as third-party analytics and advertising requests, that are not essential to your scraping goals. This technique significantly reduces the amount of data your scraper needs to load, leading to quicker execution times and more efficient resource use. For developers looking to further optimize their web scraping projects, integrating a high-quality web scraping API can provide an additional layer of efficiency and flexibility.
// Resource types whose requests are aborted outright (fonts, images, media, ...).
// Each entry is compared against the value returned by `request.resourceType()`.
// NOTE(review): some entries (e.g. 'beacon', 'csp_report', 'imageset', 'object')
// may not be values Puppeteer ever reports -- confirm against the Puppeteer
// ResourceType list before relying on them.
const blockResourceType = [
  // Reporting / telemetry
  'beacon',
  'csp_report',
  // Visual and media assets
  'font',
  'image',
  'imageset',
  'media',
  // Embedded content and captions
  'object',
  'texttrack',
];
// Name fragments of advertising, analytics and social-widget services
// (google-analytics, doubleclick, ...). The request handler further down this
// file matches each fragment as a plain substring of the request URL, so a
// short entry such as 'google' or 'analytics' already covers the longer names
// that contain it.
const blockResourceName = [
  // a
  'adition',
  'adzerk',
  'analytics',
  // c
  'cdn.api.twitter',
  'clicksor',
  'clicktale',
  // d - f
  'doubleclick',
  'exelator',
  'facebook',
  'fontawesome',
  // g
  'google',
  'google-analytics',
  'googletagmanager',
  // m - z
  'mixpanel',
  'optimizely',
  'quantserve',
  'sharethrough',
  'tiqcdn',
  'zedo',
];
const puppeteer = require('puppeteer');
/**
 * Decide whether an intercepted request should be aborted.
 *
 * A request is blocked when its resource type is listed in `resourceTypes`,
 * or when its URL (with the query string removed) contains any of the
 * substrings in `resourceNames`.
 *
 * @param {{ resourceType: () => string, url: () => string }} request
 *   Intercepted request; only `resourceType()` and `url()` are used.
 * @param {string[]} [resourceTypes=blockResourceType] Resource types to block.
 * @param {string[]} [resourceNames=blockResourceName] URL substrings to block.
 * @returns {boolean} `true` when the request should be aborted.
 */
function shouldBlockRequest(
  request,
  resourceTypes = blockResourceType,
  resourceNames = blockResourceName,
) {
  // Read the URL through the public accessor rather than the private `_url`
  // field, and keep only the part before the query string for matching.
  const [requestUrl] = request.url().split('?');
  return (
    // `includes` compares array values; the `in` operator would only test
    // array indices and therefore never match a resource type name.
    resourceTypes.includes(request.resourceType()) ||
    resourceNames.some((resource) => requestUrl.includes(resource))
  );
}

/**
 * Launch a browser and open a page whose unwanted requests are aborted.
 *
 * Request interception is enabled on the new page and every outgoing request
 * is either aborted or continued according to `shouldBlockRequest`.
 *
 * @returns {Promise<{ browser: object, page: object }>} The launched browser
 *   and the page with interception installed; the caller is responsible for
 *   navigating the page and for closing the browser.
 */
async function createBlockingPage() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Enable interception feature
    await page.setRequestInterception(true);
    // Add a callback which inspects every outgoing request the browser
    // makes and decides whether to allow it
    page.on('request', (request) => {
      if (shouldBlockRequest(request)) {
        request.abort();
      } else {
        request.continue();
      }
    });
    return { browser, page };
  } catch (err) {
    // Do not leave a browser process behind when page setup fails.
    await browser.close();
    throw err;
  }
}