What is Puppeteer?
Puppeteer is a Node.js library maintained by the Chrome DevTools team at Google that provides a high-level API for controlling Chrome or Chromium (and, in recent versions, Firefox) over the DevTools Protocol or WebDriver BiDi. It runs headless by default, and it’s widely used for web scraping, automated testing, screenshot and PDF generation, and other browser automation tasks.
Key Features
Browser Control
import puppeteer from 'puppeteer';
// Minimal end-to-end example: launch a browser, open a page, navigate,
// save a screenshot to disk, and shut the browser down again.
// Launch browser
const browser = await puppeteer.launch({
headless: true, // Run without UI
args: ['--no-sandbox'] // Disables Chrome's sandbox (reduces isolation); only use in an already-isolated environment
});
// Create new page
const page = await browser.newPage();
// Navigate
await page.goto('https://example.com');
// Take screenshot (written to example.png in the current working directory)
await page.screenshot({ path: 'example.png' });
// Close browser
await browser.close();
Page Interaction
/**
 * Demonstrates basic page interaction: fills in a form on example.com,
 * submits it, and waits for the navigation triggered by the submit.
 *
 * The browser is always closed, even when one of the steps throws.
 */
async function interactWithPage(): Promise<void> {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Type into inputs
    await page.type('input[name="email"]', 'test@example.com');

    // Select dropdown
    await page.select('select#country', 'US');

    // Click the submit button and wait for the navigation it causes.
    // The wait must be registered *before* the click: if it is started
    // afterwards, the navigation may already have finished and the wait
    // would only end by timing out.
    await Promise.all([
      page.waitForNavigation(),
      page.click('button#submit')
    ]);
  } finally {
    // Release the browser process regardless of success or failure.
    await browser.close();
  }
}
Web Scraping with Puppeteer
Basic Data Extraction
/**
 * Loads `url` and extracts basic page metadata and text.
 *
 * @param url - Address of the page to scrape.
 * @returns The document title, the content of the description meta tag
 *   (if present), and the trimmed text of every `h1`/`h2` and `p` element.
 * @throws Whatever Puppeteer throws on launch, navigation, or evaluation
 *   failure; the browser is closed before the error propagates.
 */
async function scrapeData(url: string) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    // Extract data from page (this callback runs inside the browser context)
    const data = await page.evaluate(() => {
      return {
        title: document.title,
        description: document.querySelector('meta[name="description"]')
          ?.getAttribute('content'),
        headings: Array.from(document.querySelectorAll('h1, h2'))
          .map(h => h.textContent?.trim()),
        paragraphs: Array.from(document.querySelectorAll('p'))
          .map(p => p.textContent?.trim())
      };
    });

    return data;
  } finally {
    // Always close the browser so a failed navigation does not leak a process.
    await browser.close();
  }
}
// Example usage: scrape a single page and keep the extracted fields.
const result = await scrapeData('https://example.com');
Handling Dynamic Content
/**
 * Scrapes a product listing that loads part of its content lazily.
 *
 * Waits for the list container, scrolls to the bottom once to trigger lazy
 * loading, waits (up to 10 s) for the 20th product to appear, then collects
 * name, price, image URL and link for every `.product` element.
 *
 * @returns One record per product element found on the page.
 * @throws If the list or the 20th product does not appear in time; the
 *   browser is closed before the error propagates.
 */
async function scrapeDynamicPage() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/products');

    // Wait for specific element
    await page.waitForSelector('.product-list');

    // Scroll to load more content
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });

    // Wait for lazy-loaded content
    await page.waitForSelector('.product:nth-child(20)', {
      timeout: 10000
    });

    // Extract products
    const products = await page.$$eval('.product', items => {
      return items.map(item => ({
        name: item.querySelector('.name')?.textContent,
        price: item.querySelector('.price')?.textContent,
        image: item.querySelector('img')?.src,
        url: item.querySelector('a')?.href
      }));
    });

    return products;
  } finally {
    // A selector timeout above must not leave the browser running.
    await browser.close();
  }
}
Using Puppeteer with Proxies
Proxy Configuration
/**
 * Fetches https://httpbin.org/ip through an authenticated proxy and returns
 * the resulting page HTML.
 *
 * The proxy username is read from the `CORS_API_KEY` environment variable;
 * an empty string is sent when the variable is not set.
 *
 * @returns The full HTML content of the loaded page.
 * @throws Whatever Puppeteer throws on launch or navigation failure; the
 *   browser is closed before the error propagates.
 */
async function scrapeWithProxy() {
  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--proxy-server=http://corsproxy.io:443',
      '--no-sandbox'
    ]
  });

  try {
    const page = await browser.newPage();

    // Authenticate with proxy
    await page.authenticate({
      username: process.env.CORS_API_KEY ?? '',
      password: ''
    });

    await page.goto('https://httpbin.org/ip');
    return await page.content();
  } finally {
    // Close the browser even when authentication or navigation fails.
    await browser.close();
  }
}
Rotating Proxies
/**
 * Hands out proxy URLs from a fixed list in round-robin order and can launch
 * a browser configured to use the next proxy in the rotation.
 */
class ProxyRotator {
  private readonly proxies: readonly string[];
  private currentIndex: number = 0;

  /**
   * @param proxies - Proxy server URLs to cycle through, in rotation order.
   */
  constructor(proxies: readonly string[]) {
    this.proxies = proxies;
  }

  /**
   * Returns the next proxy URL and advances the rotation, wrapping around
   * to the first entry after the last one.
   *
   * @throws Error if the rotator was created with an empty proxy list.
   */
  getNext(): string {
    const count = this.proxies.length;
    // Without this guard an empty list would yield `undefined` and the
    // modulo-by-zero below would turn the index into NaN.
    if (count === 0) {
      throw new Error('ProxyRotator has no proxies configured');
    }

    const proxy = this.proxies[this.currentIndex % count];
    if (proxy === undefined) {
      throw new Error('ProxyRotator has no proxies configured');
    }

    this.currentIndex = (this.currentIndex + 1) % count;
    return proxy;
  }

  /**
   * Launches a headless browser routed through the next proxy in the
   * rotation. The caller owns the returned browser and must close it.
   */
  async launchWithProxy() {
    const proxy = this.getNext();
    return puppeteer.launch({
      headless: true,
      args: [`--proxy-server=${proxy}`]
    });
  }
}
// Usage
// Build a rotator over three proxies; each launchWithProxy() call takes the
// next proxy in the list, so the launch below uses the first entry.
const rotator = new ProxyRotator([
'http://proxy1.example.com:8080',
'http://proxy2.example.com:8080',
'http://proxy3.example.com:8080'
]);
// The returned browser is not closed here; call browser.close() when done.
const browser = await rotator.launchWithProxy();
Stealth Mode
Bypassing Bot Detection
/**
 * Launches a browser with settings intended to make automation harder to
 * detect, then visits example.com.
 *
 * Applies a desktop-sized viewport and user agent, and installs a script
 * that runs before any page script to override `navigator.webdriver`,
 * `navigator.permissions.query`, `navigator.plugins` and
 * `navigator.languages`.
 *
 * The browser is always closed, even when a step throws.
 */
async function stealthMode(): Promise<void> {
  const browser = await puppeteer.launch({
    headless: true,
    args: [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-dev-shm-usage',
      '--disable-accelerated-2d-canvas',
      '--disable-gpu',
      // Chrome expects "width,height" here; an "x" separator is not parsed.
      '--window-size=1920,1080'
    ]
  });

  try {
    const page = await browser.newPage();

    // Set realistic viewport
    await page.setViewport({
      width: 1920,
      height: 1080,
      deviceScaleFactor: 1
    });

    // Set user agent
    await page.setUserAgent(
      'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' +
      '(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
    );

    // Everything in this callback runs in the page, before its own scripts.
    await page.evaluateOnNewDocument(() => {
      // Override webdriver property
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false
      });

      // Override permissions
      const permissions = window.navigator.permissions;
      // Bind to the Permissions object: calling the detached native method
      // would throw "Illegal invocation" for every non-notification query.
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters: PermissionDescriptor) => (
        parameters.name === 'notifications'
          ? Promise.resolve({ state: 'denied' } as PermissionStatus)
          : originalQuery(parameters)
      );

      // Override plugins
      Object.defineProperty(navigator, 'plugins', {
        get: () => [1, 2, 3, 4, 5]
      });

      // Override languages
      Object.defineProperty(navigator, 'languages', {
        get: () => ['en-US', 'en']
      });
    });

    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
Using puppeteer-extra with Plugins
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
// Add stealth plugin: register it on the puppeteer-extra instance once, at
// module load, before any of the functions below launch a browser.
puppeteer.use(StealthPlugin());
/**
 * Visits a bot-detection test page with the registered puppeteer-extra
 * plugins active and reports what the page can observe.
 *
 * @returns The values of `navigator.webdriver`, the number of entries in
 *   `navigator.plugins`, and `navigator.languages` as seen inside the page.
 * @throws Whatever Puppeteer throws on launch or navigation failure; the
 *   browser is closed before the error propagates.
 */
async function stealthScraping() {
  const browser = await puppeteer.launch({ headless: true });
  try {
    const page = await browser.newPage();

    // All stealth features automatically applied
    await page.goto('https://bot-detection-test.com');

    const detectionResult = await page.evaluate(() => {
      return {
        webdriver: navigator.webdriver,
        plugins: navigator.plugins.length,
        languages: navigator.languages
      };
    });

    return detectionResult;
  } finally {
    await browser.close();
  }
}
Request Interception
Modify Headers
/**
 * Loads example.com with request interception enabled, rewriting the
 * headers of every outgoing request before letting it continue.
 *
 * The browser is always closed, even when navigation fails.
 */
async function interceptRequests(): Promise<void> {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Enable request interception
    await page.setRequestInterception(true);

    page.on('request', request => {
      const headers = {
        ...request.headers(),
        // request.headers() reports header names in lower case, so the
        // overrides must use lower-case keys to replace the existing
        // entries instead of adding differently-cased duplicates.
        'accept-language': 'en-US,en;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'X-Custom-Header': 'custom-value'
      };

      request.continue({ headers });
    });

    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
Block Resources
/**
 * Loads example.com while aborting image, stylesheet and font requests,
 * then returns the text content of the page body.
 *
 * @returns `document.body.textContent` of the loaded page.
 * @throws Whatever Puppeteer throws on launch or navigation failure; the
 *   browser is closed before the error propagates.
 */
async function blockResources() {
  // Resource types that are not needed for text extraction.
  const blockedTypes = ['image', 'stylesheet', 'font'];

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);

    page.on('request', request => {
      const resourceType = request.resourceType();

      // Block images, stylesheets, and fonts
      if (blockedTypes.includes(resourceType)) {
        request.abort();
      } else {
        request.continue();
      }
    });

    // Page loads faster without heavy resources
    await page.goto('https://example.com');
    const text = await page.evaluate(() => document.body.textContent);

    return text;
  } finally {
    await browser.close();
  }
}
Handling Pagination
Crawling Multiple Pages
/**
 * Crawls up to `maxPages` pages of a paginated listing and collects the
 * title and description of every `.item` element.
 *
 * Crawling stops early when a page has no `.next-page` element. A one-second
 * pause is inserted between consecutive page loads.
 *
 * @param baseUrl - Absolute URL of the listing; the `page` query parameter
 *   is set on it for each request (existing query parameters are kept).
 * @param maxPages - Maximum number of pages to visit, starting at page 1.
 * @returns The items from all visited pages, in visiting order.
 * @throws Whatever Puppeteer throws on launch or navigation failure; the
 *   browser is closed before the error propagates.
 */
async function scrapePaginatedData(baseUrl: string, maxPages: number) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // Kept as any[] so the function's inferred return type is unchanged.
    const allData: any[] = [];

    for (let i = 1; i <= maxPages; i++) {
      // Build the URL with the URL API so a baseUrl that already carries a
      // query string or fragment still yields a valid address.
      const pageUrl = new URL(baseUrl);
      pageUrl.searchParams.set('page', String(i));
      await page.goto(pageUrl.toString());

      // Extract data from current page
      const pageData = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('.item')).map(item => ({
          title: item.querySelector('.title')?.textContent,
          description: item.querySelector('.desc')?.textContent
        }));
      });

      allData.push(...pageData);

      // Check if next page exists
      const hasNextPage = (await page.$('.next-page')) !== null;
      // Also stop here on the last allowed page, so no delay is spent
      // waiting for a request that will never be made.
      if (!hasNextPage || i === maxPages) break;

      // Add delay between requests. page.waitForTimeout() is no longer
      // available in current Puppeteer releases, so use a plain timer.
      await new Promise(resolve => setTimeout(resolve, 1000));
    }

    return allData;
  } finally {
    await browser.close();
  }
}
Screenshot and PDF Generation
Capture Screenshots
/**
 * Captures three screenshots of example.com: the full scrollable page
 * (`fullpage.png`), the `.header` element if it exists (`header.png`), and
 * the page at a 1920x1080 viewport (`desktop.png`).
 *
 * The browser is always closed, even when a capture fails.
 */
async function captureScreenshots(): Promise<void> {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');

    // Full page screenshot
    await page.screenshot({
      path: 'fullpage.png',
      fullPage: true
    });

    // Element screenshot (skipped when no element matches the selector)
    const element = await page.$('.header');
    await element?.screenshot({ path: 'header.png' });

    // Screenshot with custom viewport
    await page.setViewport({ width: 1920, height: 1080 });
    await page.screenshot({ path: 'desktop.png' });
  } finally {
    await browser.close();
  }
}
Generate PDFs
/**
 * Renders example.com to `page.pdf` as an A4 document with background
 * graphics and a 20px margin on every side.
 *
 * Navigation waits for the `networkidle0` condition before printing.
 * The browser is always closed, even when rendering fails.
 */
async function generatePDF(): Promise<void> {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com', {
      waitUntil: 'networkidle0'
    });

    await page.pdf({
      path: 'page.pdf',
      format: 'A4',
      printBackground: true,
      margin: {
        top: '20px',
        right: '20px',
        bottom: '20px',
        left: '20px'
      }
    });
  } finally {
    await browser.close();
  }
}
Performance Monitoring
Measure Page Load Time
/**
 * Loads `url` and reports navigation timings read from the page's
 * `window.performance.timing` object.
 *
 * @param url - Address of the page to measure.
 * @returns Millisecond durations: `loadTime` (navigation start to end of the
 *   load event), `domContentLoaded` (navigation start to end of the
 *   DOMContentLoaded event) and `responseTime` (request start to response end).
 * @throws Whatever Puppeteer throws on launch or navigation failure; the
 *   browser is closed before the error propagates.
 */
async function measurePerformance(url: string) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(url, { waitUntil: 'networkidle2' });

    const metrics = await page.evaluate(() => {
      // NOTE(review): performance.timing is marked deprecated in favour of
      // PerformanceNavigationTiming; kept so the returned values are unchanged.
      const perfData = window.performance.timing;
      return {
        loadTime: perfData.loadEventEnd - perfData.navigationStart,
        domContentLoaded: perfData.domContentLoadedEventEnd - perfData.navigationStart,
        responseTime: perfData.responseEnd - perfData.requestStart
      };
    });

    return metrics;
  } finally {
    await browser.close();
  }
}
Error Handling
Robust Scraping
/**
 * Scrapes the title and body text of `url`, retrying on failure.
 *
 * Each attempt uses a fresh browser. Any failure in an attempt (launching,
 * navigating, evaluating, or closing the browser) counts as a failed attempt.
 * Between attempts the function waits 2 s multiplied by the number of
 * attempts made so far.
 *
 * @param url - Address of the page to scrape.
 * @param retries - Total number of attempts to make (default 3).
 * @returns The page title and `document.body.textContent`.
 * @throws Error with the message of the last failure once all attempts are
 *   used up (or immediately when `retries` is less than 1).
 */
async function robustScraping(url: string, retries: number = 3) {
  let lastError: Error | null = null;

  for (let attempt = 1; attempt <= retries; attempt++) {
    try {
      // Launch inside the try so a failed launch is retried like any other
      // failure instead of escaping the retry loop.
      const browser = await puppeteer.launch();
      try {
        const page = await browser.newPage();
        await page.goto(url, {
          waitUntil: 'networkidle2',
          timeout: 30000
        });

        const data = await page.evaluate(() => ({
          title: document.title,
          content: document.body.textContent
        }));

        return data;
      } finally {
        // Single close point: runs once whether the attempt succeeded or not.
        await browser.close();
      }
    } catch (error) {
      // Thrown values are not guaranteed to be Error instances.
      lastError = error instanceof Error ? error : new Error(String(error));
    }

    if (attempt < retries) {
      // Wait before retry
      await new Promise(resolve => setTimeout(resolve, 2000 * attempt));
    }
  }

  throw new Error(`Failed after ${retries} attempts: ${lastError?.message}`);
}
Best Practices
For Web Scraping
- Always close browsers to prevent memory leaks
- Use headless mode for production
- Implement rate limiting and delays
- Handle errors and retries gracefully
- Respect robots.txt and terms of service
Performance Optimization
- Block unnecessary resources (images, stylesheets, fonts) with request interception
- Reuse browser instances when possible
- Pool pages or browser contexts to cap concurrency instead of launching a browser per task
- Set appropriate timeouts
- Monitor memory usage
Security
- Keep Chrome's sandbox enabled; pass --no-sandbox only inside an already isolated environment such as a locked-down container
- Validate and sanitize scraped data
- Use proxy authentication securely
- Keep Puppeteer updated
- Handle credentials safely
Puppeteer vs Playwright
Key Differences
/** Characteristics recorded for Puppeteer in the comparison. */
interface PuppeteerTraits {
  browsers: ['Chromium'];
  maintainer: 'Google Chrome Team';
  api: 'Chrome DevTools Protocol';
  autoWaiting: false;
  crossBrowser: false;
}

/** Characteristics recorded for Playwright in the comparison. */
interface PlaywrightTraits {
  browsers: ['Chromium', 'Firefox', 'WebKit'];
  maintainer: 'Microsoft';
  api: 'Multiple browser protocols';
  autoWaiting: true;
  crossBrowser: true;
}

/**
 * Side-by-side summary of Puppeteer and Playwright, expressed as literal
 * types: supported browsers, maintainer, underlying protocol, and whether
 * the tool auto-waits and is cross-browser.
 *
 * NOTE(review): recent Puppeteer releases also list Firefox support —
 * confirm whether the Puppeteer entry should be updated.
 */
interface Comparison {
  puppeteer: PuppeteerTraits;
  playwright: PlaywrightTraits;
}