What is Playwright?
Playwright is a modern open-source browser automation framework developed by Microsoft that lets developers control Chromium, Firefox, and WebKit browsers programmatically. It exposes a single API for web testing, scraping, and automation tasks across all three engines, and it waits automatically for elements to be ready before acting on them, which removes most manual waits from scripts.
Key Features
Cross-Browser Support
import { chromium, firefox, webkit } from 'playwright';
// Launch one browser per engine; each launch call returns a browser handle
// exposing the same API.
// Chromium (Chrome, Edge)
const chromiumBrowser = await chromium.launch();
// Firefox
const firefoxBrowser = await firefox.launch();
// WebKit (Safari)
const webkitBrowser = await webkit.launch();
// Use the same API across all browsers
const page = await chromiumBrowser.newPage();
await page.goto('https://example.com');
// Close every browser that was launched so no browser processes are left
// running once the script finishes.
await Promise.all([
  chromiumBrowser.close(),
  firefoxBrowser.close(),
  webkitBrowser.close()
]);
Auto-Waiting
// Playwright automatically waits for elements to be ready
import { chromium } from 'playwright';
// Auto-waiting demo: launch Chromium, open a page, and perform three
// interactions back to back without any explicit wait calls in between.
const browser = await chromium.launch();
const page = await browser.newPage();
await page.goto('https://example.com');
// No need for manual waits - Playwright handles it
await page.click('button#submit'); // Waits for element
await page.fill('input[name="email"]', 'test@example.com'); // Waits for input
await page.selectOption('select#country', 'US'); // Waits for select
// Release the browser once the interactions are done.
await browser.close();
Basic Web Scraping with Playwright
Simple Page Scraping
import { chromium } from 'playwright';
/**
 * Loads `url` in headless Chromium and extracts the document title, the text
 * of every `h1`/`h2`/`h3` heading, and the text and href of every link.
 *
 * The browser is closed in a `finally` block, so a failed navigation or
 * evaluation no longer leaves a browser process running.
 *
 * @param url - Address of the page to scrape.
 * @returns An object with `title`, `headings`, and `links` fields.
 */
async function scrapePage(url: string) {
  const browser = await chromium.launch({ headless: true });
  try {
    const page = await browser.newPage();
    await page.goto(url);
    // The callback is executed in the page, so it can use `document` directly.
    return await page.evaluate(() => ({
      title: document.title,
      headings: Array.from(document.querySelectorAll('h1, h2, h3'))
        .map(h => h.textContent?.trim()),
      links: Array.from(document.querySelectorAll('a'))
        .map(a => ({
          text: a.textContent?.trim(),
          href: a.href
        }))
    }));
  } finally {
    await browser.close();
  }
}
// Example usage: scrape a single page and print the extracted data.
const result = await scrapePage('https://example.com');
console.log(result);
Dynamic Content Scraping
/**
 * Scrapes product cards from a page that loads its content dynamically.
 *
 * Waits for the first `.product-card`, scrolls to the bottom of the page to
 * trigger lazy loading, pauses, and then reads the name, price, and image URL
 * of every card. The browser is closed in a `finally` block so it is released
 * even when a wait times out.
 *
 * @returns One `{ name, price, image }` object per `.product-card` element.
 */
async function scrapeDynamicContent() {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com/products');
    // Wait for dynamic content to load
    await page.waitForSelector('.product-card');
    // Scroll to trigger lazy loading
    await page.evaluate(() => {
      window.scrollTo(0, document.body.scrollHeight);
    });
    // Fixed 2-second pause to give lazily loaded cards time to appear.
    // NOTE(review): a fixed sleep is timing-dependent; prefer waiting on a
    // concrete condition if the target page exposes one.
    await page.waitForTimeout(2000);
    // Extract all products
    return await page.$$eval('.product-card', cards =>
      cards.map(card => ({
        name: card.querySelector('.product-name')?.textContent,
        price: card.querySelector('.product-price')?.textContent,
        image: card.querySelector('img')?.src
      }))
    );
  } finally {
    await browser.close();
  }
}
Playwright with Proxies
Using Proxy Servers
import { chromium } from 'playwright';
/**
 * Launches headless Chromium behind a proxy and returns the HTML of
 * `https://httpbin.org/ip` as fetched through that proxy.
 *
 * The browser is closed in a `finally` block so a failed navigation does not
 * leave a browser process running.
 *
 * @returns The full HTML content of the loaded page.
 */
async function scrapeWithProxy(): Promise<string> {
  const browser = await chromium.launch({
    headless: true,
    proxy: {
      // NOTE(review): the URL uses the http scheme with port 443 - confirm
      // this matches what the proxy provider expects.
      server: 'http://corsproxy.io:443',
      // NOTE(review): if CORS_API_KEY is unset this is `undefined` and no
      // username is sent - confirm whether that should be an error.
      username: process.env.CORS_API_KEY,
      password: ''
    }
  });
  try {
    const page = await browser.newPage();
    // All requests go through the proxy
    await page.goto('https://httpbin.org/ip');
    return await page.content();
  } finally {
    await browser.close();
  }
}
Per-Context Proxy Configuration
/**
 * Opens two browser contexts in one Chromium instance, each configured with
 * its own proxy server, and loads the same URL in both concurrently.
 *
 * The browser (and with it both contexts) is closed in a `finally` block so
 * it is released even if either navigation fails.
 */
async function multiProxySetup(): Promise<void> {
  const browser = await chromium.launch();
  try {
    // Context 1 with proxy A
    const context1 = await browser.newContext({
      proxy: {
        server: 'http://proxy1.example.com:8080'
      }
    });
    // Context 2 with proxy B
    const context2 = await browser.newContext({
      proxy: {
        server: 'http://proxy2.example.com:8080'
      }
    });
    const page1 = await context1.newPage();
    const page2 = await context2.newPage();
    // Each page uses its own proxy
    await Promise.all([
      page1.goto('https://example.com'),
      page2.goto('https://example.com')
    ]);
  } finally {
    await browser.close();
  }
}
Handling Anti-Bot Measures
Stealth Configuration
/**
 * Launches headless Chromium with settings intended to make the session look
 * less like automation, then loads `https://example.com`.
 *
 * The context sets a desktop user agent, viewport, locale, and timezone, and
 * installs an init script that overrides `navigator.webdriver` and the
 * result of the `notifications` permission query.
 *
 * The browser is closed in a `finally` block so it is released on failure.
 */
async function stealthBrowser(): Promise<void> {
  const browser = await chromium.launch({
    headless: true,
    args: [
      '--disable-blink-features=AutomationControlled',
      '--disable-dev-shm-usage'
    ]
  });
  try {
    const context = await browser.newContext({
      // NOTE(review): this user-agent string ends after the AppleWebKit token
      // and carries no browser product token - confirm that is intended.
      userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      viewport: { width: 1920, height: 1080 },
      deviceScaleFactor: 1,
      hasTouch: false,
      javaScriptEnabled: true,
      locale: 'en-US',
      timezoneId: 'America/New_York'
    });
    // Hide automation indicators
    await context.addInitScript(() => {
      // Override navigator.webdriver
      Object.defineProperty(navigator, 'webdriver', {
        get: () => false
      });
      // Override permissions. The original method must stay bound to the
      // Permissions object: calling it detached loses `this` and throws
      // "Illegal invocation" for every non-notification query.
      const permissions = window.navigator.permissions;
      const originalQuery = permissions.query.bind(permissions);
      permissions.query = (parameters: PermissionDescriptor) => (
        parameters.name === 'notifications'
          ? Promise.resolve({ state: 'denied' } as PermissionStatus)
          : originalQuery(parameters)
      );
    });
    const page = await context.newPage();
    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
Handling CAPTCHAs
/**
 * Opens a headed Chromium window, loads `https://example.com`, and, if a
 * reCAPTCHA iframe is present, waits up to five minutes for a
 * `.captcha-success` element before reading the `.content` text.
 *
 * The browser is closed in a `finally` block, so a timeout while waiting for
 * the CAPTCHA to be solved does not leave the window open.
 *
 * @returns The text content of the `.content` element, or `null` if it has none.
 */
async function handleCaptcha(): Promise<string | null> {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto('https://example.com');
    // Check for CAPTCHA. This is a one-off check made right after navigation.
    // NOTE(review): a CAPTCHA iframe injected later would not be detected here.
    const hasCaptcha = await page.locator('iframe[src*="recaptcha"]').count() > 0;
    if (hasCaptcha) {
      console.log('CAPTCHA detected - waiting for manual solve...');
      // Wait for CAPTCHA to be solved (manual or service); 300000 ms = 5 minutes
      await page.waitForSelector('.captcha-success', { timeout: 300000 });
    }
    // Continue scraping
    return await page.textContent('.content');
  } finally {
    await browser.close();
  }
}
Network Interception
Modify Requests
/**
 * Loads `https://example.com` while intercepting every request and adding a
 * custom header and an `accept-language` override before letting it continue.
 *
 * The route handler awaits `route.continue()` so a failure surfaces in the
 * handler instead of becoming an unhandled promise rejection. The browser is
 * closed in a `finally` block so it is released if navigation fails.
 */
async function interceptRequests(): Promise<void> {
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    // Intercept and modify requests
    await page.route('**/*', async route => {
      const headers = {
        ...route.request().headers(),
        'X-Custom-Header': 'custom-value',
        // Lower-case so this replaces the existing entry from
        // request.headers(), which reports lower-cased names, instead of
        // adding a second key that differs only in case.
        'accept-language': 'en-US'
      };
      await route.continue({ headers });
    });
    await page.goto('https://example.com');
  } finally {
    await browser.close();
  }
}
Block Resources
/**
 * Loads `https://example.com` with image, stylesheet, and font requests
 * aborted, and returns the text content of the page body.
 *
 * The route handler awaits `route.abort()` / `route.continue()` so failures
 * surface in the handler rather than as unhandled rejections. The browser is
 * closed in a `finally` block so it is released if navigation fails.
 *
 * @returns The text content of `body`, or `null` if it has none.
 */
async function blockResources(): Promise<string | null> {
  const blockedTypes = ['image', 'stylesheet', 'font'];
  const browser = await chromium.launch();
  try {
    const page = await browser.newPage();
    // Block images, stylesheets, and fonts for faster scraping
    await page.route('**/*', async route => {
      if (blockedTypes.includes(route.request().resourceType())) {
        await route.abort();
      } else {
        await route.continue();
      }
    });
    await page.goto('https://example.com');
    // Page loads faster without heavy resources
    return await page.textContent('body');
  } finally {
    await browser.close();
  }
}
Browser Context Isolation
Multiple Sessions
/**
 * Opens two browser contexts in one Chromium instance, each restored from its
 * own storage-state file, and loads the dashboard in both concurrently.
 *
 * The browser (and with it both contexts) is closed in a `finally` block so
 * it is released even if a storage-state file cannot be loaded or a
 * navigation fails.
 */
async function multipleUserSessions(): Promise<void> {
  const browser = await chromium.launch();
  try {
    // User 1 session
    const context1 = await browser.newContext({
      storageState: 'user1-cookies.json'
    });
    const page1 = await context1.newPage();
    // User 2 session
    const context2 = await browser.newContext({
      storageState: 'user2-cookies.json'
    });
    const page2 = await context2.newPage();
    // Each context has isolated cookies and storage
    await Promise.all([
      page1.goto('https://example.com/dashboard'),
      page2.goto('https://example.com/dashboard')
    ]);
  } finally {
    await browser.close();
  }
}
Mobile Emulation
Device Emulation
import { devices } from 'playwright';
/**
 * Loads `https://example.com` in a context configured from the
 * `'iPhone 13 Pro'` device descriptor and returns the `.mobile-menu` text.
 *
 * The browser is closed in a `finally` block so it is released on failure.
 *
 * NOTE: the function name is misspelled ("Scrapng"); it is kept unchanged so
 * existing callers continue to work.
 *
 * @returns The text content of `.mobile-menu`, or `null` if it has none.
 */
async function mobileScrapng(): Promise<string | null> {
  const browser = await chromium.launch();
  try {
    const iPhone = devices['iPhone 13 Pro'];
    // Spread the descriptor so the context takes all of its settings.
    const context = await browser.newContext({
      ...iPhone
    });
    const page = await context.newPage();
    await page.goto('https://example.com');
    // Get mobile-specific content
    return await page.textContent('.mobile-menu');
  } finally {
    await browser.close();
  }
}
Parallel Scraping
Concurrent Pages
/**
 * Scrapes several URLs concurrently, one page per URL in a shared context.
 *
 * Each page is closed in its own `finally`, and the browser is closed in an
 * outer `finally`, so a single failed navigation rejects the call without
 * leaving pages or the browser process behind.
 *
 * @param urls - Addresses to load; all are opened at the same time.
 * @returns One `{ title, url }` object per input URL, in input order.
 */
async function parallelScraping(urls: string[]) {
  const browser = await chromium.launch();
  try {
    const context = await browser.newContext();
    return await Promise.all(
      urls.map(async url => {
        const page = await context.newPage();
        try {
          await page.goto(url);
          return await page.evaluate(() => ({
            title: document.title,
            url: window.location.href
          }));
        } finally {
          await page.close();
        }
      })
    );
  } finally {
    await browser.close();
  }
}
// Example usage: scrape three pages concurrently with parallelScraping.
const urls = [
  'https://example.com/page1',
  'https://example.com/page2',
  'https://example.com/page3'
];
// `data` holds one result per entry in `urls`.
const data = await parallelScraping(urls);
Best Practices
For Web Scraping
- Use headless mode for production
- Implement proper error handling
- Respect rate limits and add delays
- Use browser contexts for isolation
- Clean up resources with try-finally
For Testing
- Use Playwright’s built-in assertions
- Leverage auto-waiting features
- Create page object models
- Run tests in parallel
- Generate screenshots on failures
Performance Optimization
- Block unnecessary resources
- Reuse browser contexts
- Use parallel execution
- Set appropriate timeouts
- Close pages when done