What is a Web Crawler?
A web crawler (also known as a spider, bot, or web robot) is an automated program that systematically browses the web to discover and index content. Crawlers navigate from page to page by following hyperlinks, collecting information as they go and building a comprehensive map of websites and their content.
Web Crawler vs Web Scraper
Key Differences
/**
 * Side-by-side summary of how a web crawler differs from a web scraper.
 *
 * Every property is a string-literal type (or a tuple of them), so the
 * interface documents the comparison itself rather than describing
 * arbitrary runtime data.
 */
interface Comparison {
  /** Characteristics of a crawler: broad, link-driven traversal. */
  webCrawler: {
    scope: 'Multiple pages/entire sites';
    navigation: 'Follows links automatically';
    discovery: 'Discovers new pages';
    depth: 'Multi-level traversal';
    focus: 'Structure and relationships';
    examples: ['Site indexing', 'SEO audits', 'Link checking'];
  };

  /** Characteristics of a scraper: narrow, data-driven extraction. */
  webScraper: {
    scope: 'Specific pages/data';
    navigation: 'Targeted URLs';
    discovery: 'Extracts specific data';
    depth: 'Single page focus';
    focus: 'Data extraction';
    examples: ['Product prices', 'Contact info', 'Article content'];
  };
}
Basic Web Crawler
Simple URL Crawler
/**
 * Breadth-first, same-origin crawler built on `fetch` and a regex link
 * extractor.
 *
 * URLs are compared after resolving them with the WHATWG `URL` parser and
 * dropping the `#fragment`, so `/page` and `/page#section` count as one page.
 * A URL is enqueued at most once, which also means a URL whose fetch failed
 * is not retried when another page links to it again.
 */
class BasicWebCrawler {
  /** URLs that were fetched successfully, in crawl order. */
  private visited: Set<string> = new Set();
  /** FIFO frontier of URLs waiting to be fetched. */
  private queue: string[] = [];
  /** Every URL ever enqueued; gives O(1) de-duplication of the frontier. */
  private seen: Set<string> = new Set();
  private maxPages: number;
  /** Origin (scheme + host + port) the crawl is confined to. */
  private baseUrl: string;

  /**
   * @param startUrl Absolute URL to start from; its origin bounds the crawl.
   * @param maxPages Stop once this many pages have been fetched successfully.
   * @throws TypeError if `startUrl` is not a valid absolute URL.
   */
  constructor(startUrl: string, maxPages: number = 100) {
    const start = new URL(startUrl);
    start.hash = '';
    this.baseUrl = start.origin;
    this.maxPages = maxPages;
    // Enqueue the normalized form so "https://host" and "https://host/"
    // (what links back to the home page resolve to) are the same entry.
    this.enqueue(start.href);
  }

  /**
   * Runs the crawl until the frontier is empty or `maxPages` pages were fetched.
   * Failures on individual pages are logged and do not abort the crawl.
   *
   * @returns The successfully fetched URLs and their count.
   */
  async crawl(): Promise<{ visited: string[]; totalPages: number }> {
    let requested = false;
    while (this.queue.length > 0 && this.visited.size < this.maxPages) {
      const url = this.queue.shift();
      if (url === undefined) break;

      // Polite crawling: pause between requests, but not before the first
      // one and not after the last one.
      if (requested) await this.delay(1000);
      requested = true;

      console.log(`Crawling: ${url}`);
      try {
        await this.crawlPage(url);
        this.visited.add(url);
      } catch (error) {
        console.error(`Failed to crawl ${url}:`, error);
      }
    }
    return {
      visited: Array.from(this.visited),
      totalPages: this.visited.size
    };
  }

  /** Fetches one page and enqueues the same-origin links found in it. */
  private async crawlPage(url: string): Promise<void> {
    const response = await fetch(url);
    if (!response.ok) {
      // Error pages should not be counted as crawled or mined for links.
      throw new Error(`HTTP ${response.status} for ${url}`);
    }
    const html = await response.text();
    for (const link of this.extractLinks(html, url)) {
      this.enqueue(link);
    }
  }

  /** Adds `url` to the frontier unless it has been enqueued before. */
  private enqueue(url: string): void {
    if (this.seen.has(url)) return;
    this.seen.add(url);
    this.queue.push(url);
  }

  /**
   * Extracts `<a href>` targets from `html`, resolved against `baseUrl`.
   *
   * @returns Absolute, fragment-free URLs whose origin equals the crawl origin.
   */
  private extractLinks(html: string, baseUrl: string): string[] {
    const links: string[] = [];
    // `\s` after `<a` keeps tags such as <area>/<abbr> from matching.
    const linkRegex = /<a\s(?:[^>]*?\s)?href\s*=\s*["']([^"']+)["']/gi;
    for (const match of html.matchAll(linkRegex)) {
      const href = match[1];
      if (href === undefined) continue;
      let resolved: URL;
      try {
        // Attribute values are HTML-escaped in page source (`&amp;` for `&`).
        resolved = new URL(href.replace(/&amp;/g, '&'), baseUrl);
      } catch {
        continue; // Invalid URL, skip
      }
      // Compare parsed origins: a string prefix test would also accept
      // look-alike hosts such as "https://example.com.evil.com".
      if (resolved.origin !== this.baseUrl) continue;
      resolved.hash = '';
      links.push(resolved.href);
    }
    return links;
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}
// Usage: crawl https://example.com through a FIFO queue of discovered links,
// stopping once 50 pages have been fetched or the queue runs dry.
const crawler = new BasicWebCrawler('https://example.com', 50);
const results = await crawler.crawl();
console.log(`Crawled ${results.totalPages} pages`);
Advanced Web Crawler
Full-Featured Crawler with BeautifulSoup
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Set, List
import time
class AdvancedWebCrawler:
    """Depth-limited, same-domain crawler built on requests and BeautifulSoup.

    Pages are visited depth-first starting from ``start_url``. The crawl stops
    descending past ``max_depth`` link hops and stops entirely once
    ``max_pages`` pages have been fetched successfully.
    """

    def __init__(self, start_url: str, max_depth: int = 3, max_pages: int = 100):
        """
        Args:
            start_url: Absolute URL to start from; its netloc bounds the crawl.
            max_depth: Maximum number of link hops away from ``start_url``.
            max_pages: Maximum number of successfully fetched pages.
        """
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_pages = max_pages
        # URLs fetched successfully; this is what crawl() reports.
        self.visited: Set[str] = set()
        # URLs that failed or redirected off-domain, remembered so that they
        # are not requested again each time another page links to them.
        self._skipped: Set[str] = set()
        self.domain = urlparse(start_url).netloc
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def crawl(self):
        """Start crawling from the start URL.

        Returns:
            dict with ``visited_urls`` (list of fetched URLs) and
            ``total_pages`` (their count).
        """
        # Normalize the seed the same way discovered links are normalized so
        # the home page is not fetched twice under two spellings.
        start = self._normalize_url(self.start_url) or self.start_url
        self._crawl_recursive(start, depth=0)
        return {
            'visited_urls': list(self.visited),
            'total_pages': len(self.visited)
        }

    def _crawl_recursive(self, url: str, depth: int):
        """Fetch ``url``, process it, then recurse into its same-domain links."""
        if depth > self.max_depth or len(self.visited) >= self.max_pages:
            return
        if url in self.visited or url in self._skipped:
            return

        print(f"Crawling (depth {depth}): {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            # requests follows redirects; do not process a page that ended up
            # on another host.
            if not self._is_same_domain(response.url):
                self._skipped.add(url)
                print(f"Skipping {url}: redirected off-domain to {response.url}")
                return

            self.visited.add(url)
            soup = BeautifulSoup(response.text, 'lxml')

            # Extract and process data
            self._process_page(url, soup)

            # Resolve relative links against the final (post-redirect) URL.
            links = self._extract_links(soup, response.url)
        except Exception as e:
            # Best effort: one bad page must not abort the whole crawl.
            if url not in self.visited:
                self._skipped.add(url)
            print(f"Error crawling {url}: {e}")
            return

        # Children would exceed max_depth, so there is nothing left to do.
        if depth >= self.max_depth:
            return

        for link in links:
            if len(self.visited) >= self.max_pages:
                break
            if link in self.visited or link in self._skipped:
                continue
            # Polite delay, paid only when a request will actually be made.
            time.sleep(0.5)
            self._crawl_recursive(link, depth + 1)

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Extract the normalized, same-domain, not-yet-visited links of a page.

        Each link appears at most once, in document order.
        """
        links: List[str] = []
        found: Set[str] = set()
        for anchor in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, anchor['href'])

            # Only crawl same domain
            if not self._is_same_domain(absolute_url):
                continue

            normalized = self._normalize_url(absolute_url)
            if not normalized or normalized in found or normalized in self.visited:
                continue
            found.add(normalized)
            links.append(normalized)
        return links

    def _is_same_domain(self, url: str) -> bool:
        """Check if URL belongs to the crawl domain (host names are case-insensitive)."""
        return urlparse(url).netloc.lower() == self.domain.lower()

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL: drop the fragment and treat an empty path as '/'.

        Returns:
            The normalized URL, or '' when the scheme is not http/https.
        """
        parsed = urlparse(url)

        # Skip non-HTTP URLs (mailto:, javascript:, tel:, ...)
        if parsed.scheme not in ['http', 'https']:
            return ''

        # Rebuilding from scheme/netloc/path drops the fragment.
        normalized = f"{parsed.scheme}://{parsed.netloc}{parsed.path or '/'}"

        # Add query if present
        if parsed.query:
            normalized += f"?{parsed.query}"
        return normalized

    def _process_page(self, url: str, soup: BeautifulSoup):
        """Extract basic metadata from a page, print a summary, and return it.

        Returns:
            dict with ``url``, ``title``, ``h1`` (list of heading texts),
            ``meta_description`` (text, '' when absent) and ``word_count``.
        """
        # get_text() is used instead of .string, which is None when <title>
        # contains nested markup.
        title = soup.title.get_text(strip=True) if soup.title else ''

        # Store the description text, not the <meta> Tag object itself.
        meta = soup.find('meta', attrs={'name': 'description'})
        description = (meta.get('content') or '').strip() if meta else ''

        data = {
            'url': url,
            'title': title,
            'h1': [h.get_text(strip=True) for h in soup.find_all('h1')],
            'meta_description': description,
            'word_count': len(soup.get_text().split())
        }

        # Store or process data
        print(f" Title: {data['title']}")
        print(f" Word count: {data['word_count']}")
        return data
# Usage: crawl https://example.com with max_depth=2 and max_pages=50, then
# report how many pages ended up in the result's 'total_pages' count.
crawler = AdvancedWebCrawler('https://example.com', max_depth=2, max_pages=50)
results = crawler.crawl()
print(f"\nCrawled {results['total_pages']} pages")
Crawler with Playwright
JavaScript-Heavy Site Crawler
import { chromium } from 'playwright';
/**
 * Breadth-first, same-origin crawler that renders each page in headless
 * Chromium, so links produced by client-side JavaScript are discovered too.
 *
 * A URL is enqueued at most once; a URL whose navigation failed is therefore
 * not retried when another page links to it again. Fragments are kept
 * because hash-routed single-page apps use them to address distinct views.
 */
class PlaywrightCrawler {
  /** URLs that were rendered successfully, in crawl order. */
  private visited: Set<string> = new Set();
  /** FIFO frontier of URLs waiting to be rendered. */
  private queue: string[] = [];
  /** Every URL ever enqueued; gives O(1) de-duplication of the frontier. */
  private seen: Set<string> = new Set();
  private maxPages: number;
  /** Origin (scheme + host + port) the crawl is confined to. */
  private baseUrl: string;

  /**
   * @param startUrl Absolute URL to start from; its origin bounds the crawl.
   * @param maxPages Stop once this many pages have been rendered successfully.
   * @throws TypeError if `startUrl` is not a valid absolute URL.
   */
  constructor(startUrl: string, maxPages: number = 100) {
    const start = new URL(startUrl);
    this.baseUrl = start.origin;
    this.maxPages = maxPages;
    // Use the parsed form so the seed matches what `anchor.href` reports for
    // links back to it ("https://host" becomes "https://host/").
    this.enqueue(start.href);
  }

  /**
   * Launches a browser, crawls until the frontier is empty or `maxPages`
   * pages were rendered, and always closes the browser afterwards.
   *
   * @returns The successfully rendered URLs and their count.
   */
  async crawl(): Promise<{ visited: string[]; totalPages: number }> {
    const browser = await chromium.launch({ headless: true });
    try {
      const context = await browser.newContext();
      let requested = false;
      while (this.queue.length > 0 && this.visited.size < this.maxPages) {
        const url = this.queue.shift();
        if (url === undefined) break;

        // Pause between navigations only, not before the first or after the last.
        if (requested) await this.delay(1000);
        requested = true;

        console.log(`Crawling: ${url}`);
        try {
          await this.crawlPage(context, url);
          this.visited.add(url);
        } catch (error) {
          console.error(`Error crawling ${url}:`, error);
        }
      }
    } finally {
      // Runs even if creating the context or the loop itself throws, so no
      // Chromium process is left behind.
      await browser.close();
    }
    return {
      visited: Array.from(this.visited),
      totalPages: this.visited.size
    };
  }

  /**
   * Renders `url` in a fresh tab, logs its title and enqueues its same-origin links.
   *
   * `context` is the object returned by `browser.newContext()`. It stays
   * loosely typed because only the `chromium` value is imported where this
   * class is defined.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private async crawlPage(context: any, url: string): Promise<void> {
    const page = await context.newPage();
    try {
      await page.goto(url, { waitUntil: 'networkidle' });
      // Extract data inside the page so the browser resolves hrefs to absolute URLs.
      const data = await page.evaluate(() => {
        return {
          title: document.title,
          headings: Array.from(document.querySelectorAll('h1, h2, h3'))
            .map(h => h.textContent?.trim()),
          links: Array.from(document.querySelectorAll('a[href]'))
            .map(a => (a as HTMLAnchorElement).href)
        };
      });
      console.log(` Title: ${data.title}`);
      // Add links to queue
      for (const link of data.links) {
        if (this.isSameOrigin(link)) {
          this.enqueue(link);
        }
      }
    } finally {
      await page.close();
    }
  }

  /**
   * True when `link` parses as a URL with exactly the crawl origin.
   * A string prefix test would also accept look-alike hosts such as
   * "https://example.com.evil.com".
   */
  private isSameOrigin(link: string): boolean {
    try {
      return new URL(link).origin === this.baseUrl;
    } catch {
      return false;
    }
  }

  /** Adds `url` to the frontier unless it has been enqueued before. */
  private enqueue(url: string): void {
    if (this.seen.has(url)) return;
    this.seen.add(url);
    this.queue.push(url);
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}
// Usage: crawl https://example.com in headless Chromium, stopping once
// 50 pages have been visited or no queued links remain.
const crawler = new PlaywrightCrawler('https://example.com', 50);
const results = await crawler.crawl();
console.log(`Crawled ${results.totalPages} pages`);
Sitemap-Based Crawler
Efficient Sitemap Parsing
/**
 * Crawler that takes its URL list from an XML sitemap instead of following
 * links. Sitemap index files are expanded recursively into the page URLs of
 * their child sitemaps.
 */
class SitemapCrawler {
  /**
   * Scrapes every URL listed in the sitemap (or sitemap index) at `sitemapUrl`.
   * A page that fails to load is logged and skipped.
   *
   * @returns One `{ url, title }` record per successfully scraped page.
   * @throws If the root sitemap itself cannot be fetched.
   */
  async crawlFromSitemap(sitemapUrl: string): Promise<Array<{ url: string; title: string }>> {
    const urls = await this.parseSitemap(sitemapUrl);
    const results: Array<{ url: string; title: string }> = [];
    let requested = false;
    for (const url of urls) {
      // Pause between page requests only, not before the first or after the last.
      if (requested) await this.delay(500);
      requested = true;
      try {
        results.push(await this.scrapeUrl(url));
      } catch (error) {
        console.error(`Failed to scrape ${url}:`, error);
      }
    }
    return results;
  }

  /**
   * Returns the page URLs listed in a sitemap, expanding sitemap indexes.
   *
   * @param sitemapUrl URL of a sitemap or sitemap index document.
   * @param seen Sitemap URLs already expanded; guards against indexes that
   *             reference each other in a cycle.
   */
  private async parseSitemap(
    sitemapUrl: string,
    seen: Set<string> = new Set()
  ): Promise<string[]> {
    if (seen.has(sitemapUrl)) return [];
    seen.add(sitemapUrl);

    const response = await fetch(sitemapUrl);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} for sitemap ${sitemapUrl}`);
    }
    const xml = await response.text();

    // Extract <loc> values; [\s\S] lets a value span line breaks.
    const locs: string[] = [];
    for (const match of xml.matchAll(/<loc>([\s\S]*?)<\/loc>/g)) {
      const loc = this.decodeLoc(match[1] ?? '');
      if (loc) locs.push(loc);
    }

    // A plain sitemap lists pages directly.
    if (!xml.includes('<sitemapindex')) {
      return locs;
    }

    // A sitemap index lists child sitemaps. Results go into a separate array:
    // clearing the array that is being iterated would discard every child
    // before it is visited.
    const pageUrls: string[] = [];
    for (const childSitemapUrl of locs) {
      try {
        pageUrls.push(...(await this.parseSitemap(childSitemapUrl, seen)));
      } catch (error) {
        // One broken child sitemap should not discard the others.
        console.error(`Failed to load sitemap ${childSitemapUrl}:`, error);
      }
    }
    return pageUrls;
  }

  /**
   * Converts the raw text of a <loc> element into a URL: unwraps CDATA,
   * trims whitespace and decodes the five predefined XML entities.
   */
  private decodeLoc(raw: string): string {
    return raw
      .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
      .trim()
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      // Decoded last so "&amp;lt;" yields "&lt;" rather than "<".
      .replace(/&amp;/g, '&');
  }

  /** Fetches `url` and extracts the page title ('' when there is none). */
  private async scrapeUrl(url: string): Promise<{ url: string; title: string }> {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} for ${url}`);
    }
    const html = await response.text();
    // Extract data (simplified): tolerate attributes on <title>, upper-case
    // tags and titles that span several lines.
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const title = titleMatch?.[1]?.trim() ?? '';
    return { url, title };
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}
// Usage: scrape every URL obtained from https://example.com/sitemap.xml and
// report how many result records came back.
const crawler = new SitemapCrawler();
const results = await crawler.crawlFromSitemap('https://example.com/sitemap.xml');
console.log(`Crawled ${results.length} pages from sitemap`);
Respecting robots.txt
robots.txt Parser
/**
 * Minimal robots.txt parser and matcher.
 *
 * Supports `User-agent` groups (consecutive User-agent lines share the rules
 * that follow), `Allow` / `Disallow` with `*` and `$` wildcards, inline `#`
 * comments and `Crawl-delay`. When several rules match a URL the longest
 * pattern wins, and `Allow` wins a tie.
 */
class RobotsTxtParser {
  /** Rules per lower-cased user-agent token. */
  private rules: Map<string, Array<{ allow: boolean; pattern: string }>> = new Map();
  /** Last Crawl-delay seen, in milliseconds (0 = none specified). */
  private crawlDelay: number = 0;

  /**
   * Downloads and parses `/robots.txt` for the site at `baseUrl`.
   * A missing or unreachable file leaves the parser without rules, which
   * means everything is allowed.
   */
  async load(baseUrl: string) {
    try {
      const robotsUrl = new URL('/robots.txt', baseUrl).href;
      const response = await fetch(robotsUrl);
      if (!response.ok) {
        // Do not parse a 404/500 error page as if it were robots.txt rules.
        console.log('No robots.txt found or error loading');
        return;
      }
      const content = await response.text();
      this.parse(content);
    } catch (error) {
      console.log('No robots.txt found or error loading');
    }
  }

  /** Parses robots.txt text and adds its rules to this parser. */
  private parse(content: string) {
    // Rules that appear before any User-agent line apply to '*'.
    let agents: string[] = ['*'];
    // True while reading a run of consecutive User-agent lines.
    let inAgentRun = false;

    for (const rawLine of content.split(/\r?\n/)) {
      // '#' starts a comment, either on its own line or after a value.
      const hashAt = rawLine.indexOf('#');
      const line = (hashAt === -1 ? rawLine : rawLine.slice(0, hashAt)).trim();
      if (!line) continue;

      const sep = line.indexOf(':');
      if (sep === -1) continue;
      const key = line.slice(0, sep).trim().toLowerCase();
      const value = line.slice(sep + 1).trim();

      if (key === 'user-agent') {
        // The first User-agent after a rule line opens a new group.
        if (!inAgentRun) agents = [];
        inAgentRun = true;
        const agent = value.toLowerCase();
        agents.push(agent);
        if (!this.rules.has(agent)) {
          this.rules.set(agent, []);
        }
      } else if (key === 'allow' || key === 'disallow') {
        inAgentRun = false;
        // An empty value restricts nothing ("Disallow:" means allow all), so
        // it must not be stored as a pattern that matches every path.
        if (!value) continue;
        for (const agent of agents) {
          const list = this.rules.get(agent) ?? [];
          list.push({ allow: key === 'allow', pattern: value });
          this.rules.set(agent, list);
        }
      } else if (key === 'crawl-delay') {
        inAgentRun = false;
        const seconds = parseFloat(value);
        if (Number.isFinite(seconds) && seconds >= 0) {
          this.crawlDelay = seconds * 1000;
        }
      }
      // Other records (e.g. Sitemap) are ignored.
    }
  }

  /**
   * Tells whether `userAgent` may fetch `url` according to the loaded rules.
   *
   * @param url Absolute URL; its path and query string are matched.
   * @param userAgent Agent token (case-insensitive); falls back to the `*` group.
   * @throws TypeError if `url` is not a valid absolute URL.
   */
  isAllowed(url: string, userAgent: string = '*'): boolean {
    const target = new URL(url);
    const path = target.pathname + target.search;
    const rules = this.rules.get(userAgent.toLowerCase()) ?? this.rules.get('*') ?? [];

    let allowed = true;
    let bestLength = -1;
    for (const rule of rules) {
      if (!this.matches(rule.pattern, path)) continue;
      const length = rule.pattern.length;
      if (length > bestLength || (length === bestLength && rule.allow)) {
        bestLength = length;
        allowed = rule.allow;
      }
    }
    return allowed;
  }

  /**
   * Matches a robots.txt path pattern against `path`: prefix match, with `*`
   * standing for any character sequence and a trailing `$` anchoring the end.
   */
  private matches(pattern: string, path: string): boolean {
    const anchored = pattern.endsWith('$');
    const body = (anchored ? pattern.slice(0, -1) : pattern).replace(/\*+/g, '*');
    const source = body
      .split('*')
      .map(part => part.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('.*');
    return new RegExp(`^${source}${anchored ? '$' : ''}`).test(path);
  }

  /** Delay between requests in milliseconds; 1000 when none was specified. */
  getCrawlDelay(): number {
    return this.crawlDelay || 1000; // Default 1 second
  }
}
// Usage: load the rules for https://example.com once, then ask isAllowed()
// about a URL before requesting it.
const robotsParser = new RobotsTxtParser();
await robotsParser.load('https://example.com');
const url = 'https://example.com/products';
if (robotsParser.isAllowed(url)) {
console.log('Crawling allowed');
// Proceed with crawling
} else {
console.log('Crawling disallowed by robots.txt');
}
Distributed Crawling
Queue-Based Crawler with Redis
import Redis from 'ioredis';
/**
 * Crawler whose frontier lives in Redis so that several workers (in one
 * process or many) can share it.
 *
 * Redis keys:
 * - `crawler:queue`   list used as a FIFO queue (LPUSH to add, RPOP to take)
 * - `crawler:visited` set of URLs that were fetched successfully
 * - `crawler:seen`    set of URLs ever accepted into the queue
 */
class DistributedCrawler {
  private redis: Redis;
  private queueKey: string = 'crawler:queue';
  private visitedKey: string = 'crawler:visited';
  private seenKey: string = 'crawler:seen';
  /** Cleared by close() so worker loops end instead of spinning forever. */
  private running: boolean = true;

  /** @param redisUrl Connection URL handed to the Redis client. */
  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
  }

  /**
   * Queues `url` unless it was already visited or already queued once.
   *
   * Checking only the visited set is not enough: a URL is marked visited
   * after its fetch completes, so until then every page linking to it would
   * queue it again. SADD reports whether the member is new, which makes the
   * "first time seen" decision a single atomic step across workers.
   */
  async addUrl(url: string) {
    const isVisited = await this.redis.sismember(this.visitedKey, url);
    if (isVisited) return;
    const newlySeen = await this.redis.sadd(this.seenKey, url);
    if (newlySeen === 1) {
      await this.redis.lpush(this.queueKey, url);
    }
  }

  /** Takes the oldest queued URL, or `null` when the queue is empty. */
  async getNextUrl(): Promise<string | null> {
    const url = await this.redis.rpop(this.queueKey);
    return url;
  }

  /** Records `url` as successfully crawled. */
  async markVisited(url: string) {
    await this.redis.sadd(this.visitedKey, url);
  }

  /**
   * Worker loop: take a URL, fetch it, queue the links it contains.
   * Waits 5 s when the queue is empty and 1 s after each page. Page-level
   * failures are logged and the loop continues. The loop ends once close()
   * has been called.
   */
  async crawlWorker() {
    while (this.running) {
      const url = await this.getNextUrl();
      if (!url) {
        await this.delay(5000);
        continue;
      }
      console.log(`Worker crawling: ${url}`);
      try {
        const links = await this.crawlPage(url);
        await this.markVisited(url);
        // Add discovered links
        for (const link of links) {
          await this.addUrl(link);
        }
      } catch (error) {
        console.error(`Error crawling ${url}:`, error);
      }
      await this.delay(1000);
    }
  }

  /**
   * Fetches `url` and returns the distinct http(s) links found in it, as
   * absolute URLs without fragments.
   */
  private async crawlPage(url: string): Promise<string[]> {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} for ${url}`);
    }
    const html = await response.text();
    // Extract links (simplified)
    const links = new Set<string>();
    const linkRegex = /<a[^>]+href=["']([^"']+)["']/gi;
    for (const match of html.matchAll(linkRegex)) {
      const href = match[1];
      if (href === undefined) continue;
      let resolved: URL;
      try {
        resolved = new URL(href, url);
      } catch {
        continue; // Invalid URL
      }
      // mailto:, javascript:, tel: and similar links cannot be fetched.
      if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') continue;
      // A fragment addresses a position inside the same document.
      resolved.hash = '';
      links.add(resolved.href);
    }
    return Array.from(links);
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }

  /** Signals the worker loops to stop and closes the Redis connection. */
  async close() {
    this.running = false;
    await this.redis.quit();
  }
}
// Usage - Run multiple workers
const crawler = new DistributedCrawler('redis://localhost:6379');
// Seed queue
await crawler.addUrl('https://example.com');
// Start multiple workers sharing the same Redis-backed queue
const workers = [
  crawler.crawlWorker(),
  crawler.crawlWorker(),
  crawler.crawlWorker()
];
try {
  // Settles as soon as any worker finishes or fails (e.g. on a Redis error).
  await Promise.race(workers);
} finally {
  // Release the Redis connection; otherwise the open socket keeps the
  // process alive after the workers have stopped.
  await crawler.close();
}
Best Practices
For Responsible Crawling
- Always check and respect robots.txt
- Implement crawl delays (1-2 seconds minimum)
- Use appropriate User-Agent identification
- Limit concurrent requests
- Handle errors gracefully
For Performance
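- Use bounded concurrency (a small, fixed number of parallel requests per host) so throughput improves without overloading the target server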
- Implement URL deduplication
- Cache DNS lookups
- Reuse HTTP connections
- Store crawled data efficiently
For Data Quality
- Validate URLs before crawling
- Handle redirects properly
- Detect and skip duplicate content
- Extract and store metadata
- Monitor crawl progress