Infrastructure

Web Crawler

An automated program that systematically browses the internet to discover, index, and extract information from web pages by following links across domains and websites.

What is a Web Crawler?

A web crawler (also known as a spider, bot, or web robot) is an automated program that systematically browses the web to discover and index content. Crawlers navigate from page to page by following hyperlinks, collecting information and building a comprehensive map of websites and their content.

Web Crawler vs Web Scraper

Key Differences

/**
 * Side-by-side summary of how a web crawler differs from a web scraper.
 *
 * Every field is a string-literal (or literal-array) type, so the interface
 * documents the comparison itself rather than describing runtime data.
 */
interface Comparison {
  /** Characteristics of a crawler. */
  webCrawler: {
    scope: 'Multiple pages/entire sites';
    navigation: 'Follows links automatically';
    discovery: 'Discovers new pages';
    depth: 'Multi-level traversal';
    focus: 'Structure and relationships';
    examples: ['Site indexing', 'SEO audits', 'Link checking'];
  };
  /** Characteristics of a scraper. */
  webScraper: {
    scope: 'Specific pages/data';
    navigation: 'Targeted URLs';
    discovery: 'Extracts specific data';
    depth: 'Single page focus';
    focus: 'Data extraction';
    examples: ['Product prices', 'Contact info', 'Article content'];
  };
}

Basic Web Crawler

Simple URL Crawler

/**
 * Breadth-first crawler that downloads pages with `fetch` and follows
 * `<a href>` links that stay on the origin of the start URL.
 *
 * URLs are normalized (resolved to absolute form, fragment removed) before
 * they are queued, so `/page` and `/page#section` are fetched only once.
 */
class BasicWebCrawler {
  /** URLs that were fetched successfully. */
  private visited: Set<string> = new Set();
  /** FIFO frontier of URLs that still have to be fetched. */
  private queue: string[] = [];
  /** Every URL ever queued; gives O(1) de-duplication instead of scanning the queue. */
  private queued: Set<string> = new Set();
  private maxPages: number;
  /** Origin (scheme + host + port) the crawl is confined to. */
  private baseUrl: string;

  /**
   * @param startUrl Absolute URL to start from; its origin bounds the crawl.
   * @param maxPages Maximum number of successfully fetched pages.
   * @throws TypeError if `startUrl` is not a valid absolute URL.
   */
  constructor(startUrl: string, maxPages: number = 100) {
    const start = new URL(startUrl);
    // Normalize the seed the same way discovered links are normalized, so
    // "https://example.com" and a later link to "/" count as one page.
    start.hash = '';
    this.baseUrl = start.origin;
    this.maxPages = maxPages;
    this.enqueue(start.href);
  }

  /**
   * Runs the crawl until the frontier is empty or `maxPages` pages were
   * fetched successfully. Failed pages are logged and skipped.
   *
   * @returns The successfully crawled URLs and their count.
   */
  async crawl(): Promise<{ visited: string[]; totalPages: number }> {
    let firstRequest = true;

    while (this.visited.size < this.maxPages) {
      const url = this.queue.shift();
      if (url === undefined) break;

      // Polite crawling: pause between requests, but not before the first
      // one and not after the last one.
      if (!firstRequest) await this.delay(1000);
      firstRequest = false;

      console.log(`Crawling: ${url}`);

      try {
        await this.crawlPage(url);
        this.visited.add(url);
      } catch (error) {
        console.error(`Failed to crawl ${url}:`, error);
      }
    }

    return {
      visited: Array.from(this.visited),
      totalPages: this.visited.size
    };
  }

  /**
   * Fetches one page and queues the same-origin links found in it.
   *
   * @throws Error when the server answers with a non-2xx status.
   */
  private async crawlPage(url: string): Promise<void> {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText}`);
    }
    const html = await response.text();

    // After a redirect, relative links must be resolved against the final URL.
    const documentUrl = response.url || url;

    for (const link of this.extractLinks(html, documentUrl)) {
      this.enqueue(link);
    }
  }

  /**
   * Extracts the distinct, normalized, same-origin link targets from `html`.
   *
   * @param html    Markup to scan (regex based, not a full HTML parser).
   * @param baseUrl URL used to resolve relative hrefs.
   */
  private extractLinks(html: string, baseUrl: string): string[] {
    const links = new Set<string>();
    // `<a`, whitespace, then an `href` attribute that is not part of another
    // attribute name such as `data-href`; the value may use either quote style.
    const linkRegex = /<a\s(?:[^>]*?\s)?href\s*=\s*(["'])(.*?)\1/gi;

    for (const match of html.matchAll(linkRegex)) {
      const rawHref = match[2];
      if (!rawHref) continue;

      // Attribute values are HTML-escaped; `&amp;` is by far the most common case.
      const normalized = this.normalize(rawHref.replace(/&amp;/g, '&'), baseUrl);
      if (normalized !== null) links.add(normalized);
    }

    return [...links];
  }

  /**
   * Resolves `href` against `baseUrl` and strips the fragment.
   *
   * @returns The normalized URL, or `null` when it is invalid or points to
   *          another origin.
   */
  private normalize(href: string, baseUrl: string): string | null {
    let parsed: URL;
    try {
      parsed = new URL(href, baseUrl);
    } catch {
      return null; // Invalid URL, skip
    }

    // Compare origins instead of string prefixes: "https://example.com.evil.com"
    // starts with "https://example.com" but is a different site.
    if (parsed.origin !== this.baseUrl) return null;

    parsed.hash = '';
    return parsed.href;
  }

  /** Adds `url` to the frontier unless it has been queued before. */
  private enqueue(url: string): void {
    if (this.queued.has(url)) return;
    this.queued.add(url);
    this.queue.push(url);
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}

// Usage: crawl up to 50 pages starting at the site root
const crawler = new BasicWebCrawler('https://example.com', 50);
const results = await crawler.crawl();
const pageCount = results.totalPages;
console.log(`Crawled ${pageCount} pages`);

Advanced Web Crawler

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from typing import Set, List
import time

class AdvancedWebCrawler:
    """Depth-limited crawler for a single host, built on requests + BeautifulSoup.

    Pages are visited depth-first starting from ``start_url``. Only ``http`` and
    ``https`` links whose host matches the start URL are followed. URLs are
    normalized (fragment dropped, empty path treated as ``/``) so the same page
    is not fetched under two spellings.
    """

    def __init__(self, start_url: str, max_depth: int = 3, max_pages: int = 100):
        """
        Args:
            start_url: Absolute URL where the crawl begins.
            max_depth: Maximum link distance from ``start_url`` that is fetched.
            max_pages: Maximum number of successfully fetched pages.
        """
        self.start_url = start_url
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited: Set[str] = set()
        # URLs that already failed once; they are not requested again.
        self._failed: Set[str] = set()
        # Host names are case-insensitive, so compare them lower-cased.
        self.domain = urlparse(start_url).netloc.lower()
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def crawl(self):
        """Crawl from the start URL.

        Returns:
            A dict with ``visited_urls`` (list of fetched URLs) and
            ``total_pages`` (their count).
        """
        # Normalize the seed like every discovered link; fall back to the raw
        # value when it is not an http(s) URL so the failure is still reported.
        start = self._normalize_url(self.start_url) or self.start_url
        self._crawl_recursive(start, depth=0)

        return {
            'visited_urls': list(self.visited),
            'total_pages': len(self.visited)
        }

    def _crawl_recursive(self, url: str, depth: int):
        """Fetch ``url``, process it, and recurse into its same-host links."""
        if depth > self.max_depth or len(self.visited) >= self.max_pages:
            return

        if url in self.visited or url in self._failed:
            return

        print(f"Crawling (depth {depth}): {url}")

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()

            self.visited.add(url)

            soup = BeautifulSoup(response.text, 'lxml')

            # Extract and process data
            self._process_page(url, soup)

            # Children would exceed the depth limit; no need to look at links.
            if depth >= self.max_depth:
                return

            for link in self._extract_links(soup, url):
                if len(self.visited) >= self.max_pages:
                    break
                # Skip before sleeping: the delay is only owed to real requests.
                if link in self.visited or link in self._failed:
                    continue
                time.sleep(0.5)  # Polite delay
                self._crawl_recursive(link, depth + 1)

        except Exception as e:
            # Best effort: one broken page must not abort the whole crawl.
            self._failed.add(url)
            print(f"Error crawling {url}: {e}")

    def _extract_links(self, soup: BeautifulSoup, base_url: str) -> List[str]:
        """Return the distinct, normalized, not-yet-visited same-host links of a page."""
        links = []
        seen = set()

        for link in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, link['href'])

            # Only crawl same domain
            if not self._is_same_domain(absolute_url):
                continue

            normalized = self._normalize_url(absolute_url)
            if not normalized or normalized in self.visited or normalized in seen:
                continue

            seen.add(normalized)
            links.append(normalized)

        return links

    def _is_same_domain(self, url: str) -> bool:
        """Check whether ``url`` has the same host (case-insensitive) as the start URL."""
        return urlparse(url).netloc.lower() == self.domain

    def _normalize_url(self, url: str) -> str:
        """Normalize a URL: drop the fragment and use ``/`` for an empty path.

        Returns:
            The normalized URL, or ``''`` for non-HTTP(S) URLs.
        """
        parsed = urlparse(url)

        # Skip non-HTTP URLs
        if parsed.scheme not in ['http', 'https']:
            return ''

        # For http(s) an empty path and "/" identify the same resource.
        path = parsed.path or '/'
        normalized = f"{parsed.scheme}://{parsed.netloc}{path}"

        # Keep the query string; the fragment is intentionally left out.
        if parsed.query:
            normalized += f"?{parsed.query}"

        return normalized

    def _process_page(self, url: str, soup: BeautifulSoup):
        """Extract basic metadata from a page, print a summary and return it.

        Returns:
            A dict with ``url``, ``title``, ``h1``, ``meta_description`` and
            ``word_count``.
        """
        # Store the description text, not the <meta> tag object itself.
        description_tag = soup.find('meta', attrs={'name': 'description'})
        description = description_tag.get('content', '') if description_tag else ''

        data = {
            'url': url,
            # get_text() copes with titles whose .string would be None.
            'title': soup.title.get_text(strip=True) if soup.title else '',
            'h1': [h.get_text(strip=True) for h in soup.find_all('h1')],
            'meta_description': description,
            'word_count': len(soup.get_text().split())
        }

        # Store or process data
        print(f"  Title: {data['title']}")
        print(f"  Word count: {data['word_count']}")

        return data

# Usage: crawl example.com two levels deep, stopping after 50 pages
crawler = AdvancedWebCrawler(
    'https://example.com',
    max_depth=2,
    max_pages=50,
)
results = crawler.crawl()
page_total = results['total_pages']
print(f"\nCrawled {page_total} pages")

Crawler with Playwright

JavaScript-Heavy Site Crawler

import { chromium } from 'playwright';

/**
 * Breadth-first crawler that renders every page in headless Chromium, so
 * links that are injected by client-side JavaScript are discovered too.
 *
 * Only links on the origin of the start URL are followed. Fragments are kept
 * on purpose because hash-routed single-page apps use them as distinct routes.
 */
class PlaywrightCrawler {
  /** URLs that were rendered successfully. */
  private visited: Set<string> = new Set();
  /** FIFO frontier of URLs that still have to be rendered. */
  private queue: string[] = [];
  /** Every URL ever queued; gives O(1) de-duplication instead of scanning the queue. */
  private queued: Set<string> = new Set();
  private maxPages: number;
  /** Origin (scheme + host + port) the crawl is confined to. */
  private baseUrl: string;

  /**
   * @param startUrl Absolute URL to start from; its origin bounds the crawl.
   * @param maxPages Maximum number of successfully rendered pages.
   * @throws TypeError if `startUrl` is not a valid absolute URL.
   */
  constructor(startUrl: string, maxPages: number = 100) {
    const start = new URL(startUrl);
    this.baseUrl = start.origin;
    this.maxPages = maxPages;
    // `anchor.href` in the browser is always the serialized absolute URL
    // (e.g. with a trailing "/"), so store the seed in that same form.
    this.enqueue(start.href);
  }

  /**
   * Runs the crawl until the frontier is empty or `maxPages` pages were
   * rendered. Failed pages are logged and skipped.
   *
   * @returns The successfully crawled URLs and their count.
   */
  async crawl() {
    const browser = await chromium.launch({ headless: true });

    try {
      const context = await browser.newContext();
      let firstPage = true;

      while (this.visited.size < this.maxPages) {
        const url = this.queue.shift();
        if (url === undefined) break;

        // Pause between navigations, but not before the first or after the last.
        if (!firstPage) await this.delay(1000);
        firstPage = false;

        console.log(`Crawling: ${url}`);

        try {
          await this.crawlPage(context, url);
          this.visited.add(url);
        } catch (error) {
          console.error(`Error crawling ${url}:`, error);
        }
      }
    } finally {
      // Always release the browser process, even if the loop throws.
      await browser.close();
    }

    return {
      visited: Array.from(this.visited),
      totalPages: this.visited.size
    };
  }

  /**
   * Renders `url` in a fresh tab, logs its title and queues same-origin links.
   *
   * @param context Playwright browser context used to open the tab. It is
   *                typed loosely here; the object returned by
   *                `browser.newContext()` is what `crawl()` passes in.
   */
  private async crawlPage(context: any, url: string) {
    const page = await context.newPage();

    try {
      await page.goto(url, { waitUntil: 'networkidle' });

      // Runs inside the page: collect the title, headings and link targets.
      const data = await page.evaluate(() => {
        return {
          title: document.title,
          headings: Array.from(document.querySelectorAll('h1, h2, h3'))
            .map(h => h.textContent?.trim()),
          links: Array.from(document.querySelectorAll('a[href]'))
            .map(a => (a as HTMLAnchorElement).href)
        };
      });

      console.log(`  Title: ${data.title}`);

      const links: string[] = data.links;
      for (const link of links) {
        if (this.isSameOrigin(link)) {
          this.enqueue(link);
        }
      }
    } finally {
      await page.close();
    }
  }

  /**
   * True when `link` is a valid URL on the crawl origin. Origins are compared
   * instead of string prefixes because "https://example.com.evil.com" starts
   * with "https://example.com" but is a different site.
   */
  private isSameOrigin(link: string): boolean {
    try {
      return new URL(link).origin === this.baseUrl;
    } catch {
      return false;
    }
  }

  /** Adds `url` to the frontier unless it has been queued before. */
  private enqueue(url: string): void {
    if (this.queued.has(url)) return;
    this.queued.add(url);
    this.queue.push(url);
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}

// Usage: render and crawl up to 50 pages of a JavaScript-heavy site
const crawler = new PlaywrightCrawler('https://example.com', 50);
const results = await crawler.crawl();
const renderedPages = results.totalPages;
console.log(`Crawled ${renderedPages} pages`);

Sitemap-Based Crawler

Efficient Sitemap Parsing

/**
 * Crawler that takes its URL list from a sitemap (or sitemap index) instead
 * of discovering pages by following links.
 */
class SitemapCrawler {
  /**
   * Scrapes every URL listed in the sitemap, one request at a time.
   *
   * @param sitemapUrl URL of a sitemap or sitemap index file.
   * @param delayMs    Pause between two page requests in milliseconds.
   * @returns The URL and title of each page that could be scraped; pages that
   *          fail are logged and left out.
   * @throws Error when the top-level sitemap cannot be downloaded.
   */
  async crawlFromSitemap(sitemapUrl: string, delayMs: number = 500) {
    const urls = await this.parseSitemap(sitemapUrl);
    const results: { url: string; title: string }[] = [];

    for (const [index, url] of urls.entries()) {
      // Pause between requests only; nothing to wait for before the first one.
      if (index > 0 && delayMs > 0) await this.delay(delayMs);

      try {
        results.push(await this.scrapeUrl(url));
      } catch (error) {
        console.error(`Failed to scrape ${url}:`, error);
      }
    }

    return results;
  }

  /**
   * Downloads a sitemap and returns the page URLs it lists. A sitemap index
   * is expanded recursively; child sitemaps that fail are logged and skipped.
   *
   * @param sitemapUrl URL of the sitemap or sitemap index.
   * @param seen       Sitemaps already expanded, to survive self-referencing indexes.
   * @throws Error when this sitemap answers with a non-2xx status.
   */
  private async parseSitemap(
    sitemapUrl: string,
    seen: Set<string> = new Set()
  ): Promise<string[]> {
    if (seen.has(sitemapUrl)) return [];
    seen.add(sitemapUrl);

    const response = await fetch(sitemapUrl);
    if (!response.ok) {
      throw new Error(`Sitemap request failed with HTTP ${response.status}: ${sitemapUrl}`);
    }
    const xml = await response.text();

    // `[\s\S]` instead of `.` so pretty-printed <loc> values spanning lines match.
    const locations = Array.from(
      xml.matchAll(/<loc>([\s\S]*?)<\/loc>/g),
      match => this.decodeLoc(match[1] ?? '')
    ).filter(location => location.length > 0);

    if (!xml.includes('<sitemapindex')) {
      return locations;
    }

    // Sitemap index: every <loc> is another sitemap. Results go into a new
    // array; the list being iterated is never mutated.
    const pageUrls: string[] = [];
    for (const childSitemapUrl of locations) {
      try {
        const childUrls = await this.parseSitemap(childSitemapUrl, seen);
        for (const childUrl of childUrls) pageUrls.push(childUrl);
      } catch (error) {
        console.error(`Failed to load child sitemap ${childSitemapUrl}:`, error);
      }
    }

    return pageUrls;
  }

  /**
   * Turns the raw text of a `<loc>` element into a usable URL: trims
   * whitespace, unwraps CDATA and decodes the five predefined XML entities.
   */
  private decodeLoc(raw: string): string {
    const trimmed = raw.trim();

    // CDATA content is literal text and must not be entity-decoded.
    const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(trimmed);
    if (cdata) return (cdata[1] ?? '').trim();

    const entities: Record<string, string> = {
      amp: '&',
      lt: '<',
      gt: '>',
      quot: '"',
      apos: "'"
    };
    // Single pass, so "&amp;lt;" becomes "&lt;" and is not decoded twice.
    return trimmed.replace(
      /&(amp|lt|gt|quot|apos);/g,
      (entity: string, name: string) => entities[name] ?? entity
    );
  }

  /**
   * Downloads one page and extracts its `<title>` (simplified, regex based).
   *
   * @throws Error when the page answers with a non-2xx status.
   */
  private async scrapeUrl(url: string) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const html = await response.text();

    // Tolerate attributes, upper-case tags and multi-line titles.
    const titleMatch = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
    const title = titleMatch ? (titleMatch[1] ?? '').trim() : '';

    return { url, title };
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }
}

// Usage: scrape every page listed in the site's sitemap
const crawler = new SitemapCrawler();
const results = await crawler.crawlFromSitemap('https://example.com/sitemap.xml');
const scrapedCount = results.length;
console.log(`Crawled ${scrapedCount} pages from sitemap`);

Respecting robots.txt

robots.txt Parser

/**
 * Minimal robots.txt parser and matcher.
 *
 * Supports `User-agent` groups (including several agents sharing one group),
 * `Disallow`, `Allow`, `Crawl-delay`, inline `#` comments and the `*` / `$`
 * path wildcards. When both an Allow and a Disallow pattern match, the longer
 * pattern wins and a tie is resolved in favor of Allow.
 */
class RobotsTxtParser {
  /** Disallow patterns keyed by lower-cased user-agent token. */
  private rules: Map<string, string[]> = new Map();
  /** Allow patterns keyed by lower-cased user-agent token. */
  private allowRules: Map<string, string[]> = new Map();
  /** Crawl-delay in milliseconds keyed by lower-cased user-agent token. */
  private crawlDelays: Map<string, number> = new Map();

  /**
   * Downloads and parses `/robots.txt` of the given site, replacing any rules
   * loaded before. A missing or unreachable file leaves the parser without
   * rules, i.e. everything is allowed.
   *
   * @param baseUrl Any absolute URL on the site.
   */
  async load(baseUrl: string) {
    // Drop rules of a previously loaded site even if this download fails.
    this.reset();

    try {
      const robotsUrl = new URL('/robots.txt', baseUrl).href;
      const response = await fetch(robotsUrl);

      // An error page (e.g. an HTML 404) is not a robots.txt; do not parse it.
      if (!response.ok) {
        console.log('No robots.txt found or error loading');
        return;
      }

      this.parse(await response.text());
    } catch (error) {
      console.log('No robots.txt found or error loading');
    }
  }

  /** Parses robots.txt text into per-agent rule lists, replacing earlier rules. */
  private parse(content: string) {
    this.reset();

    // Directives that appear before any User-agent line apply to everyone.
    let currentAgents: string[] = ['*'];
    // True while reading consecutive User-agent lines that open one group.
    let inAgentList = false;

    for (const rawLine of content.split(/\r?\n/)) {
      const commentStart = rawLine.indexOf('#');
      const line = (commentStart === -1 ? rawLine : rawLine.slice(0, commentStart)).trim();
      if (!line) continue;

      const separator = line.indexOf(':');
      if (separator === -1) continue;

      const key = line.slice(0, separator).trim().toLowerCase();
      const value = line.slice(separator + 1).trim();

      if (key === 'user-agent') {
        const agent = value.toLowerCase();
        currentAgents = inAgentList ? [...currentAgents, agent] : [agent];
        inAgentList = true;
        // Register the group even if it has no rules, so this agent does not
        // fall back to the "*" group.
        if (!this.rules.has(agent)) this.rules.set(agent, []);
        continue;
      }

      if (key === 'disallow' || key === 'allow') {
        inAgentList = false;
        // An empty value means "no restriction" and must not become a pattern:
        // every path starts with the empty string.
        if (!value) continue;

        const target = key === 'disallow' ? this.rules : this.allowRules;
        for (const agent of currentAgents) {
          const patterns = target.get(agent) ?? [];
          patterns.push(value);
          target.set(agent, patterns);
          if (!this.rules.has(agent)) this.rules.set(agent, []);
        }
      } else if (key === 'crawl-delay') {
        inAgentList = false;
        const delayMs = parseFloat(value) * 1000;
        if (Number.isFinite(delayMs) && delayMs > 0) {
          for (const agent of currentAgents) this.crawlDelays.set(agent, delayMs);
        }
      }
      // Other directives (e.g. Sitemap) are not part of a group and are ignored.
    }
  }

  /**
   * Tells whether `userAgent` may fetch `url` according to the loaded rules.
   *
   * @param url       Absolute URL; its path and query string are matched.
   * @param userAgent Agent token, matched case-insensitively. Agents without
   *                  their own group use the `*` group.
   * @throws TypeError if `url` is not a valid absolute URL.
   */
  isAllowed(url: string, userAgent: string = '*'): boolean {
    const { pathname, search } = new URL(url);
    const target = pathname + search;
    const group = this.resolveGroup(userAgent);

    const disallowLength = this.longestMatch(this.rules.get(group) ?? [], target);
    if (disallowLength < 0) return true;

    const allowLength = this.longestMatch(this.allowRules.get(group) ?? [], target);
    return allowLength >= disallowLength;
  }

  /**
   * Crawl delay in milliseconds for `userAgent`, falling back to the `*`
   * group and finally to a default of one second.
   */
  getCrawlDelay(userAgent: string = '*'): number {
    const delay =
      this.crawlDelays.get(userAgent.toLowerCase()) ?? this.crawlDelays.get('*');
    return delay ?? 1000; // Default 1 second
  }

  /** Forgets all parsed rules. */
  private reset(): void {
    this.rules = new Map();
    this.allowRules = new Map();
    this.crawlDelays = new Map();
  }

  /** Name of the group that applies to `userAgent`: its own if present, else `*`. */
  private resolveGroup(userAgent: string): string {
    const agent = userAgent.toLowerCase();
    return this.rules.has(agent) ? agent : '*';
  }

  /** Length of the longest pattern matching `target`, or -1 if none matches. */
  private longestMatch(patterns: string[], target: string): number {
    let longest = -1;
    for (const pattern of patterns) {
      if (pattern.length > longest && this.matches(pattern, target)) {
        longest = pattern.length;
      }
    }
    return longest;
  }

  /**
   * Matches `target` against one robots.txt path pattern. Patterns are
   * prefixes; `*` matches any run of characters and a trailing `$` anchors
   * the end of the path. Implemented with plain string searches so a hostile
   * robots.txt cannot trigger regular-expression backtracking.
   */
  private matches(pattern: string, target: string): boolean {
    const anchored = pattern.endsWith('$');
    const body = anchored ? pattern.slice(0, -1) : pattern;
    const [head = '', ...rest] = body.split('*');

    if (!target.startsWith(head)) return false;

    const tail = rest.pop();
    if (tail === undefined) {
      // No wildcard at all: plain prefix, or exact match when anchored.
      return !anchored || target.length === head.length;
    }

    // Place each middle segment at its leftmost possible position.
    let cursor = head.length;
    for (const segment of rest) {
      const found = target.indexOf(segment, cursor);
      if (found === -1) return false;
      cursor = found + segment.length;
    }

    return anchored
      ? target.length - tail.length >= cursor && target.endsWith(tail)
      : target.indexOf(tail, cursor) !== -1;
  }
}

// Usage: consult robots.txt before fetching a page
const robotsParser = new RobotsTxtParser();
await robotsParser.load('https://example.com');

const url = 'https://example.com/products';
const mayCrawl = robotsParser.isAllowed(url);
if (!mayCrawl) {
  console.log('Crawling disallowed by robots.txt');
} else {
  console.log('Crawling allowed');
  // Proceed with crawling
}

Distributed Crawling

Queue-Based Crawler with Redis

import Redis from 'ioredis';

/**
 * Crawler whose frontier and de-duplication state live in Redis, so several
 * workers (in one process or many) can share the same crawl.
 *
 * NOTE(review): links are not restricted to the seed's origin; a worker
 * follows every http(s) link it finds. Filter in `addUrl` if the crawl
 * should stay on one site.
 */
class DistributedCrawler {
  private redis: Redis;
  /** Redis list used as the FIFO frontier (LPUSH to add, RPOP to take). */
  private queueKey: string = 'crawler:queue';
  /** Redis set of URLs that were crawled successfully. */
  private visitedKey: string = 'crawler:visited';
  /** Redis set of URLs that were ever enqueued; makes enqueueing idempotent. */
  private seenKey: string = 'crawler:seen';
  /** Set by `stop()` / `close()` to let worker loops finish. */
  private stopped = false;

  constructor(redisUrl: string) {
    this.redis = new Redis(redisUrl);
  }

  /**
   * Enqueues `url` unless it was already crawled or enqueued.
   *
   * SADD reports whether the member is new, so the "have we seen this?" test
   * and the "remember it" write are one atomic step. Checking only the
   * visited set would enqueue a URL once for every page that links to it
   * until a worker finally crawls it.
   */
  async addUrl(url: string) {
    const isVisited = await this.redis.sismember(this.visitedKey, url);
    if (isVisited) return;

    const newlySeen = await this.redis.sadd(this.seenKey, url);
    if (newlySeen === 1) {
      await this.redis.lpush(this.queueKey, url);
    }
  }

  /** Takes the oldest URL from the frontier, or `null` when it is empty. */
  async getNextUrl(): Promise<string | null> {
    return this.redis.rpop(this.queueKey);
  }

  async markVisited(url: string) {
    await this.redis.sadd(this.visitedKey, url);
  }

  /**
   * Worker loop: take a URL, crawl it, enqueue its links, repeat. Polls every
   * five seconds while the frontier is empty and runs until `stop()` or
   * `close()` is called. Pages that fail are logged and not retried.
   */
  async crawlWorker() {
    while (!this.stopped) {
      const url = await this.getNextUrl();

      if (!url) {
        await this.delay(5000);
        continue;
      }

      console.log(`Worker crawling: ${url}`);

      try {
        const links = await this.crawlPage(url);
        await this.markVisited(url);

        // Add discovered links
        for (const link of links) {
          await this.addUrl(link);
        }
      } catch (error) {
        console.error(`Error crawling ${url}:`, error);
      }

      await this.delay(1000);
    }
  }

  /** Asks all worker loops to finish after their current iteration. */
  stop(): void {
    this.stopped = true;
  }

  /**
   * Downloads one page and returns the distinct http(s) links found in it,
   * as absolute URLs without fragments (simplified, regex based).
   *
   * @throws Error when the page answers with a non-2xx status.
   */
  private async crawlPage(url: string): Promise<string[]> {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`);
    }
    const html = await response.text();

    const links = new Set<string>();
    const linkRegex = /<a[^>]+href=["']([^"']+)["']/gi;

    for (const match of html.matchAll(linkRegex)) {
      try {
        const resolved = new URL(match[1] ?? '', url);

        // mailto:, javascript:, tel: ... cannot be fetched; do not enqueue them.
        if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') continue;

        // The fragment never reaches the server, so it does not identify a page.
        resolved.hash = '';
        links.add(resolved.href);
      } catch {
        // Invalid URL
      }
    }

    return [...links];
  }

  private delay(ms: number): Promise<void> {
    return new Promise<void>(resolve => setTimeout(resolve, ms));
  }

  /** Stops the worker loops and closes the Redis connection. */
  async close() {
    // Stop first so workers do not keep polling a closed connection.
    this.stopped = true;
    await this.redis.quit();
  }
}

// Usage: several workers share one Redis-backed frontier
const crawler = new DistributedCrawler('redis://localhost:6379');

// Put the entry point into the queue
await crawler.addUrl('https://example.com');

// Launch three workers against the same queue
const workerCount = 3;
const workers = Array.from({ length: workerCount }, () => crawler.crawlWorker());

// Settles as soon as any one worker settles
await Promise.race(workers);

Best Practices

For Responsible Crawling

  1. Always check and respect robots.txt
  2. Implement crawl delays (1-2 seconds minimum)
  3. Use appropriate User-Agent identification
  4. Limit concurrent requests
  5. Handle errors gracefully

For Performance

  1. Use concurrent requests wisely
  2. Implement URL deduplication
  3. Cache DNS lookups
  4. Reuse HTTP connections
  5. Store crawled data efficiently

For Data Quality

  1. Validate URLs before crawling
  2. Handle redirects properly
  3. Detect and skip duplicate content
  4. Extract and store metadata
  5. Monitor crawl progress

Learn More

Create a free Account to fix CORS Errors in Production

Say goodbye to CORS errors and get back to building great web applications. It's free!

CORSPROXY Dashboard

Related Terms

More in Infrastructure

Related guides

Back to Glossary