"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting the main
content from web pages, saving the results as text files.
"""

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
    """
    A Scrapy spider for crawling documentation websites and extracting content.

    This spider follows links within the allowed domain and extracts the main
    content from each page, saving it to a text file. It looks for content in
    <main>, <article>, or a <div class="content"> tag, falling back to the
    full <body> if none of those are found.

    Args:
        start_url (str): The URL where crawling should begin.
        output_dir (str): Directory where extracted content should be saved.
        *args: Additional positional arguments passed to CrawlSpider.
        **kwargs: Additional keyword arguments passed to CrawlSpider.
    """

    name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir

        # CrawlSpider compiles its rules during __init__, so the rules must
        # be defined *before* calling super().__init__(), or no links will
        # be followed.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )

        super().__init__(*args, **kwargs)

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

    def parse_item(self, response):
        """
        Parse a webpage and extract its content.

        Args:
            response: The Scrapy response object containing the webpage.

        The extracted content is saved to a text file in the output directory,
        including the page URL, title, and main content.
        """
        try:
            soup = BeautifulSoup(response.body, "html.parser")

            # soup.title.string may be None (e.g. an empty <title>), so guard
            # against both a missing tag and a missing string.
            title = (
                soup.title.string.strip()
                if soup.title and soup.title.string
                else "No Title"
            )

            # Build a filesystem-safe filename from the title; note that
            # pages sharing a title will overwrite one another.
            filename = re.sub(r"[^\w\-]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Prefer semantic containers for the main content.
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fall back to the whole <body> when no main container exists.
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {response.url}, "
                    "falling back to body text"
                )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {response.url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {response.url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    """
    High-level crawler class for extracting content from a documentation website.

    This class provides a simple interface for crawling a website and
    extracting its content. It configures and runs a Scrapy crawler with
    sensible defaults for crawling documentation sites.

    Example:
        crawler = DomainCrawler("https://docs.example.com")
        crawler.start()  # Crawls the site and saves content to ./crawled_content/

    Args:
        start_url (str): The URL where crawling should begin.
        output_dir (str, optional): Directory where extracted content should
            be saved. Defaults to "crawled_content".
    """

    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

        # Overlay polite-crawling defaults on the project settings (if any):
        # obey robots.txt, throttle requests, and identify the bot.
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

    def start(self):
        """
        Start the crawling process.

        This method initiates the crawler and blocks until crawling is
        complete. The extracted content will be saved to the configured
        output directory.
        """
        logger.info(f"Starting crawl from {self.start_url}")

        process = CrawlerProcess(self.settings)
        process.crawl(
            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
        )
        # CrawlerProcess runs a Twisted reactor, which blocks here and can
        # only be started once per Python process.
        process.start()

        logger.info("Crawl completed!")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")
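

# Minimal usage sketch, assuming "https://docs.example.com" is a placeholder
# URL rather than a real target. Running the module directly configures
# console logging and kicks off a crawl; process.start() inside
# DomainCrawler.start() blocks until the crawl finishes.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )
    crawler = DomainCrawler("https://docs.example.com")
    crawler.start()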