"""
A web crawler module for extracting content from documentation websites.

This module provides classes for crawling a domain and extracting the main
content from web pages, saving the results as text files.
"""

import logging
import os
import re
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.project import get_project_settings

logger = logging.getLogger(__name__)


class WebsiteSpider(CrawlSpider):
    """
    A Scrapy spider for crawling documentation websites and extracting content.

    This spider follows links within the allowed domain and extracts the main
    content from each page, saving it to a text file. It looks for content in
    <main>, <article>, or a <div class="content"> tag, falling back to the
    full <body> if none of those are found.

    Args:
        start_url (str): The URL where crawling should begin.
        output_dir (str): Directory where extracted content should be saved.
        *args: Additional positional arguments passed to CrawlSpider.
        **kwargs: Additional keyword arguments passed to CrawlSpider.
    """

    name = "website_spider"

    def __init__(self, start_url, output_dir, *args, **kwargs):
        self.start_urls = [start_url]
        self.allowed_domains = [urlparse(start_url).netloc]
        self.output_dir = output_dir

        # CrawlSpider compiles its rules during __init__, so the rules must
        # be defined *before* calling super().__init__(), or no links will
        # be followed.
        self.rules = (
            Rule(
                LinkExtractor(allow_domains=self.allowed_domains),
                callback="parse_item",
                follow=True,
            ),
        )

        super().__init__(*args, **kwargs)

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

    def parse_item(self, response):
        """
        Parse a webpage and extract its content.

        Args:
            response: The Scrapy response object containing the webpage.

        The extracted content is saved to a text file in the output directory,
        including the page URL, title, and main content.
        """
        try:
            soup = BeautifulSoup(response.body, "html.parser")

            # soup.title.string may be None (e.g. an empty <title>), so guard
            # against both a missing tag and a missing string.
            title = (
                soup.title.string.strip()
                if soup.title and soup.title.string
                else "No Title"
            )

            # Build a filesystem-safe filename from the title; note that
            # pages sharing a title will overwrite one another.
            filename = re.sub(r"[^\w\-]", "_", title) + ".txt"
            filepath = os.path.join(self.output_dir, filename)

            # Prefer semantic containers for the main content.
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
            )

            if main_content:
                text_content = main_content.get_text(separator="\n", strip=True)
            else:
                # Fall back to the whole <body> when no main container exists.
                text_content = (
                    soup.body.get_text(separator="\n", strip=True)
                    if soup.body
                    else "No content"
                )
                logger.warning(
                    f"No main content found for {response.url}, "
                    "falling back to body text"
                )

            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {response.url}\n")
                f.write(f"Title: {title}\n\n")
                f.write(text_content)

            logger.info(f"Saved content from {response.url} to {filepath}")

        except Exception as e:
            logger.error(f"Error processing {response.url}: {e}", exc_info=True)


class DomainCrawler:
    """
    High-level crawler class for extracting content from a documentation website.

    This class provides a simple interface for crawling a website and
    extracting its content. It configures and runs a Scrapy crawler with
    sensible defaults for crawling documentation sites.

    Example:
        crawler = DomainCrawler("https://docs.example.com")
        crawler.start()  # Crawls the site and saves content to ./crawled_content/

    Args:
        start_url (str): The URL where crawling should begin.
        output_dir (str, optional): Directory where extracted content should
            be saved. Defaults to "crawled_content".
    """

    def __init__(self, start_url, output_dir="crawled_content"):
        self.start_url = start_url
        self.domain = urlparse(start_url).netloc
        self.output_dir = output_dir

        os.makedirs(output_dir, exist_ok=True)
        logger.info(f"Created output directory: {output_dir}")

        # Overlay polite-crawling defaults on the project settings (if any):
        # obey robots.txt, throttle requests, and identify the bot.
        self.settings = get_project_settings()
        self.settings.update(
            {
                "BOT_NAME": "website_crawler",
                "ROBOTSTXT_OBEY": True,
                "CONCURRENT_REQUESTS": 16,
                "DOWNLOAD_DELAY": 1,
                "COOKIES_ENABLED": False,
                "USER_AGENT": "Mozilla/5.0 (compatible; SimplifyCrawler/1.0)",
            }
        )

    def start(self):
        """
        Start the crawling process.

        This method initiates the crawler and blocks until crawling is
        complete. The extracted content will be saved to the configured
        output directory.
        """
        logger.info(f"Starting crawl from {self.start_url}")

        process = CrawlerProcess(self.settings)
        process.crawl(
            WebsiteSpider, start_url=self.start_url, output_dir=self.output_dir
        )
        # CrawlerProcess runs a Twisted reactor, which blocks here and can
        # only be started once per Python process.
        process.start()

        logger.info("Crawl completed!")
        logger.info(f"Content saved to: {os.path.abspath(self.output_dir)}")
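

# Minimal usage sketch, assuming "https://docs.example.com" is a placeholder
# URL rather than a real target. Running the module directly configures
# console logging and kicks off a crawl; process.start() inside
# DomainCrawler.start() blocks until the crawl finishes.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(name)s %(levelname)s: %(message)s",
    )
    crawler = DomainCrawler("https://docs.example.com")
    crawler.start()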