Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from typing import List, Set, Optional, Dict | |
| import logging | |
| import re | |
| from app.config import Config | |
| import aiohttp | |
class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.

    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization
    - Content extraction with noise removal
    - Breadth-first crawling bounded by Config page/link limits
    - Skips non-HTML responses and common binary file extensions

    NOTE(review): robots.txt is NOT consulted anywhere in this class
    (the previous docstring claimed it was) — add robots handling before
    polite production use.
    """

    # Extensions treated as binary/non-HTML; matched against the URL *path suffix*.
    _SKIP_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')
    # Minimum word count for a page to be considered meaningful content.
    _MIN_WORDS = 100
    # Minimum characters for a line to survive clean_text()'s noise filter.
    _MIN_LINE_CHARS = 30

    def __init__(self):
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # Tracks crawled URLs to avoid duplicates
        self.logger = logging.getLogger(__name__)
        # Identify the bot while still looking like a normal HTTP client
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Validate if a URL should be crawled.

        Args:
            url: URL to validate
            base_domain: The target domain to stay within

        Returns:
            bool: True if URL is crawlable
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):  # Only HTTP/HTTPS
            return False
        if parsed.netloc != base_domain:  # Stay within target domain
            return False
        # Match binary extensions against the path suffix only: the previous
        # substring test ('.pdf' in url) wrongly rejected URLs such as
        # /report.pdf.html whose path does not actually end in a binary ext.
        if parsed.path.lower().endswith(self._SKIP_EXTENSIONS):
            return False
        return url not in self.visited_urls  # Avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize URL by removing fragments and query parameters.

        Trailing slashes are also trimmed from the path so that
        '/page' and '/page/' dedupe to the same key.

        Args:
            url: URL to sanitize

        Returns:
            str: Normalized URL
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Fix: the previous version collapsed ALL whitespace (including
        newlines) first, so the subsequent "drop short lines" filter only
        ever saw a single giant line and either kept or discarded the
        whole page. Whitespace is now collapsed per line, preserving line
        structure so the filter works as intended.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text content
        """
        # Remove common boilerplate phrases and social counters
        text = re.sub(
            r'(\b(privacy policy|terms of service|cookie policy)\b'
            r'|\b\d+\s*(comments|shares|likes)\b)',
            '', text, flags=re.I)
        # Collapse horizontal whitespace within each line, keep line breaks
        lines = (re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n'))
        # Drop short lines (likely navigation fragments, not meaningful content)
        return '\n'.join(line for line in lines if len(line) > self._MIN_LINE_CHARS)

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract primary content from HTML using semantic heuristics.

        Args:
            soup: BeautifulSoup parsed HTML document

        Returns:
            str: Extracted main content
        """
        # Remove elements that typically hold page chrome, not main content
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()
        # Prioritize semantic containers that likely hold the main content
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            node = soup.select_one(selector)
            if node:
                return self.clean_text(node.get_text(separator='\n'))
        # Fallback: whole document. Guard against documents with no <body>
        # (previously raised AttributeError on soup.body being None).
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch

        Returns:
            Optional[Dict]: Structured page data, or None for errors,
            non-HTML responses, or thin-content pages.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return self._build_document(url, response)
        except Exception as e:
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def _build_document(self, url: str, response) -> Optional[Dict]:
        """
        Turn an already-fetched HTTP response into a structured document.

        Shared by get_page_content() and crawl_sync() so each page is
        fetched over the network exactly once.

        Args:
            url: URL the response was fetched from
            response: requests.Response with the page payload

        Returns:
            Optional[Dict]: Document dict, or None for non-HTML content
            or pages with fewer than _MIN_WORDS words of extracted text.
        """
        # Skip non-HTML content
        if 'text/html' not in response.headers.get('Content-Type', ''):
            return None
        soup = BeautifulSoup(response.text, 'lxml')
        # soup.title.string can be None (e.g. nested markup inside <title>);
        # fall back to the URL path rather than storing a None title.
        if soup.title is not None and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = urlparse(url).path
        content = self.extract_main_content(soup)
        # Skip pages with insufficient content
        if len(content.split()) < self._MIN_WORDS:
            return None
        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def extract_links(self, url: str, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all crawlable links from a page.

        Args:
            url: Base URL for relative link resolution
            soup: Parsed HTML document

        Returns:
            List[str]: Sorted list of absolute URLs to crawl, capped at
            Config.MAX_LINKS_PER_PAGE.
        """
        base_domain = urlparse(url).netloc
        links = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].split('#')[0]  # Remove fragments
            if not href or href.startswith('javascript:'):
                continue
            candidate = self.sanitize_url(urljoin(url, href))
            if self.is_valid_url(candidate, base_domain):
                links.add(candidate)
        # Sorted for deterministic ordering before the per-page cap
        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]

    async def crawl(self, url: str) -> str:
        """
        Asynchronously crawl a single URL and return its text content.

        Args:
            url: URL to crawl

        Returns:
            str: Extracted text content

        Raises:
            Exception: If crawling fails (logged before re-raising)
        """
        try:
            # Send the same headers as the sync path for consistent behavior
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url) as response:
                    html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            # Remove script and style elements
            for noise in soup(["script", "style"]):
                noise.decompose()
            return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str, max_pages: Optional[int] = None) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Fix: each page was previously fetched over the network twice —
        once inside get_page_content() and again for link extraction,
        with inconsistent timeouts (15s vs 10s). Each page is now fetched
        exactly once and the response is reused for both purposes.

        Args:
            start_url: Initial URL to begin crawling
            max_pages: Maximum number of pages to collect; defaults to
                Config.MAX_PAGES_TO_CRAWL, resolved at call time instead
                of class-definition time.

        Returns:
            List[Dict]: Structured documents from crawled pages
        """
        if max_pages is None:
            max_pages = Config.MAX_PAGES_TO_CRAWL
        queue = deque([start_url])  # BFS frontier; deque gives O(1) popleft
        documents: List[Dict] = []  # Collected documents
        while queue and len(documents) < max_pages:
            current_url = self.sanitize_url(queue.popleft())
            if current_url in self.visited_urls:
                continue
            self.visited_urls.add(current_url)
            self.logger.info(f"Crawling: {current_url}")
            # Single fetch per page, shared by content and link extraction
            try:
                response = requests.get(current_url, headers=self.headers, timeout=15)
                response.raise_for_status()
            except Exception as e:
                self.logger.warning(f"Error processing {current_url}: {str(e)}")
                continue
            page_data = self._build_document(current_url, response)
            if not page_data:
                continue
            documents.append(page_data)
            # Queue in-domain links for further crawling (best-effort)
            try:
                soup = BeautifulSoup(response.text, 'lxml')
                queue.extend(link for link in self.extract_links(current_url, soup)
                             if link not in self.visited_urls)
            except Exception as e:
                self.logger.warning(f"Error getting links from {current_url}: {str(e)}")
        return documents