import logging
import re
from collections import deque
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup

from app.config import Config


class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.
    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization
    - Content extraction with noise removal
    - Breadth-first crawling with configurable depth
    - Skips non-HTML content and obvious binary file extensions
      (NOTE(review): robots.txt is NOT currently consulted — add if required)
    """

    # File extensions that are skipped during link discovery (binary content).
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')

    def __init__(self):
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # Tracks crawled URLs to avoid duplicates
        self.logger = logging.getLogger(__name__)
        # Configure headers to mimic a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Validate if a URL should be crawled.

        Args:
            url: URL to validate
            base_domain: The target domain to stay within

        Returns:
            bool: True if URL is crawlable
        """
        parsed = urlparse(url)
        return (parsed.scheme in ('http', 'https')            # Only HTTP/HTTPS
                and parsed.netloc == base_domain              # Stay within target domain
                # Suffix check on the path (not substring-in-URL, which would
                # falsely reject e.g. "/file.zipped" or query strings).
                and not parsed.path.lower().endswith(self.SKIPPED_EXTENSIONS)
                and url not in self.visited_urls)             # Avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize URL by removing fragments and query parameters.

        Args:
            url: URL to sanitize

        Returns:
            str: Normalized URL (scheme://host/path with no trailing slash)
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Whitespace is collapsed *per line* so the newline structure survives;
        the previous implementation flattened the whole text to one line,
        which made the short-line filter below inoperative.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text content (lines shorter than ~30 chars dropped)
        """
        cleaned_lines = []
        for line in text.split('\n'):
            # Collapse runs of spaces/tabs within the line only.
            line = re.sub(r'[ \t]+', ' ', line).strip()
            # Remove common boilerplate phrases and social counters.
            line = re.sub(
                r'(\b(privacy policy|terms of service|cookie policy)\b'
                r'|\b\d+\s*(comments|shares|likes)\b)',
                '', line, flags=re.I)
            # Short lines are likely navigation/boilerplate, not content.
            if len(line.strip()) > 30:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract primary content from HTML using semantic heuristics.

        NOTE: mutates `soup` (decomposes script/style/nav/... elements).

        Args:
            soup: BeautifulSoup parsed HTML document

        Returns:
            str: Extracted main content
        """
        # Remove unwanted elements that typically don't contain main content
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()

        # Prioritize semantic HTML containers that likely contain main content
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            content = soup.select_one(selector)
            if content:
                return self.clean_text(content.get_text(separator='\n'))

        # Fallback: use <body> if present; the whole document otherwise
        # (soup.body is None for fragments, which previously raised).
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def _fetch_page(self, url: str) -> Optional[Tuple[requests.Response, BeautifulSoup]]:
        """
        Fetch a URL and parse it. Raises on HTTP/network errors.

        Returns:
            (response, soup) for HTML pages, or None for non-HTML content.
        """
        response = requests.get(url, headers=self.headers, timeout=15)
        response.raise_for_status()
        # Skip non-HTML content
        if 'text/html' not in response.headers.get('Content-Type', ''):
            return None
        return response, BeautifulSoup(response.text, 'lxml')

    def _build_document(self, url: str, response: requests.Response,
                        soup: BeautifulSoup) -> Optional[Dict]:
        """
        Build the structured document dict from a fetched page.

        NOTE: mutates `soup` via extract_main_content.

        Returns:
            Dict with url/title/content/last_modified, or None when the page
            has fewer than 100 words of extracted content.
        """
        # soup.title.string can be None for an empty <title>; fall back to path.
        title = (soup.title.string if soup.title and soup.title.string
                 else urlparse(url).path)
        content = self.extract_main_content(soup)

        # Skip pages with insufficient content
        if len(content.split()) < 100:  # Minimum 100 words
            return None

        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch

        Returns:
            Optional[Dict]: Structured page data or None if invalid
        """
        try:
            fetched = self._fetch_page(url)
            if fetched is None:
                return None
            response, soup = fetched
            return self._build_document(url, response, soup)
        except Exception as e:
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def extract_links(self, url: str, soup: BeautifulSoup) -> List[str]:
        """
        Extract all crawlable links from a page.

        Args:
            url: Base URL for relative link resolution
            soup: Parsed HTML document

        Returns:
            List[str]: Sorted list of absolute URLs to crawl
                       (capped at Config.MAX_LINKS_PER_PAGE)
        """
        base_domain = urlparse(url).netloc
        links = set()

        for link in soup.find_all('a', href=True):
            href = link['href'].split('#')[0]  # Remove fragments
            if not href or href.startswith('javascript:'):
                continue

            absolute_url = urljoin(url, href)
            sanitized_url = self.sanitize_url(absolute_url)

            if self.is_valid_url(sanitized_url, base_domain):
                links.add(sanitized_url)

        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]  # Apply limit

    async def crawl(self, url: str) -> str:
        """
        Asynchronously crawl a single URL and return its text content.

        Args:
            url: URL to crawl

        Returns:
            str: Extracted text content

        Raises:
            Exception: If crawling fails
        """
        try:
            async with aiohttp.ClientSession() as session:
                # Use the same browser-like headers as the sync path.
                async with session.get(url, headers=self.headers) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str,
                   max_pages: int = Config.MAX_PAGES_TO_CRAWL) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Each page is fetched exactly once; links are extracted from the same
        response used for content extraction (previously every page was
        downloaded twice).

        Args:
            start_url: Initial URL to begin crawling
            max_pages: Maximum number of pages to crawl

        Returns:
            List[Dict]: Structured documents from crawled pages
        """
        queue = deque([start_url])  # BFS frontier; deque gives O(1) popleft
        documents: List[Dict] = []  # Collected documents

        while queue and len(documents) < max_pages:
            current_url = queue.popleft()
            sanitized_url = self.sanitize_url(current_url)

            if sanitized_url in self.visited_urls:
                continue

            self.visited_urls.add(sanitized_url)
            self.logger.info(f"Crawling: {sanitized_url}")

            try:
                fetched = self._fetch_page(sanitized_url)
            except Exception as e:
                self.logger.warning(f"Error processing {sanitized_url}: {str(e)}")
                continue

            if fetched is None:
                continue
            response, soup = fetched

            # Extract links BEFORE building the document: _build_document
            # strips nav/footer/etc. from the soup, which would lose links.
            new_links = self.extract_links(sanitized_url, soup)

            page_data = self._build_document(sanitized_url, response, soup)
            if not page_data:
                continue

            documents.append(page_data)
            queue.extend(link for link in new_links
                         if link not in self.visited_urls)

        return documents