Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin, urlparse | |
| from typing import List, Set, Optional, Dict | |
| import logging | |
| import re | |
| from app.config import Config | |
| import aiohttp | |
class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.

    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization
    - Content extraction with noise removal
    - Breadth-first crawling bounded by Config page/link limits
    - Skips non-HTML responses and common binary file extensions

    NOTE(review): robots.txt is NOT consulted anywhere in this class
    (the previous docstring claimed it was) — add robots handling before
    polite production use.
    """

    # Extensions treated as binary/non-HTML; matched against the URL *path suffix*.
    _SKIP_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')
    # Minimum word count for a page to be considered meaningful content.
    _MIN_WORDS = 100
    # Minimum characters for a line to survive clean_text()'s noise filter.
    _MIN_LINE_CHARS = 30

    def __init__(self):
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # Tracks crawled URLs to avoid duplicates
        self.logger = logging.getLogger(__name__)
        # Identify the bot while still looking like a normal HTTP client
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Validate if a URL should be crawled.

        Args:
            url: URL to validate
            base_domain: The target domain to stay within

        Returns:
            bool: True if URL is crawlable
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):  # Only HTTP/HTTPS
            return False
        if parsed.netloc != base_domain:  # Stay within target domain
            return False
        # Match binary extensions against the path suffix only: the previous
        # substring test ('.pdf' in url) wrongly rejected URLs such as
        # /report.pdf.html whose path does not actually end in a binary ext.
        if parsed.path.lower().endswith(self._SKIP_EXTENSIONS):
            return False
        return url not in self.visited_urls  # Avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize URL by removing fragments and query parameters.

        Trailing slashes are also trimmed from the path so that
        '/page' and '/page/' dedupe to the same key.

        Args:
            url: URL to sanitize

        Returns:
            str: Normalized URL
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Fix: the previous version collapsed ALL whitespace (including
        newlines) first, so the subsequent "drop short lines" filter only
        ever saw a single giant line and either kept or discarded the
        whole page. Whitespace is now collapsed per line, preserving line
        structure so the filter works as intended.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text content
        """
        # Remove common boilerplate phrases and social counters
        text = re.sub(
            r'(\b(privacy policy|terms of service|cookie policy)\b'
            r'|\b\d+\s*(comments|shares|likes)\b)',
            '', text, flags=re.I)
        # Collapse horizontal whitespace within each line, keep line breaks
        lines = (re.sub(r'[ \t]+', ' ', line).strip() for line in text.split('\n'))
        # Drop short lines (likely navigation fragments, not meaningful content)
        return '\n'.join(line for line in lines if len(line) > self._MIN_LINE_CHARS)

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract primary content from HTML using semantic heuristics.

        Args:
            soup: BeautifulSoup parsed HTML document

        Returns:
            str: Extracted main content
        """
        # Remove elements that typically hold page chrome, not main content
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()
        # Prioritize semantic containers that likely hold the main content
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            node = soup.select_one(selector)
            if node:
                return self.clean_text(node.get_text(separator='\n'))
        # Fallback: whole document. Guard against documents with no <body>
        # (previously raised AttributeError on soup.body being None).
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch

        Returns:
            Optional[Dict]: Structured page data, or None for errors,
            non-HTML responses, or thin-content pages.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
            return self._build_document(url, response)
        except Exception as e:
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def _build_document(self, url: str, response) -> Optional[Dict]:
        """
        Turn an already-fetched HTTP response into a structured document.

        Shared by get_page_content() and crawl_sync() so each page is
        fetched over the network exactly once.

        Args:
            url: URL the response was fetched from
            response: requests.Response with the page payload

        Returns:
            Optional[Dict]: Document dict, or None for non-HTML content
            or pages with fewer than _MIN_WORDS words of extracted text.
        """
        # Skip non-HTML content
        if 'text/html' not in response.headers.get('Content-Type', ''):
            return None
        soup = BeautifulSoup(response.text, 'lxml')
        # soup.title.string can be None (e.g. nested markup inside <title>);
        # fall back to the URL path rather than storing a None title.
        if soup.title is not None and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = urlparse(url).path
        content = self.extract_main_content(soup)
        # Skip pages with insufficient content
        if len(content.split()) < self._MIN_WORDS:
            return None
        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def extract_links(self, url: str, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all crawlable links from a page.

        Args:
            url: Base URL for relative link resolution
            soup: Parsed HTML document

        Returns:
            List[str]: Sorted list of absolute URLs to crawl, capped at
            Config.MAX_LINKS_PER_PAGE.
        """
        base_domain = urlparse(url).netloc
        links = set()
        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].split('#')[0]  # Remove fragments
            if not href or href.startswith('javascript:'):
                continue
            candidate = self.sanitize_url(urljoin(url, href))
            if self.is_valid_url(candidate, base_domain):
                links.add(candidate)
        # Sorted for deterministic ordering before the per-page cap
        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]

    async def crawl(self, url: str) -> str:
        """
        Asynchronously crawl a single URL and return its text content.

        Args:
            url: URL to crawl

        Returns:
            str: Extracted text content

        Raises:
            Exception: If crawling fails (logged before re-raising)
        """
        try:
            # Send the same headers as the sync path for consistent behavior
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url) as response:
                    html = await response.text()
            soup = BeautifulSoup(html, 'html.parser')
            # Remove script and style elements
            for noise in soup(["script", "style"]):
                noise.decompose()
            return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str, max_pages: Optional[int] = None) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Fix: each page was previously fetched over the network twice —
        once inside get_page_content() and again for link extraction,
        with inconsistent timeouts (15s vs 10s). Each page is now fetched
        exactly once and the response is reused for both purposes.

        Args:
            start_url: Initial URL to begin crawling
            max_pages: Maximum number of pages to collect; defaults to
                Config.MAX_PAGES_TO_CRAWL, resolved at call time instead
                of class-definition time.

        Returns:
            List[Dict]: Structured documents from crawled pages
        """
        if max_pages is None:
            max_pages = Config.MAX_PAGES_TO_CRAWL
        queue = deque([start_url])  # BFS frontier; deque gives O(1) popleft
        documents: List[Dict] = []  # Collected documents
        while queue and len(documents) < max_pages:
            current_url = self.sanitize_url(queue.popleft())
            if current_url in self.visited_urls:
                continue
            self.visited_urls.add(current_url)
            self.logger.info(f"Crawling: {current_url}")
            # Single fetch per page, shared by content and link extraction
            try:
                response = requests.get(current_url, headers=self.headers, timeout=15)
                response.raise_for_status()
            except Exception as e:
                self.logger.warning(f"Error processing {current_url}: {str(e)}")
                continue
            page_data = self._build_document(current_url, response)
            if not page_data:
                continue
            documents.append(page_data)
            # Queue in-domain links for further crawling (best-effort)
            try:
                soup = BeautifulSoup(response.text, 'lxml')
                queue.extend(link for link in self.extract_links(current_url, soup)
                             if link not in self.visited_urls)
            except Exception as e:
                self.logger.warning(f"Error getting links from {current_url}: {str(e)}")
        return documents