File size: 8,503 Bytes
e272f4f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import logging
import re
from collections import deque
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup

from app.config import Config


class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.
    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization (HTTP(S)-only, same-domain, binary skip)
    - Content extraction with boilerplate/noise removal
    - Breadth-first crawling with a configurable page limit

    NOTE(review): an earlier docstring claimed robots.txt support, but no
    robots.txt handling exists anywhere in this class -- do not rely on it.
    """

    # URL substrings that indicate binary resources we never fetch.
    _SKIP_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')

    # Compiled once: boilerplate phrases stripped from extracted text.
    _BOILERPLATE_RE = re.compile(
        r'(\b(privacy policy|terms of service|cookie policy)\b'
        r'|\b\d+\s*(comments|shares|likes)\b)',
        re.I,
    )

    # Lines at or below this length (after cleaning) are treated as noise.
    _MIN_LINE_CHARS = 30
    # Pages with fewer words than this are considered too thin to index.
    _MIN_PAGE_WORDS = 100

    def __init__(self) -> None:
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # URLs already crawled (dedup)
        self.logger = logging.getLogger(__name__)
        # Identifies the bot explicitly; Accept-Language narrows responses.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Decide whether a URL should be crawled.

        Args:
            url: Absolute URL to validate.
            base_domain: Network location (netloc) the crawl must stay within.

        Returns:
            bool: True if the URL is HTTP(S), on the target domain, not an
            obvious binary resource, and not already visited.
        """
        parsed = urlparse(url)
        if parsed.scheme not in ('http', 'https'):  # only HTTP/HTTPS
            return False
        if parsed.netloc != base_domain:  # stay within target domain
            return False
        lowered = url.lower()
        if any(ext in lowered for ext in self._SKIP_EXTENSIONS):
            return False  # skip binary files
        return url not in self.visited_urls  # avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize a URL by dropping the query string, fragment, and any
        trailing slash.

        NOTE(review): dropping the query string conflates URLs that differ
        only by parameters (e.g. paginated listings); intentional here to
        reduce duplicate crawling, but verify against the target site.

        Args:
            url: URL to sanitize.

        Returns:
            str: Normalized "scheme://netloc/path" form.
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean extracted text: strip boilerplate phrases, collapse runs of
        whitespace within each line, and drop short lines that are unlikely
        to be meaningful content.

        Bug fix: the previous implementation collapsed ALL whitespace
        (including newlines) into single spaces FIRST, which merged the text
        into one line and made the subsequent per-line short-line filter a
        no-op. Line structure is now preserved while cleaning.

        Args:
            text: Raw text (newline-separated, as produced by get_text).

        Returns:
            str: Cleaned text, one meaningful line per line.
        """
        cleaned_lines = []
        for raw_line in text.split('\n'):
            line = self._BOILERPLATE_RE.sub('', raw_line)
            # Collapse whitespace runs left over after boilerplate removal.
            line = re.sub(r'\s+', ' ', line).strip()
            if len(line) > self._MIN_LINE_CHARS:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract the primary content from a parsed HTML document.

        WARNING: mutates *soup* -- chrome elements (scripts, nav, footers,
        ...) are decomposed in place. Pull anything else you need from the
        soup (e.g. links) BEFORE calling this.

        Args:
            soup: BeautifulSoup-parsed HTML document (modified in place).

        Returns:
            str: Cleaned main-content text.
        """
        # Drop elements that typically hold chrome rather than content.
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()

        # Prefer semantic containers that usually wrap the main content.
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            container = soup.select_one(selector)
            if container:
                return self.clean_text(container.get_text(separator='\n'))

        # Fall back to <body>; guard against documents without one
        # (fragments, XML-ish responses) where soup.body is None.
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def _build_document(self, url: str, soup: "BeautifulSoup",
                        response: "requests.Response") -> Optional[Dict]:
        """
        Convert a fetched, parsed page into a structured document.

        Args:
            url: Canonical URL of the page.
            soup: Parsed HTML (will be mutated by content extraction).
            response: HTTP response the page came from (for headers).

        Returns:
            Optional[Dict]: Document dict, or None if the page is too thin.
        """
        # soup.title.string is None when <title> has nested markup; fall
        # back to the URL path in that case as well as when <title> is absent.
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = urlparse(url).path

        content = self.extract_main_content(soup)
        if len(content.split()) < self._MIN_PAGE_WORDS:
            return None  # insufficient content to be worth indexing

        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch.

        Returns:
            Optional[Dict]: Structured page data, or None if the page is
            unreachable, non-HTML, or has insufficient content.
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()

            # Skip non-HTML content (images, downloads, APIs, ...).
            if 'text/html' not in response.headers.get('Content-Type', ''):
                return None

            soup = BeautifulSoup(response.text, 'lxml')
            return self._build_document(url, soup, response)

        except Exception as e:
            # Best-effort: a single bad page must not abort the crawl.
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def extract_links(self, url: str, soup: "BeautifulSoup") -> List[str]:
        """
        Collect crawlable same-domain links from a page.

        Args:
            url: Base URL used both for relative-link resolution and to
                derive the domain the crawl stays within.
            soup: Parsed HTML document.

        Returns:
            List[str]: Sorted, sanitized absolute URLs, capped at
            Config.MAX_LINKS_PER_PAGE.
        """
        base_domain = urlparse(url).netloc
        links = set()

        for anchor in soup.find_all('a', href=True):
            href = anchor['href'].split('#')[0]  # drop fragment
            if not href or href.startswith('javascript:'):
                continue

            candidate = self.sanitize_url(urljoin(url, href))
            if self.is_valid_url(candidate, base_domain):
                links.add(candidate)

        # Sort for determinism, then cap to keep the frontier bounded.
        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]

    async def crawl(self, url: str) -> str:
        """
        Asynchronously fetch a single URL and return its visible text.

        Unlike the sync pipeline, this applies no content heuristics -- it
        only strips <script>/<style> and returns the page's full text.

        Args:
            url: URL to crawl.

        Returns:
            str: Extracted text content.

        Raises:
            Exception: Propagates any fetch/parse failure after logging it.
        """
        try:
            async with aiohttp.ClientSession() as session:
                # Send the same headers as the sync path for consistency.
                async with session.get(url, headers=self.headers) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    # Remove script and style elements before text extraction.
                    for element in soup(["script", "style"]):
                        element.decompose()
                    return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str, max_pages: Optional[int] = None) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Bug fixes vs. the previous version:
        - Each page is fetched ONCE (it was fetched twice: once for content,
          once again for links).
        - Links are followed even from "thin" pages (<100 words), so hub and
          index pages no longer dead-end the crawl.
        - The BFS frontier is a deque (list.pop(0) was O(n)).

        Args:
            start_url: Initial URL to begin crawling.
            max_pages: Maximum number of documents to collect. Defaults to
                Config.MAX_PAGES_TO_CRAWL, resolved at call time so importing
                this module does not require a fully-populated Config.

        Returns:
            List[Dict]: Structured documents from crawled pages.
        """
        if max_pages is None:
            max_pages = Config.MAX_PAGES_TO_CRAWL

        queue = deque([start_url])  # BFS frontier
        documents: List[Dict] = []  # collected documents

        while queue and len(documents) < max_pages:
            sanitized_url = self.sanitize_url(queue.popleft())

            if sanitized_url in self.visited_urls:
                continue

            self.visited_urls.add(sanitized_url)
            self.logger.info(f"Crawling: {sanitized_url}")

            try:
                response = requests.get(sanitized_url, headers=self.headers, timeout=15)
                response.raise_for_status()
                # Skip non-HTML content.
                if 'text/html' not in response.headers.get('Content-Type', ''):
                    continue
                soup = BeautifulSoup(response.text, 'lxml')
            except Exception as e:
                self.logger.warning(f"Error processing {sanitized_url}: {str(e)}")
                continue

            # Extract links BEFORE _build_document, because content
            # extraction decomposes nav/header/footer (and their links).
            queue.extend(link for link in self.extract_links(sanitized_url, soup)
                         if link not in self.visited_urls)

            page_data = self._build_document(sanitized_url, soup, response)
            if page_data:
                documents.append(page_data)

        return documents