import logging
import re
from collections import deque
from typing import Dict, List, Optional, Set, Tuple
from urllib.parse import urljoin, urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup

from app.config import Config


class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.
    Handles both synchronous and asynchronous crawling operations.

    Features:
    - URL validation and sanitization
    - Content extraction with noise removal
    - Breadth-first crawling with configurable depth
    - Skips non-HTML content and obvious binary file extensions
      (NOTE(review): robots.txt is NOT currently consulted — add if required)
    """

    # File extensions that are skipped during link discovery (binary content).
    SKIPPED_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')

    def __init__(self):
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # Tracks crawled URLs to avoid duplicates
        self.logger = logging.getLogger(__name__)
        # Configure headers to mimic a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Validate if a URL should be crawled.

        Args:
            url: URL to validate
            base_domain: The target domain to stay within

        Returns:
            bool: True if URL is crawlable
        """
        parsed = urlparse(url)
        return (parsed.scheme in ('http', 'https')            # Only HTTP/HTTPS
                and parsed.netloc == base_domain              # Stay within target domain
                # Suffix check on the path (not substring-in-URL, which would
                # falsely reject e.g. "/file.zipped" or query strings).
                and not parsed.path.lower().endswith(self.SKIPPED_EXTENSIONS)
                and url not in self.visited_urls)             # Avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize URL by removing fragments and query parameters.

        Args:
            url: URL to sanitize

        Returns:
            str: Normalized URL (scheme://host/path with no trailing slash)
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Whitespace is collapsed *per line* so the newline structure survives;
        the previous implementation flattened the whole text to one line,
        which made the short-line filter below inoperative.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text content (lines shorter than ~30 chars dropped)
        """
        cleaned_lines = []
        for line in text.split('\n'):
            # Collapse runs of spaces/tabs within the line only.
            line = re.sub(r'[ \t]+', ' ', line).strip()
            # Remove common boilerplate phrases and social counters.
            line = re.sub(
                r'(\b(privacy policy|terms of service|cookie policy)\b'
                r'|\b\d+\s*(comments|shares|likes)\b)',
                '', line, flags=re.I)
            # Short lines are likely navigation/boilerplate, not content.
            if len(line.strip()) > 30:
                cleaned_lines.append(line)
        return '\n'.join(cleaned_lines)

    def extract_main_content(self, soup: BeautifulSoup) -> str:
        """
        Extract primary content from HTML using semantic heuristics.

        NOTE: mutates `soup` (decomposes script/style/nav/... elements).

        Args:
            soup: BeautifulSoup parsed HTML document

        Returns:
            str: Extracted main content
        """
        # Remove unwanted elements that typically don't contain main content
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()

        # Prioritize semantic HTML containers that likely contain main content
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            content = soup.select_one(selector)
            if content:
                return self.clean_text(content.get_text(separator='\n'))

        # Fallback: use <body> if present; the whole document otherwise
        # (soup.body is None for fragments, which previously raised).
        root = soup.body if soup.body is not None else soup
        return self.clean_text(root.get_text(separator='\n'))

    def _fetch_page(self, url: str) -> Optional[Tuple[requests.Response, BeautifulSoup]]:
        """
        Fetch a URL and parse it. Raises on HTTP/network errors.

        Returns:
            (response, soup) for HTML pages, or None for non-HTML content.
        """
        response = requests.get(url, headers=self.headers, timeout=15)
        response.raise_for_status()
        # Skip non-HTML content
        if 'text/html' not in response.headers.get('Content-Type', ''):
            return None
        return response, BeautifulSoup(response.text, 'lxml')

    def _build_document(self, url: str, response: requests.Response,
                        soup: BeautifulSoup) -> Optional[Dict]:
        """
        Build the structured document dict from a fetched page.

        NOTE: mutates `soup` via extract_main_content.

        Returns:
            Dict with url/title/content/last_modified, or None when the page
            has fewer than 100 words of extracted content.
        """
        # soup.title.string can be None for an empty <title>; fall back to path.
        title = (soup.title.string if soup.title and soup.title.string
                 else urlparse(url).path)
        content = self.extract_main_content(soup)

        # Skip pages with insufficient content
        if len(content.split()) < 100:  # Minimum 100 words
            return None

        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch

        Returns:
            Optional[Dict]: Structured page data or None if invalid
        """
        try:
            fetched = self._fetch_page(url)
            if fetched is None:
                return None
            response, soup = fetched
            return self._build_document(url, response, soup)
        except Exception as e:
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def extract_links(self, url: str, soup: BeautifulSoup) -> List[str]:
        """
        Extract all crawlable links from a page.

        Args:
            url: Base URL for relative link resolution
            soup: Parsed HTML document

        Returns:
            List[str]: Sorted list of absolute URLs to crawl
                       (capped at Config.MAX_LINKS_PER_PAGE)
        """
        base_domain = urlparse(url).netloc
        links = set()

        for link in soup.find_all('a', href=True):
            href = link['href'].split('#')[0]  # Remove fragments
            if not href or href.startswith('javascript:'):
                continue

            absolute_url = urljoin(url, href)
            sanitized_url = self.sanitize_url(absolute_url)

            if self.is_valid_url(sanitized_url, base_domain):
                links.add(sanitized_url)

        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]  # Apply limit

    async def crawl(self, url: str) -> str:
        """
        Asynchronously crawl a single URL and return its text content.

        Args:
            url: URL to crawl

        Returns:
            str: Extracted text content

        Raises:
            Exception: If crawling fails
        """
        try:
            async with aiohttp.ClientSession() as session:
                # Use the same browser-like headers as the sync path.
                async with session.get(url, headers=self.headers) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')

                    # Remove script and style elements
                    for script in soup(["script", "style"]):
                        script.decompose()

                    return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str,
                   max_pages: int = Config.MAX_PAGES_TO_CRAWL) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Each page is fetched exactly once; links are extracted from the same
        response used for content extraction (previously every page was
        downloaded twice).

        Args:
            start_url: Initial URL to begin crawling
            max_pages: Maximum number of pages to crawl

        Returns:
            List[Dict]: Structured documents from crawled pages
        """
        queue = deque([start_url])  # BFS frontier; deque gives O(1) popleft
        documents: List[Dict] = []  # Collected documents

        while queue and len(documents) < max_pages:
            current_url = queue.popleft()
            sanitized_url = self.sanitize_url(current_url)

            if sanitized_url in self.visited_urls:
                continue

            self.visited_urls.add(sanitized_url)
            self.logger.info(f"Crawling: {sanitized_url}")

            try:
                fetched = self._fetch_page(sanitized_url)
            except Exception as e:
                self.logger.warning(f"Error processing {sanitized_url}: {str(e)}")
                continue

            if fetched is None:
                continue
            response, soup = fetched

            # Extract links BEFORE building the document: _build_document
            # strips nav/footer/etc. from the soup, which would lose links.
            new_links = self.extract_links(sanitized_url, soup)

            page_data = self._build_document(sanitized_url, response, soup)
            if not page_data:
                continue

            documents.append(page_data)
            queue.extend(link for link in new_links
                         if link not in self.visited_urls)

        return documents