# ConvoBot — src/crawler.py
# (captured from repository ashish-ninehertz/ConvoBot, commit e272f4f)
import logging
import re
from collections import deque
from typing import Dict, List, Optional, Set
from urllib.parse import urljoin, urlparse

import aiohttp
import requests
from bs4 import BeautifulSoup

from app.config import Config
class URLCrawler:
    """
    A web crawler that extracts and processes content from websites.
    Handles both synchronous and asynchronous crawling operations.
    Features:
    - URL validation and sanitization
    - Content extraction with noise removal
    - Breadth-first crawling with a configurable page limit
    - Skips non-HTML responses and common binary file types
    """

    # File extensions that are never worth fetching (binary / non-HTML).
    # Matched against the URL *path* suffix only, so a domain or query string
    # that merely contains e.g. ".zip" does not disqualify a page.
    SKIP_EXTENSIONS = ('.pdf', '.jpg', '.png', '.zip')

    def __init__(self):
        """Initialize the crawler with default settings."""
        self.visited_urls: Set[str] = set()  # Tracks crawled URLs to avoid duplicates
        self.logger = logging.getLogger(__name__)
        # Configure headers to mimic a real browser
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; RAGBot/1.0)',
            'Accept-Language': 'en-US,en;q=0.9'
        }

    def is_valid_url(self, url: str, base_domain: str) -> bool:
        """
        Validate if a URL should be crawled.

        Args:
            url: URL to validate
            base_domain: The target domain to stay within

        Returns:
            bool: True if URL is crawlable
        """
        parsed = urlparse(url)
        return (parsed.scheme in ('http', 'https')                  # Only HTTP/HTTPS
                and parsed.netloc == base_domain                    # Stay within target domain
                and not parsed.path.lower().endswith(self.SKIP_EXTENSIONS)  # Skip binary files
                and url not in self.visited_urls)                   # Avoid duplicates

    def sanitize_url(self, url: str) -> str:
        """
        Normalize URL by removing fragments and query parameters.

        Args:
            url: URL to sanitize

        Returns:
            str: Normalized URL (scheme://host/path, no trailing slash)
        """
        parsed = urlparse(url)
        return f"{parsed.scheme}://{parsed.netloc}{parsed.path.rstrip('/')}"

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text content.

        Collapses runs of horizontal whitespace while PRESERVING newlines;
        the previous version collapsed '\\n' too, which turned the whole text
        into one line and made the per-line length filter below a no-op.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned text content, one surviving line per source line
        """
        # Collapse spaces/tabs/CRs but keep '\n' so lines stay distinct
        text = re.sub(r'[ \t\r\f\v]+', ' ', text)
        # Remove common boilerplate
        text = re.sub(r'(\b(privacy policy|terms of service|cookie policy)\b|\b\d+\s*(comments|shares|likes)\b)', '', text, flags=re.I)
        # Drop short lines (likely navigation, captions, or other chrome)
        return '\n'.join(line for line in text.split('\n')
                         if len(line.strip()) > 30)

    def extract_main_content(self, soup: "BeautifulSoup") -> str:
        """
        Extract primary content from HTML using semantic heuristics.

        NOTE: this MUTATES *soup* (decomposes script/nav/footer/etc.), so
        extract links from the soup before calling this method.

        Args:
            soup: BeautifulSoup parsed HTML document

        Returns:
            str: Extracted main content ('' when the document has no body)
        """
        # Remove elements that typically don't contain main content
        for element in soup(['script', 'style', 'nav', 'footer',
                             'header', 'iframe', 'aside', 'form']):
            element.decompose()
        # Prioritize semantic containers that likely hold the main content
        for selector in ['article', 'main', 'section[role="main"]', '.content']:
            content = soup.select_one(selector)
            if content:
                return self.clean_text(content.get_text(separator='\n'))
        # Fallback to <body>; guard against documents with no body at all
        if soup.body is None:
            return ''
        return self.clean_text(soup.body.get_text(separator='\n'))

    def _fetch_page(self, url: str, timeout: int = 15):
        """
        Fetch *url* and parse it once.

        Args:
            url: URL to fetch
            timeout: Request timeout in seconds

        Returns:
            Optional tuple (soup, response), or None for errors and
            non-HTML responses (both are logged as warnings).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=timeout)
            response.raise_for_status()
            # Skip non-HTML content
            if 'text/html' not in response.headers.get('Content-Type', ''):
                return None
            return BeautifulSoup(response.text, 'lxml'), response
        except Exception as e:
            self.logger.warning(f"Error processing {url}: {str(e)}")
            return None

    def _build_document(self, url: str, soup: "BeautifulSoup", response) -> Optional[Dict]:
        """
        Build a structured document from a parsed page.

        NOTE: calls extract_main_content, which mutates *soup*.

        Returns:
            Optional[Dict]: Structured page data, or None for thin pages.
        """
        # Fall back to the URL path when <title> is missing or empty
        title = soup.title.string if soup.title and soup.title.string else urlparse(url).path
        content = self.extract_main_content(soup)
        # Skip pages with insufficient content
        if len(content.split()) < 100:  # Minimum 100 words
            return None
        return {
            'url': url,
            'title': title,
            'content': content,
            'last_modified': response.headers.get('Last-Modified', '')
        }

    def get_page_content(self, url: str) -> Optional[Dict]:
        """
        Fetch and process a single web page.

        Args:
            url: URL to fetch

        Returns:
            Optional[Dict]: Structured page data or None if invalid
        """
        fetched = self._fetch_page(url)
        if fetched is None:
            return None
        soup, response = fetched
        return self._build_document(url, soup, response)

    def extract_links(self, url: str, soup: "BeautifulSoup") -> List[str]:
        """
        Extract all crawlable links from a page.

        Args:
            url: Base URL for relative link resolution
            soup: Parsed HTML document

        Returns:
            List[str]: Sorted list of absolute URLs to crawl,
            capped at Config.MAX_LINKS_PER_PAGE
        """
        base_domain = urlparse(url).netloc
        links = set()
        for link in soup.find_all('a', href=True):
            href = link['href'].split('#')[0]  # Remove fragments
            if not href or href.startswith('javascript:'):
                continue
            absolute_url = urljoin(url, href)
            sanitized_url = self.sanitize_url(absolute_url)
            if self.is_valid_url(sanitized_url, base_domain):
                links.add(sanitized_url)
        return sorted(links)[:Config.MAX_LINKS_PER_PAGE]  # Apply limit

    async def crawl(self, url: str) -> str:
        """
        Asynchronously crawl a single URL and return its text content.

        Args:
            url: URL to crawl

        Returns:
            str: Extracted text content

        Raises:
            Exception: If crawling fails (logged, then re-raised)
        """
        try:
            # Reuse the same browser-like headers as the synchronous path
            async with aiohttp.ClientSession(headers=self.headers) as session:
                async with session.get(url) as response:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    # Remove script and style elements
                    for element in soup(["script", "style"]):
                        element.decompose()
                    return soup.get_text()
        except Exception as e:
            self.logger.error(f"Crawling error: {str(e)}")
            raise

    def crawl_sync(self, start_url: str, max_pages: Optional[int] = None) -> List[Dict]:
        """
        Synchronously crawl a website using breadth-first search.

        Args:
            start_url: Initial URL to begin crawling
            max_pages: Maximum number of pages to collect; defaults to
                Config.MAX_PAGES_TO_CRAWL, resolved at call time (rather
                than import time) so configuration changes are respected

        Returns:
            List[Dict]: Structured documents from crawled pages
        """
        if max_pages is None:
            max_pages = Config.MAX_PAGES_TO_CRAWL
        base_domain = urlparse(start_url).netloc
        queue = deque([start_url])   # BFS frontier; deque gives O(1) popleft
        documents: List[Dict] = []   # Collected documents
        while queue and len(documents) < max_pages:
            sanitized_url = self.sanitize_url(queue.popleft())
            if sanitized_url in self.visited_urls:
                continue
            self.visited_urls.add(sanitized_url)
            self.logger.info(f"Crawling: {sanitized_url}")
            # Single fetch per page (previously each page was fetched twice:
            # once for its content and once more for its links)
            fetched = self._fetch_page(sanitized_url)
            if fetched is None:
                continue
            soup, response = fetched
            # Extract links BEFORE _build_document, which mutates the soup
            new_links = self.extract_links(sanitized_url, soup)
            page_data = self._build_document(sanitized_url, soup, response)
            if not page_data:
                continue  # Thin page: keep it out of results and don't expand its links
            documents.append(page_data)
            queue.extend(link for link in new_links
                         if link not in self.visited_urls)
        return documents