""" Web Scraper Tool - Fetches and extracts text from policy pages """ import requests from bs4 import BeautifulSoup from crewai.tools import tool import time from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length from utils.logger import log_agent_action # Configuration REQUEST_TIMEOUT = 30 MAX_RETRIES = 2 RETRY_DELAY = 2 HEADERS = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', } def extract_text_from_html(html: str) -> str: """Extract clean text from HTML content.""" soup = BeautifulSoup(html, 'html.parser') # Remove unwanted elements for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']): element.decompose() # Try to find main content main_content = None for selector in ['main', 'article', '[role="main"]', '.content', '.policy-content', '#content']: main_content = soup.select_one(selector) if main_content: break if not main_content: main_content = soup.body if soup.body else soup text = main_content.get_text(separator='\n', strip=True) lines = [line.strip() for line in text.split('\n') if line.strip() and len(line.strip()) > 2] return '\n'.join(lines) def get_page_title(html: str) -> str: """Extract page title from HTML""" soup = BeautifulSoup(html, 'html.parser') if soup.title and soup.title.string: return soup.title.string.strip() h1 = soup.find('h1') if h1: return h1.get_text(strip=True) return "Unknown Policy" @tool("web_scraper") def web_scraper_tool(url: str) -> str: """ Scrapes text content from a policy webpage. Args: url: The URL of the policy page to scrape Returns: Extracted text content from the policy page """ start_time = time.time() # Validate URL is_valid, error_msg = validate_url(url) if not is_valid: log_agent_action("Web Scraper Tool", "URL Validation", f"URL provided", f"Failed: {error_msg}", time.time() - start_time, False, error_msg) return f"Error: {error_msg}" try: # Fetch with retry response = None for attempt in range(MAX_RETRIES + 1): try: response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT) response.raise_for_status() break except requests.exceptions.RequestException as e: if attempt < MAX_RETRIES: time.sleep(RETRY_DELAY) else: raise e # Extract content html = response.text title = get_page_title(html) content = extract_text_from_html(html) content = sanitize_text(content) # Validate content is_valid, error_msg = validate_content_length(content) if not is_valid: log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg, time.time() - start_time, False, error_msg) return f"Error: {error_msg}" content = truncate_content(content) word_count = len(content.split()) log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched", f"Extracted {word_count} words", time.time() - start_time, True) return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}" except requests.exceptions.Timeout: error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds" log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg, time.time() - start_time, False, error_msg) return f"Error: {error_msg}" except requests.exceptions.HTTPError as e: error_msg = f"HTTP error: {e.response.status_code}" log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg, time.time() - start_time, False, error_msg) return f"Error: {error_msg}" except Exception as e: error_msg = f"Unexpected error: {str(e)}" log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg, time.time() - start_time, False, error_msg) return f"Error: {error_msg}"