# NOTE(review): stray scraped-page residue ("Spaces: Sleeping") removed so the
# module parses; the actual source begins below.
| """ | |
| Web Scraper Tool - Fetches and extracts text from policy pages | |
| """ | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from crewai.tools import tool | |
| import time | |
| from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length | |
| from utils.logger import log_agent_action | |
| # Configuration | |
| REQUEST_TIMEOUT = 30 | |
| MAX_RETRIES = 2 | |
| RETRY_DELAY = 2 | |
| HEADERS = { | |
| 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', | |
| 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', | |
| 'Accept-Language': 'en-US,en;q=0.5', | |
| } | |
def extract_text_from_html(html: str) -> str:
    """Extract clean, readable text from an HTML document.

    Strips chrome elements (scripts, styles, navigation, forms), then tries a
    list of common main-content selectors before falling back to <body>.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        Newline-joined text lines; lines of 2 characters or fewer are dropped
        as likely layout noise.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove elements that never contain policy text.
    for element in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
        element.decompose()

    # Prefer a dedicated main-content container; first match wins.
    main_content = None
    for selector in ['main', 'article', '[role="main"]', '.content', '.policy-content', '#content']:
        main_content = soup.select_one(selector)
        if main_content:
            break

    if not main_content:
        # No recognizable container: fall back to the whole body (or document).
        main_content = soup.body if soup.body else soup

    text = main_content.get_text(separator='\n', strip=True)
    # Drop empty/trivially short lines (bullets, stray punctuation, etc.).
    lines = [line.strip() for line in text.split('\n') if line.strip() and len(line.strip()) > 2]
    return '\n'.join(lines)
def get_page_title(html: str) -> str:
    """Extract the page title from HTML.

    Prefers the <title> element, falls back to the first <h1>, and returns a
    fixed placeholder when neither is present.

    Args:
        html: Raw HTML markup of the page.

    Returns:
        The stripped title text, or "Unknown Policy" if none was found.
    """
    soup = BeautifulSoup(html, 'html.parser')
    if soup.title and soup.title.string:
        return soup.title.string.strip()
    h1 = soup.find('h1')
    if h1:
        return h1.get_text(strip=True)
    return "Unknown Policy"
def web_scraper_tool(url: str) -> str:
    """
    Scrapes text content from a policy webpage.

    Validates the URL, fetches it with retries, extracts and sanitizes the
    main text, and logs every outcome via log_agent_action. All failures are
    reported as a returned "Error: ..." string rather than raised, so the
    calling agent always receives a usable message.

    Args:
        url: The URL of the policy page to scrape

    Returns:
        On success, a string of the form
        "TITLE: ...\\nWORD_COUNT: ...\\nCONTENT:\\n..."; on failure,
        "Error: <reason>".
    """
    start_time = time.time()

    # Validate URL before touching the network.
    is_valid, error_msg = validate_url(url)
    if not is_valid:
        log_agent_action("Web Scraper Tool", "URL Validation", "URL provided", f"Failed: {error_msg}",
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        # Fetch with retry: any RequestException (timeouts, HTTP errors via
        # raise_for_status, connection failures) triggers a delayed retry;
        # the last failure is re-raised to the handlers below.
        response = None
        for attempt in range(MAX_RETRIES + 1):
            try:
                response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                response.raise_for_status()
                break
            except requests.exceptions.RequestException:
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)
                else:
                    raise

        # Extract content
        html = response.text
        title = get_page_title(html)
        content = extract_text_from_html(html)
        content = sanitize_text(content)

        # Reject pages whose extracted text is outside acceptable bounds.
        is_valid, error_msg = validate_content_length(content)
        if not is_valid:
            log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg,
                             time.time() - start_time, False, error_msg)
            return f"Error: {error_msg}"

        content = truncate_content(content)
        word_count = len(content.split())

        log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched",
                         f"Extracted {word_count} words", time.time() - start_time, True)
        return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}"

    except requests.exceptions.Timeout:
        error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except requests.exceptions.HTTPError as e:
        error_msg = f"HTTP error: {e.response.status_code}"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except Exception as e:
        # Boundary catch-all: this tool must never raise into the agent loop.
        error_msg = f"Unexpected error: {str(e)}"
        log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"