"""
Web Scraper Tool - Fetches and extracts text from policy pages
"""
import requests
from bs4 import BeautifulSoup
from crewai.tools import tool
import time
from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length
from utils.logger import log_agent_action
# Configuration for outbound HTTP fetches.
REQUEST_TIMEOUT = 30  # seconds before a single fetch attempt is abandoned
MAX_RETRIES = 2       # extra attempts after the first failed request
RETRY_DELAY = 2       # seconds to sleep between retry attempts
# Browser-like request headers; many policy pages reject clients without a UA string.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}
def extract_text_from_html(html: str) -> str:
    """Return the readable text of an HTML document, stripped of boilerplate.

    Non-content tags (scripts, navigation, chrome) are removed, the main
    content region is located when one exists, and short/blank lines are
    filtered out of the extracted text.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Strip tags that never carry policy text.
    for junk in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
        junk.decompose()

    # Prefer a dedicated content container; fall back to <body>, then the whole tree.
    selectors = ('main', 'article', '[role="main"]', '.content', '.policy-content', '#content')
    region = next(
        (hit for sel in selectors if (hit := soup.select_one(sel))),
        None,
    )
    if not region:
        region = soup.body if soup.body else soup

    raw = region.get_text(separator='\n', strip=True)

    # Keep only lines with meaningful content (more than 2 characters).
    kept = []
    for line in raw.split('\n'):
        trimmed = line.strip()
        if trimmed and len(trimmed) > 2:
            kept.append(trimmed)
    return '\n'.join(kept)
def get_page_title(html: str) -> str:
    """Pull a human-readable title out of an HTML document.

    Prefers the <title> element, falls back to the first <h1>, and returns
    "Unknown Policy" when neither yields text.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # <title> is the most reliable source when present and non-empty.
    title_tag = soup.title
    if title_tag and title_tag.string:
        return title_tag.string.strip()

    # Next best: the page's first top-level heading.
    heading = soup.find('h1')
    if heading:
        return heading.get_text(strip=True)

    return "Unknown Policy"
def _fetch_with_retry(url: str) -> requests.Response:
    """GET *url* with browser-like headers, retrying on any request failure.

    Makes up to MAX_RETRIES + 1 attempts, sleeping RETRY_DELAY seconds
    between them. Raises the final requests.exceptions.RequestException
    (including Timeout / HTTPError) if every attempt fails.
    """
    for attempt in range(MAX_RETRIES + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException:
            if attempt >= MAX_RETRIES:
                # Bare raise preserves the original traceback (fixes `raise e`).
                raise
            time.sleep(RETRY_DELAY)


@tool("web_scraper")
def web_scraper_tool(url: str) -> str:
    """
    Scrapes text content from a policy webpage.

    Args:
        url: The URL of the policy page to scrape

    Returns:
        Extracted text content from the policy page on success, formatted as
        "TITLE: ...\\nWORD_COUNT: ...\\nCONTENT:\\n...". On any failure
        (invalid URL, network error, HTTP error, empty content) an
        "Error: ..." string is returned instead of raising.
    """
    start_time = time.time()

    # Reject malformed/unsupported URLs before touching the network.
    is_valid, error_msg = validate_url(url)
    if not is_valid:
        log_agent_action("Web Scraper Tool", "URL Validation", "URL provided", f"Failed: {error_msg}",
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        response = _fetch_with_retry(url)

        # Extract the title and clean body text from the fetched page.
        html = response.text
        title = get_page_title(html)
        content = sanitize_text(extract_text_from_html(html))

        # Guard against empty or suspiciously short pages.
        is_valid, error_msg = validate_content_length(content)
        if not is_valid:
            log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg,
                             time.time() - start_time, False, error_msg)
            return f"Error: {error_msg}"

        content = truncate_content(content)
        word_count = len(content.split())
        log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched",
                         f"Extracted {word_count} words", time.time() - start_time, True)
        return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}"

    except requests.exceptions.Timeout:
        error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except requests.exceptions.HTTPError as e:
        # e.response can be None in edge cases; don't mask the real error
        # with an AttributeError while formatting the message.
        status = e.response.status_code if e.response is not None else "unknown"
        error_msg = f"HTTP error: {status}"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except Exception as e:
        error_msg = f"Unexpected error: {str(e)}"
        log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"