# PolicySummarizer / tools / web_scraper.py
# Provenance: uploaded by Nadasr ("Upload 3 files", commit 81ddc8e, verified).
"""
Web Scraper Tool - Fetches and extracts text from policy pages
"""
import requests
from bs4 import BeautifulSoup
from crewai.tools import tool
import time
from utils.validators import validate_url, sanitize_text, truncate_content, validate_content_length
from utils.logger import log_agent_action
# Configuration
REQUEST_TIMEOUT = 30  # seconds to wait for an HTTP response before giving up
MAX_RETRIES = 2       # extra attempts after the first failed fetch (3 tries total)
RETRY_DELAY = 2       # seconds to sleep between retry attempts
# Browser-like headers: some policy pages reject requests with a default
# library User-Agent, so we present a common desktop browser signature.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
}
def extract_text_from_html(html: str) -> str:
    """Pull readable text out of an HTML document.

    Strips scripts, styles, and page chrome, prefers a recognizable
    main-content container when one exists, then collapses the result into
    newline-separated lines, discarding fragments of 2 characters or fewer.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # Remove elements that never carry policy text.
    for junk in soup(['script', 'style', 'nav', 'header', 'footer', 'aside', 'form', 'button']):
        junk.decompose()

    # Prefer a dedicated content region; fall back to <body>, then the whole tree.
    selectors = ('main', 'article', '[role="main"]', '.content', '.policy-content', '#content')
    region = next(
        (hit for hit in (soup.select_one(sel) for sel in selectors) if hit),
        None,
    )
    if region is None:
        region = soup.body or soup

    raw = region.get_text(separator='\n', strip=True)
    kept = []
    for line in raw.split('\n'):
        stripped = line.strip()
        if len(stripped) > 2:  # drop blank lines and tiny fragments
            kept.append(stripped)
    return '\n'.join(kept)
def get_page_title(html: str) -> str:
    """Return the document's <title> text, falling back to the first <h1>.

    Returns "Unknown Policy" when neither element yields usable text.
    """
    soup = BeautifulSoup(html, 'html.parser')

    title_tag = soup.title
    if title_tag is not None and title_tag.string:
        return title_tag.string.strip()

    heading = soup.find('h1')
    if heading is not None:
        return heading.get_text(strip=True)

    return "Unknown Policy"
def _fetch_with_retry(url: str):
    """Fetch *url*, retrying transient failures up to MAX_RETRIES extra times.

    4xx client errors are raised immediately — retrying a 404/403 can never
    succeed, so sleeping and re-requesting just wastes time. Other request
    failures (timeouts, connection errors, 5xx) are retried after RETRY_DELAY.

    Raises:
        requests.exceptions.RequestException: when all attempts fail.
    """
    for attempt in range(MAX_RETRIES + 1):
        try:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response
        except requests.exceptions.HTTPError as e:
            # e.response can be None in edge cases; guard before reading status.
            status = e.response.status_code if e.response is not None else None
            if status is not None and 400 <= status < 500:
                raise  # permanent client error; do not retry
            if attempt >= MAX_RETRIES:
                raise  # bare raise preserves the original traceback
            time.sleep(RETRY_DELAY)
        except requests.exceptions.RequestException:
            if attempt >= MAX_RETRIES:
                raise
            time.sleep(RETRY_DELAY)


@tool("web_scraper")
def web_scraper_tool(url: str) -> str:
    """
    Scrapes text content from a policy webpage.

    Args:
        url: The URL of the policy page to scrape

    Returns:
        Extracted text content from the policy page, formatted as
        "TITLE: ...\\nWORD_COUNT: ...\\nCONTENT:\\n...", or an
        "Error: ..." string describing what went wrong. Errors are
        returned (not raised) so the calling agent can react to them.
    """
    start_time = time.time()

    # Validate the URL before doing any network I/O.
    is_valid, error_msg = validate_url(url)
    if not is_valid:
        log_agent_action("Web Scraper Tool", "URL Validation", "URL provided",
                         f"Failed: {error_msg}",
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"

    try:
        # Fetch with retry (transient errors only; see _fetch_with_retry).
        response = _fetch_with_retry(url)

        # Extract and clean the page content.
        html = response.text
        title = get_page_title(html)
        content = sanitize_text(extract_text_from_html(html))

        # Reject pages that yielded too little (or too much) usable text.
        is_valid, error_msg = validate_content_length(content)
        if not is_valid:
            log_agent_action("Web Scraper Tool", "Content Extraction", "HTML received", error_msg,
                             time.time() - start_time, False, error_msg)
            return f"Error: {error_msg}"

        content = truncate_content(content)
        word_count = len(content.split())
        log_agent_action("Web Scraper Tool", "Page Scraping", "URL fetched",
                         f"Extracted {word_count} words", time.time() - start_time, True)
        return f"TITLE: {title}\nWORD_COUNT: {word_count}\nCONTENT:\n{content}"

    except requests.exceptions.Timeout:
        error_msg = f"Request timed out after {REQUEST_TIMEOUT} seconds"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except requests.exceptions.HTTPError as e:
        # Guard: e.response may be None for synthesized HTTPErrors.
        status = e.response.status_code if e.response is not None else "unknown"
        error_msg = f"HTTP error: {status}"
        log_agent_action("Web Scraper Tool", "Page Fetching", "Attempting fetch", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"
    except Exception as e:
        # Last-resort catch so the agent always gets a string back.
        error_msg = f"Unexpected error: {str(e)}"
        log_agent_action("Web Scraper Tool", "Page Scraping", "Processing", error_msg,
                         time.time() - start_time, False, error_msg)
        return f"Error: {error_msg}"