| """ |
| Web scraping tools for extracting content from web pages. |
| """ |
|
|
import urllib.parse
from typing import Optional

import requests
from bs4 import BeautifulSoup
from smolagents import tool


@tool
def scrape_webpage_content(url: str, content_selector: Optional[str] = None) -> str:
| """ |
| Scrape content from a webpage and extract the main text content. |
| |
| Args: |
| url: The URL of the webpage to scrape |
| content_selector: Optional CSS selector to target specific content (e.g., '.article__content', '#main-content') |
| |
| Returns: |
| The extracted text content from the webpage |
| """ |
    try:
        # Validate that the URL has both a scheme and a host before fetching.
        parsed_url = urllib.parse.urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Invalid URL: {url}"

        # Use browser-like headers; some sites block the default
        # python-requests User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

        # Fetch the page and fail fast on HTTP errors.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that rarely contain main content.
        for element in soup(["script", "style", "nav", "header", "footer", "aside"]):
            element.decompose()

        # A caller-supplied CSS selector takes precedence over the heuristics below.
        if content_selector:
            content_element = soup.select_one(content_selector)
            if content_element:
                text_content = content_element.get_text(strip=True, separator=' ')
            else:
                return f"No content found with selector '{content_selector}' on {url}"
        else:
            # Try common main-content containers, in priority order.
            content_selectors = [
                'article',
                '.article__content',
                '.content',
                '.post-content',
                '.entry-content',
                '#content',
                'main',
                '.main-content',
                '[role="main"]',
            ]

            text_content = None
            for selector in content_selectors:
                element = soup.select_one(selector)
                if element:
                    text_content = element.get_text(strip=True, separator=' ')
                    break

            # Fall back to the full <body> text, or the whole document.
            if not text_content:
                body = soup.find('body')
                if body:
                    text_content = body.get_text(strip=True, separator=' ')
                else:
                    text_content = soup.get_text(strip=True, separator=' ')

        if text_content:
            # get_text(separator=' ') yields a single line, so collapse runs of
            # whitespace rather than splitting on newlines that never occur.
            cleaned_text = ' '.join(text_content.split())

            # Truncate long pages so the tool output stays manageable.
            if len(cleaned_text) > 5000:
                cleaned_text = cleaned_text[:5000] + "... [Content truncated]"

            return f"Content from {url}:\n\n{cleaned_text}"
        else:
            return f"No readable content found on {url}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"
|
|
|
|
@tool
def extract_links_from_webpage(url: str, link_text_filter: Optional[str] = None) -> str:
| """ |
| Extract links from a webpage, optionally filtering by link text. |
| |
| Args: |
| url: The URL of the webpage to scrape |
| link_text_filter: Optional text to filter links by (case-insensitive) |
| |
| Returns: |
| A formatted string containing the extracted links |
| """ |
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Collect every anchor that has an href attribute.
        links = soup.find_all('a', href=True)

        extracted_links = []
        for link in links:
            href = link['href']
            text = link.get_text(strip=True)

            # Skip in-page anchors, then resolve relative URLs (e.g. '/path'
            # or 'path') against the page URL.
            if href.startswith('#'):
                continue
            href = urllib.parse.urljoin(url, href)

            # Apply the optional case-insensitive text filter.
            if link_text_filter and link_text_filter.lower() not in text.lower():
                continue

            if text and href.startswith('http'):
                extracted_links.append(f"• {text}: {href}")

        if extracted_links:
            # Cap the output at 20 links and note how many were omitted.
            result = f"Links extracted from {url}:\n\n" + '\n'.join(extracted_links[:20])
            if len(extracted_links) > 20:
                result += f"\n... and {len(extracted_links) - 20} more links"
            return result
        else:
            return f"No links found on {url}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {str(e)}"
    except Exception as e:
        return f"Error extracting links from {url}: {str(e)}"
|
|
|
|
| @tool |
| def get_webpage_metadata(url: str) -> str: |
| """ |
| Extract metadata from a webpage (title, description, etc.). |
| |
| Args: |
| url: The URL of the webpage to analyze |
| |
| Returns: |
| A formatted string containing the webpage metadata |
| """ |
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        metadata = []

        # Page <title>
        title = soup.find('title')
        if title:
            metadata.append(f"Title: {title.get_text(strip=True)}")

        # Standard meta tags: description, keywords, author.
        meta_desc = soup.find('meta', attrs={'name': 'description'})
        if meta_desc and meta_desc.get('content'):
            metadata.append(f"Description: {meta_desc['content']}")

        meta_keywords = soup.find('meta', attrs={'name': 'keywords'})
        if meta_keywords and meta_keywords.get('content'):
            metadata.append(f"Keywords: {meta_keywords['content']}")

        meta_author = soup.find('meta', attrs={'name': 'author'})
        if meta_author and meta_author.get('content'):
            metadata.append(f"Author: {meta_author['content']}")

        # Open Graph tags used for link previews on social platforms.
        og_title = soup.find('meta', attrs={'property': 'og:title'})
        if og_title and og_title.get('content'):
            metadata.append(f"OG Title: {og_title['content']}")

        og_desc = soup.find('meta', attrs={'property': 'og:description'})
        if og_desc and og_desc.get('content'):
            metadata.append(f"OG Description: {og_desc['content']}")

        if metadata:
            return f"Metadata from {url}:\n\n" + '\n'.join(metadata)
        else:
            return f"No metadata found on {url}"

    except requests.exceptions.RequestException as e:
        return f"Error fetching webpage {url}: {str(e)}"
    except Exception as e:
        return f"Error extracting metadata from {url}: {str(e)}"
|
|