Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / tools.py

whyturbocharge

Upload 2 files

b864e2e verified 4 months ago

raw

history blame contribute delete

3.97 kB

	"""
	Custom Tools for GAIA Benchmark Agent

	Working tools:
	1. wikipedia_search - Search Wikipedia for factual information
	2. fetch_url_content - Fetch and extract text from web pages
	"""

	import os
	import requests
	from smolagents import tool
	from bs4 import BeautifulSoup


	@tool
	def wikipedia_search(query: str, lang: str = "en") -> str:
	    """Searches Wikipedia and returns the text of the most relevant article.

	    Runs a full-text search for the query, takes the top hit, and fetches that
	    page's plain-text extract. Problems are reported in the returned string
	    instead of being raised, so the caller always receives readable text.

	    Args:
	        query: The search query (e.g., "Mercedes Sosa discography")
	        lang: Language code for Wikipedia (default: "en")

	    Returns:
	        The article title and plain-text content (cut off after 8000 characters), or an error message if not found.
	    """
	    max_chars = 8000
	    try:
	        # lang becomes part of the hostname, so accept only code-like values
	        # such as "en", "simple" or "zh-min-nan". Anything containing dots,
	        # slashes, etc. could point the request at an arbitrary host.
	        lang_code = lang.strip().lower()
	        if not (lang_code.isascii() and lang_code.replace("-", "").isalnum()):
	            return f"ERROR: Invalid Wikipedia language code: {lang!r}"

	        search_url = f"https://{lang_code}.wikipedia.org/w/api.php"
	        headers = {
	            "User-Agent": "GAIABenchmarkAgent/1.0 (Educational project)"
	        }

	        # Step 1: find the best-matching page title for the query.
	        search_params = {
	            "action": "query",
	            "list": "search",
	            "srsearch": query,
	            "format": "json",
	            "srlimit": 1,
	        }
	        response = requests.get(search_url, params=search_params, headers=headers, timeout=10)
	        response.raise_for_status()
	        search_data = response.json()

	        search_results = search_data.get("query", {}).get("search", [])
	        if not search_results:
	            return f"No Wikipedia article found for: {query}"

	        page_title = search_results[0]["title"]

	        # Step 2: fetch the plain-text extract of that page.
	        # "exintro" is deliberately absent: the MediaWiki API treats a boolean
	        # parameter as true whenever it is present, whatever its value, so
	        # sending exintro=False would restrict the extract to the intro only.
	        content_params = {
	            "action": "query",
	            "titles": page_title,
	            "prop": "extracts",
	            "explaintext": True,
	            "format": "json",
	            "exsectionformat": "plain",
	        }
	        response = requests.get(search_url, params=content_params, headers=headers, timeout=10)
	        response.raise_for_status()
	        content_data = response.json()

	        pages = content_data.get("query", {}).get("pages", {})
	        if not pages:
	            return f"Could not retrieve content for: {page_title}"

	        # Only one title was requested, so the first entry is the page we want.
	        page = next(iter(pages.values()))
	        extract = page.get("extract", "")

	        if not extract:
	            return f"Wikipedia article '{page_title}' has no text content."

	        if len(extract) > max_chars:
	            extract = extract[:max_chars] + "\n\n[Content truncated...]"

	        return f"Wikipedia: {page_title}\n\n{extract}"

	    except requests.exceptions.RequestException as e:
	        return f"ERROR: Failed to search Wikipedia - {e}"
	    except Exception as e:
	        # Last-resort guard so unexpected response shapes still yield a message.
	        return f"ERROR: Wikipedia search failed - {e}"


	@tool
	def fetch_url_content(url: str) -> str:
	    """Fetches and extracts text content from a given URL.

	    Downloads the page, drops script, style, nav, header and footer elements,
	    and returns the remaining text with blank lines and padding removed.
	    Problems are reported in the returned string instead of being raised.

	    Args:
	        url: The URL to fetch content from

	    Returns:
	        The extracted text content from the webpage (cut off after 5000 characters), or an error message.
	    """
	    max_chars = 5000
	    # Written as a product so the two-space separator cannot be silently
	    # collapsed to one space by editors or copy/paste.
	    double_space = " " * 2
	    try:
	        headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
	        }
	        response = requests.get(url, headers=headers, timeout=15)
	        response.raise_for_status()

	        soup = BeautifulSoup(response.content, 'html.parser')

	        # Remove non-content elements
	        for element in soup(["script", "style", "nav", "header", "footer"]):
	            element.decompose()

	        text = soup.get_text()

	        # Clean up whitespace: trim every line, break a line only where it
	        # contains a run of two spaces, and drop empty pieces. Splitting on a
	        # single space would put every word on its own line.
	        lines = (line.strip() for line in text.splitlines())
	        chunks = (phrase.strip() for line in lines for phrase in line.split(double_space))
	        text = '\n'.join(chunk for chunk in chunks if chunk)

	        if len(text) > max_chars:
	            text = text[:max_chars] + "\n\n[Content truncated]"

	        return f"Content from {url}:\n\n{text}"

	    except requests.exceptions.RequestException as e:
	        return f"ERROR: Failed to fetch URL - {e}"
	    except Exception as e:
	        # Last-resort guard so parsing problems still yield a message.
	        return f"ERROR: {e}"


	# Tools this module makes available to agent.py.
	custom_tools = [wikipedia_search, fetch_url_content]