# whyturbocharge's picture
# Upload 2 files
# b864e2e verified
"""
Custom Tools for GAIA Benchmark Agent
Working tools:
1. wikipedia_search - Search Wikipedia for factual information
2. fetch_url_content - Fetch and extract text from web pages
"""
import os
import requests
from smolagents import tool
from bs4 import BeautifulSoup
@tool
def wikipedia_search(query: str, lang: str = "en") -> str:
    """Searches Wikipedia and returns the plain text of the most relevant article.

    Runs a full-text search through the MediaWiki Action API, takes the top hit
    and fetches its plain-text extract. The text is capped at 8000 characters.
    Failures are reported as a returned message instead of a raised exception.

    Args:
        query: The search query (e.g., "Mercedes Sosa discography")
        lang: Language code for Wikipedia (default: "en")

    Returns:
        The article title and text, or an error message if not found.
    """
    max_chars = 8000

    if not query or not query.strip():
        return "ERROR: Wikipedia search failed - empty query"

    # `lang` becomes part of the host name, so only accept characters that can
    # appear in a Wikipedia language subdomain ("en", "simple", "zh-yue", ...).
    lang_code = (lang or "en").strip().lower()
    if not (lang_code.isascii() and lang_code.replace("-", "").isalnum()):
        return f"ERROR: Wikipedia search failed - invalid language code {lang!r}"

    try:
        api_url = f"https://{lang_code}.wikipedia.org/w/api.php"
        headers = {
            "User-Agent": "GAIABenchmarkAgent/1.0 (Educational project)"
        }

        # Step 1: full-text search, keeping only the best match.
        search_params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": 1,
        }
        response = requests.get(api_url, params=search_params, headers=headers, timeout=10)
        response.raise_for_status()
        search_results = response.json().get("query", {}).get("search", [])
        if not search_results:
            return f"No Wikipedia article found for: {query}"
        page_title = search_results[0]["title"]

        # Step 2: fetch the plain-text extract of that page.
        # MediaWiki treats a boolean parameter as true whenever it is present,
        # whatever its value, so sending "exintro=False" would still restrict
        # the extract to the intro section. It is left out on purpose so the
        # whole article is returned.
        content_params = {
            "action": "query",
            "titles": page_title,
            "prop": "extracts",
            "explaintext": 1,
            "format": "json",
            "exsectionformat": "plain",
        }
        response = requests.get(api_url, params=content_params, headers=headers, timeout=10)
        response.raise_for_status()
        pages = response.json().get("query", {}).get("pages", {})
        if not pages:
            return f"Could not retrieve content for: {page_title}"

        # "pages" is keyed by page id; a single title was requested, so the
        # first (only) entry is the one we want.
        page = next(iter(pages.values()))
        extract = page.get("extract", "")
        if not extract:
            return f"Wikipedia article '{page_title}' has no text content."

        if len(extract) > max_chars:
            extract = extract[:max_chars] + "\n\n[Content truncated...]"
        return f"Wikipedia: {page_title}\n\n{extract}"
    except requests.exceptions.RequestException as e:
        return f"ERROR: Failed to search Wikipedia - {str(e)}"
    except Exception as e:
        # Tool boundary: the agent expects a string back, never an exception.
        return f"ERROR: Wikipedia search failed - {str(e)}"
@tool
def fetch_url_content(url: str) -> str:
    """Fetches and extracts text content from a given URL.

    Downloads the page, removes script, style, nav, header and footer
    elements, and returns the remaining text with whitespace normalised:
    one output line per non-empty line of page text, words separated by
    single spaces. The text is capped at 5000 characters. Failures are
    reported as a returned message instead of a raised exception.

    Args:
        url: The URL to fetch content from

    Returns:
        The extracted text content from the webpage, or an error message.
    """
    max_chars = 5000
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Pass raw bytes so BeautifulSoup can work out the encoding itself.
        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove non-content elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()
        raw_text = soup.get_text()

        # Collapse runs of whitespace inside each line and drop blank lines.
        # Splitting each line on a single space and re-joining with newlines
        # would put every word on its own line and lose sentence structure.
        cleaned_lines = (" ".join(line.split()) for line in raw_text.splitlines())
        text = '\n'.join(line for line in cleaned_lines if line)

        if len(text) > max_chars:
            text = text[:max_chars] + "\n\n[Content truncated]"
        return f"Content from {url}:\n\n{text}"
    except requests.exceptions.RequestException as e:
        return f"ERROR: Failed to fetch URL - {str(e)}"
    except Exception as e:
        # Tool boundary: the agent expects a string back, never an exception.
        return f"ERROR: {str(e)}"
# Tools made available to agent.py, in the order they are defined above.
custom_tools = [wikipedia_search, fetch_url_content]