"""
Custom tools for the GAIA Benchmark Agent.

Working tools:
1. wikipedia_search - Search Wikipedia for factual information
2. fetch_url_content - Fetch and extract text from web pages
"""
|
|
| import os |
| import requests |
| from smolagents import tool |
| from bs4 import BeautifulSoup |
|
|
|
|
@tool
def wikipedia_search(query: str, lang: str = "en") -> str:
    """Searches Wikipedia and returns the text of the most relevant article.

    Args:
        query: The search query (e.g., "Mercedes Sosa discography")
        lang: Language code for Wikipedia (default: "en")

    Returns:
        The article title and its plain-text content (truncated to 8000 characters), or an error message if not found.
    """
    max_chars = 8000

    # Fail fast on an empty query instead of spending a network round-trip
    # on a request the API would reject anyway.
    if not isinstance(query, str) or not query.strip():
        return "ERROR: Wikipedia search failed - empty query"

    # The language code becomes part of the hostname, so only accept the
    # characters used by Wikipedia subdomains (e.g. "en", "simple",
    # "zh-min-nan"). Anything else could redirect the request to another host.
    if (
        not isinstance(lang, str)
        or not lang.isascii()
        or not lang.replace("-", "").isalnum()
    ):
        return f"ERROR: Invalid Wikipedia language code: {lang!r}"

    api_url = f"https://{lang}.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "GAIABenchmarkAgent/1.0 (Educational project)"
    }

    try:
        # Step 1: full-text search, keeping only the best-ranked hit.
        search_params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "format": "json",
            "srlimit": 1,
        }
        response = requests.get(api_url, params=search_params, headers=headers, timeout=10)
        response.raise_for_status()
        search_results = response.json().get("query", {}).get("search", [])
        if not search_results:
            return f"No Wikipedia article found for: {query}"

        page_title = search_results[0]["title"]

        # Step 2: fetch the plain-text extract of that page.
        # MediaWiki boolean parameters are true whenever they are present,
        # whatever their value, so "exintro" must be omitted (not set to
        # False) to receive the whole article rather than only the intro.
        content_params = {
            "action": "query",
            "titles": page_title,
            "prop": "extracts",
            "explaintext": 1,
            "format": "json",
            "exsectionformat": "plain",
        }
        response = requests.get(api_url, params=content_params, headers=headers, timeout=10)
        response.raise_for_status()
        pages = response.json().get("query", {}).get("pages", {})
        if not pages:
            return f"Could not retrieve content for: {page_title}"

        # A single title was requested, so the first page entry is the one
        # we asked for.
        page = next(iter(pages.values()))
        extract = page.get("extract", "")
        if not extract:
            return f"Wikipedia article '{page_title}' has no text content."

        if len(extract) > max_chars:
            extract = extract[:max_chars] + "\n\n[Content truncated...]"

        return f"Wikipedia: {page_title}\n\n{extract}"

    except requests.exceptions.RequestException as e:
        return f"ERROR: Failed to search Wikipedia - {str(e)}"
    except Exception as e:
        # Tool boundary: the agent expects a string result, so unexpected
        # failures (e.g. a malformed API payload) are reported, not raised.
        return f"ERROR: Wikipedia search failed - {str(e)}"
|
|
|
|
@tool
def fetch_url_content(url: str) -> str:
    """Fetches and extracts text content from a given URL.

    Args:
        url: The URL to fetch content from

    Returns:
        The extracted text content from the webpage (truncated to 5000 characters), or an error message.
    """
    max_chars = 5000
    # Written as a product so the two-space separator cannot be silently
    # collapsed into a single space by editors or formatters.
    double_space = " " * 2

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop elements that carry no article text (code, styling, page chrome).
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        raw_text = soup.get_text()

        # Normalise whitespace: trim every line, break a line only where two
        # or more spaces separate phrases, and discard blank fragments.
        # Splitting on a single space would put each word on its own line
        # and destroy sentence structure.
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        chunks = (
            phrase.strip()
            for line in stripped_lines
            for phrase in line.split(double_space)
        )
        text = '\n'.join(chunk for chunk in chunks if chunk)

        if len(text) > max_chars:
            text = text[:max_chars] + "\n\n[Content truncated]"

        return f"Content from {url}:\n\n{text}"

    except requests.exceptions.RequestException as e:
        return f"ERROR: Failed to fetch URL - {str(e)}"
    except Exception as e:
        # Tool boundary: the agent expects a string result, so unexpected
        # failures (e.g. a parser error) are reported, not raised.
        return f"ERROR: {str(e)}"
|
|
|
|
| |
# Registry of the tools defined in this module, in the order they are listed
# in the module docstring.
custom_tools = [wikipedia_search, fetch_url_content]
|
|