# Uploaded by KPatelis ("Upload 26 files", commit dfd1417, verified); file size 14.6 kB.
"""Web search and fetching tools: DuckDuckGo, Tavily, Wikipedia, Arxiv, webpage fetch, YouTube transcripts."""
import re
from datetime import datetime
import requests
import trafilatura
import wikipedia
from bs4 import BeautifulSoup
from langchain_community.tools import DuckDuckGoSearchRun
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
from langchain_core.tools import tool
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
from gaia.utils import extract_youtube_id, load_config, download_task_file
# Wikipedia blocks/throttles requests with the default `wikipedia` package UA, which
# causes the API to return a non-JSON body and `requests.json()` to raise a
# `JSONDecodeError: Expecting value: line 1 column 1 (char 0)`. Setting an identifying
# UA per Wikipedia's policy fixes this for both `wiki_search` and `wikipedia_page_fetch`.
# The same string is also sent as the `User-Agent` header on the direct `requests.get`
# calls made in this module.
_USER_AGENT = "gaia-agent/0.1 (https://huggingface.co/spaces/KPatelis/Agents_Course_Assignment)"
wikipedia.set_user_agent(_USER_AGENT)
# Lazily-created search tool instances: both stay None until `_get_ddg()` /
# `_get_tavily()` builds them on first use.
_ddg_search = None
_tavily_search = None
def _get_ddg():
    """Return the module-wide DuckDuckGo search tool, building it on first call."""
    global _ddg_search
    if _ddg_search is not None:
        return _ddg_search
    # First use: construct the tool once and keep it for later calls.
    _ddg_search = DuckDuckGoSearchRun()
    return _ddg_search
def _get_tavily():
    """Return the module-wide Tavily search tool (max 3 results), building it on first call."""
    global _tavily_search
    if _tavily_search is not None:
        return _tavily_search
    # First use: construct the tool once and keep it for later calls.
    _tavily_search = TavilySearchResults(max_results=3)
    return _tavily_search
@tool
def duck_web_search(query: str) -> str:
    """Use DuckDuckGo to search the web.

    Args:
        query: The search query.
    """
    # NOTE: the success path returns a one-key dict even though the signature is
    # annotated ``-> str``; only the failure path returns a plain string.
    try:
        hits = _get_ddg().invoke(input=query)
    except Exception as exc:
        # Report the failure as text instead of raising out of the tool.
        return f"[duck_web_search] failed: {type(exc).__name__}: {exc}"
    return {"duckduckgo_web_search": hits}
@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return up to 3 distinct articles.

    Args:
        query: The search query."""
    # NOTE: the success path returns a one-key dict even though the signature is
    # annotated ``-> str``; only the failure path returns a plain string.
    try:
        loaded = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=20000).load()

        # Keep the first document seen for each non-empty title. A dict preserves
        # insertion order, so the loader's ordering is retained.
        first_by_title = {}
        for doc in loaded:
            name = doc.metadata.get("title", "")
            if name:
                first_by_title.setdefault(name, doc)

        sections = [
            f'Document title: {doc.metadata.get("title", "")}. Summary: {doc.metadata.get("summary", "")}. Documents details: {doc.page_content}'
            for doc in first_by_title.values()
        ]
        return {"wiki_results": "\n\n---\n\n".join(sections)}
    except Exception as exc:
        # Report the failure as text instead of raising out of the tool.
        return f"[wiki_search] failed: {type(exc).__name__}: {exc}"
# Size bounds that `_extract_navbox_text` applies to the combined text of all navboxes.
_NAVBOX_MIN_CHARS = 200  # combined navbox text shorter than this is treated as absent
_NAVBOX_MAX_CHARS = 15000  # cap navbox text to avoid blowing up context on huge pages
def _extract_navbox_text(html: str) -> str:
    """Collect the text of every ``div.navbox`` element in ``html`` as one string.

    Navboxes are the cross-link tables Wikipedia puts at the bottom of articles.
    Each navbox is reduced to single-space-separated text; non-empty results are
    joined with blank lines.

    Args:
        html: Rendered HTML of a Wikipedia page.

    Returns:
        The joined navbox text, truncated to ``_NAVBOX_MAX_CHARS`` characters, or
        ``""`` when the joined text is shorter than ``_NAVBOX_MIN_CHARS``.
    """
    tree = BeautifulSoup(html, "html.parser")
    # Collapse every run of whitespace inside a navbox to a single space.
    flattened = (
        re.sub(r"\s+", " ", box.get_text(" ", strip=True))
        for box in tree.find_all("div", class_="navbox")
    )
    combined = "\n\n".join(chunk for chunk in flattened if chunk).strip()
    if len(combined) >= _NAVBOX_MIN_CHARS:
        return combined[:_NAVBOX_MAX_CHARS]
    return ""
@tool
def wikipedia_page_fetch(title: str) -> str:
    """Fetch a Wikipedia page by title and return its body + navbox text.

    Args:
        title: The exact Wikipedia page title, optionally with a namespace prefix
            (e.g. ``"Wikipedia:Featured article candidates/Featured log/November 2016"``).

    Returns:
        On success: a multi-line string starting with ``"Wikipedia: <resolved title>"``,
        a ``URL:`` line, a blank line, the extracted body, and (if present) a
        ``--- Related (navbox) ---`` block.
        On failure: a string starting with ``[wikipedia_page_fetch] …`` describing
        the failure (page not found, disambiguation page, search fallback exhausted).
    """

    def _render(page, resolved_from=None):
        # Build the final text for an already-resolved `wikipedia` page object.
        suffix = f" (resolved from '{resolved_from}')" if resolved_from else ""
        header = f"Wikipedia: {page.title}{suffix}\nURL: {page.url}"
        # Body: prefer trafilatura (preserves lists and tables — critical for
        # counting-style questions). Fall back to page.content on failure.
        body = None
        downloaded = trafilatura.fetch_url(page.url)
        if downloaded is not None:
            body = trafilatura.extract(downloaded, include_tables=True, include_links=False)
        if not body:
            body = page.content
        # Navbox: append the cross-link tables that body extractors strip.
        navbox_section = ""
        try:
            navbox_text = _extract_navbox_text(page.html())
            if navbox_text:
                navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}"
        except Exception:
            # Deliberately best-effort: the navbox is an optional extra, so a
            # failure here must not discard the body that was already extracted.
            pass
        return f"{header}\n\n{body}{navbox_section}"

    try:
        page = wikipedia.page(title, auto_suggest=False)
        return _render(page)
    except wikipedia.exceptions.DisambiguationError as e:
        return f"[wikipedia_page_fetch] '{title}' is a disambiguation page. Options: {e.options[:10]}"
    except wikipedia.exceptions.PageError:
        # Recover from case-sensitivity / slight title mismatches by searching once and
        # fetching the top hit.
        try:
            hits = wikipedia.search(title, results=1)
        except Exception as e:
            return f"[wikipedia_page_fetch] page not found: '{title}'; search fallback failed: {e}"
        if not hits:
            return f"[wikipedia_page_fetch] page not found: '{title}' and no search hits."
        resolved = hits[0]
        if resolved == title:
            # The search only echoed the title that just failed; retrying would loop.
            return f"[wikipedia_page_fetch] page not found: '{title}'. Try wiki_search to find the correct title."
        try:
            page = wikipedia.page(resolved, auto_suggest=False)
        except Exception as e:
            return f"[wikipedia_page_fetch] resolved title '{resolved}' but fetch failed: {e}"
        # An exception raised inside this handler is NOT caught by the sibling
        # `except Exception` clause below, so rendering is guarded explicitly to
        # keep the "always return a string" contract on the fallback path too.
        try:
            return _render(page, resolved_from=title)
        except Exception as e:
            return f"[wikipedia_page_fetch] failed: {e}"
    except Exception as e:
        return f"[wikipedia_page_fetch] failed: {e}"
# English Wikipedia API endpoint that `_resolve_revision_at` queries for revision ids.
_WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"
def _resolve_revision_at(title: str, iso_timestamp: str) -> tuple[int | None, str | None, str | None]:
    """Find the revision of ``title`` that was current at ``iso_timestamp``.

    Sends one ``action=query`` / ``prop=revisions`` request that asks for a single
    revision, listing from ``iso_timestamp`` towards older revisions.

    Args:
        title: Page title to look up.
        iso_timestamp: Timestamp passed to the API as ``rvstart``.

    Returns:
        ``(revid, page_title, None)`` on success, where ``page_title`` is the title
        reported by the API (falling back to ``title``); otherwise
        ``(None, None, message)`` with a short description of what went wrong.
    """
    query = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": title,
        "rvprop": "ids|timestamp",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": iso_timestamp,
    }
    try:
        response = requests.get(
            _WIKI_API_ENDPOINT,
            params=query,
            headers={"User-Agent": _USER_AGENT},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except Exception as exc:
        # Network errors, HTTP error statuses and non-JSON bodies all end up here.
        return None, None, f"API request failed: {type(exc).__name__}: {exc}"

    page_map = payload.get("query", {}).get("pages", {})
    if not page_map:
        return None, None, "API returned no pages"

    # Only one title was requested, so only the first entry is inspected.
    entry = next(iter(page_map.values()))
    if "missing" in entry:
        # Callers match on this "page not found" prefix to trigger a search fallback.
        return None, None, f"page not found: '{title}'"

    history = entry.get("revisions") or []
    if history:
        return history[0]["revid"], entry.get("title", title), None
    return None, None, f"no revisions for '{title}' on or before {iso_timestamp}"
@tool
def wikipedia_page_as_of(title: str, date: str) -> str:
    """Fetch a Wikipedia page as it existed at end of day UTC on a specific date.

    Args:
        title: Wikipedia page title (e.g. ``"Taishō Tamai"``,
            ``"Hokkaido Nippon-Ham Fighters"``, ``"1928 Summer Olympics"``).
        date: Target date in ISO ``"YYYY-MM-DD"`` format (e.g. ``"2023-07-31"``).
            The page is fetched as it appeared at 23:59:59 UTC on that day.

    Returns:
        On success: a multi-line string ``"Wikipedia: <title> (as of <date>, revid <id>) / URL: <oldid URL> / <body> / --- Related (navbox) ---"``.
        On failure: a string starting with ``[wikipedia_page_as_of] …`` describing
        the failure (invalid date, page not found, revision lookup failure,
        rendered-HTML fetch failure).
    """
    # Step 1: validate the date and turn it into an end-of-day cutoff timestamp.
    try:
        day = datetime.strptime(date, "%Y-%m-%d")
    except ValueError:
        return f"[wikipedia_page_as_of] invalid date '{date}'; expected YYYY-MM-DD."
    cutoff = day.strftime("%Y-%m-%dT23:59:59Z")

    # Step 2: find the revision id, retrying once via search when the title is unknown.
    revid, resolved_title, err = _resolve_revision_at(title, cutoff)
    if err and err.startswith("page not found"):
        # Case-/spelling-tolerant fallback: search and retry the top hit.
        try:
            candidates = wikipedia.search(title, results=1)
        except Exception as exc:
            return f"[wikipedia_page_as_of] page not found and search failed: {exc}"
        if not candidates or candidates[0] == title:
            return f"[wikipedia_page_as_of] page not found: '{title}'"
        revid, resolved_title, err = _resolve_revision_at(candidates[0], cutoff)
    if err:
        return f"[wikipedia_page_as_of] {err}"

    # Step 3: download the rendered HTML of that exact revision.
    url = f"https://en.wikipedia.org/w/index.php?oldid={revid}"
    try:
        resp = requests.get(url, headers={"User-Agent": _USER_AGENT}, timeout=30)
        resp.raise_for_status()
        html = resp.text
    except Exception as exc:
        return f"[wikipedia_page_as_of] could not fetch revision URL {url}: {type(exc).__name__}: {exc}"

    # Step 4: extract the body; a page with no extractable body is a failure.
    body = trafilatura.extract(html, include_tables=True, include_links=False)
    if not body:
        return f"[wikipedia_page_as_of] no body extracted from {url}"

    # Step 5: append navbox text when available. This is best-effort, so any
    # error while extracting it simply leaves the section out.
    try:
        navbox_text = _extract_navbox_text(html)
    except Exception:
        navbox_text = ""
    related = f"\n\n--- Related (navbox) ---\n{navbox_text}" if navbox_text else ""

    return f"Wikipedia: {resolved_title} (as of {date}, revid {revid})\nURL: {url}\n\n{body}{related}"
@tool
def arxiv_search(query: str) -> str:
    """Search Arxiv for a query and return maximum 3 result.

    Args:
        query: The search query."""
    # NOTE: the success path returns a one-key dict even though the signature is
    # annotated ``-> str``; only the failure path returns a plain string.
    try:
        papers = ArxivLoader(query=query, load_max_docs=3).load()
        sections = []
        for paper in papers:
            meta = paper.metadata
            sections.append(
                f'Document title: {meta.get("title", "")}. Summary: {meta.get("summary", "")}. Documents details: {paper.page_content}'
            )
        return {"arxiv_results": "\n\n---\n\n".join(sections)}
    except Exception as exc:
        # Report the failure as text instead of raising out of the tool.
        return f"[arxiv_search] failed: {type(exc).__name__}: {exc}"
@tool
def tavily_web_search(query: str) -> str:
    """Search the web using Tavily for a query and return maximum 3 results.

    Args:
        query: The search query."""
    # NOTE: the success path returns a one-key dict even though the signature is
    # annotated ``-> str``; only the failure paths return a plain string.
    try:
        search_documents = _get_tavily().invoke(input=query)
        # The Tavily tool can hand back an error message as a plain string instead
        # of a list of result dicts. Indexing its characters below would raise a
        # TypeError that hides the real cause, so surface the message directly.
        if isinstance(search_documents, str):
            return f"[tavily_web_search] failed: {search_documents}"
        web_results = "\n\n---\n\n".join(
            [
                f'Document title: {document["title"]}. Contents: {document["content"]}. Relevance Score: {document["score"]}'
                for document in search_documents
            ])
        return {"web_results": web_results}
    except Exception as e:
        # Report the failure as text instead of raising out of the tool.
        return f"[tavily_web_search] failed: {type(e).__name__}: {e}"
@tool
def fetch_webpage(url: str) -> str:
    """
    Fetch and extract the main text content from a webpage.
    Use this when a search result points to a specific URL you need to read in full.

    Args:
        url: The full URL of the page to fetch.

    Returns:
        The extracted text content of the page.
    """
    try:
        raw_page = trafilatura.fetch_url(url)
        if raw_page is not None:
            extracted = trafilatura.extract(raw_page, include_tables=True, include_links=False)
            if extracted is not None:
                return f"Page content from {url}:\n\n{extracted}"
            # The download worked but no main content could be pulled out of it.
            return f"[fetch_webpage] could not extract content from {url}"
        # The download itself produced nothing.
        return f"[fetch_webpage] could not fetch {url}"
    except Exception as exc:
        # Report the failure as text instead of raising out of the tool.
        return f"[fetch_webpage] failed: {exc}"
@tool
def retry_file_download(task_id: str, file_name: str) -> str:
    """Retry downloading the task file from the GAIA scoring API.

    Args:
        task_id: The task ID for the current question.
        file_name: The original file name from the question metadata.

    Returns:
        Local filesystem path to the downloaded file, or an error description.
    """
    # Loading the config, reading its ``api`` keys and the download call are all
    # guarded, so a failure comes back as the promised error description rather
    # than an exception, matching the other tools in this module.
    try:
        cfg = load_config()
        local_path, err = download_task_file(
            task_id=task_id,
            file_name=file_name,
            base_url=cfg["api"]["base_url"],
            files_dir=cfg["api"]["files_dir"],
        )
    except Exception as e:
        return f"[retry_file_download] failed: {type(e).__name__}: {e}"
    if local_path:
        return local_path
    # No path came back: pass on the error reported by the download helper.
    return f"[retry_file_download] {err}"
@tool
def youtube_transcript(url: str) -> str:
    """Fetch the transcript (captions) of a YouTube video as plain text.

    Args:
        url: The full YouTube URL (watch, youtu.be, embed, shorts) or a bare 11-char video ID.

    Returns:
        The concatenated transcript text, or an error string starting with `[youtube_transcript]`.
    """
    video_id = extract_youtube_id(url)
    if not video_id:
        return f"[youtube_transcript] could not parse video ID from: {url}"

    try:
        api = YouTubeTranscriptApi()
        try:
            # Ask for an English transcript first.
            snippets = api.fetch(video_id, languages=['en'])
        except NoTranscriptFound:
            # No English transcript: fall back to the first one the video lists.
            # Errors raised here propagate to the outer handlers below.
            first_available = next(iter(api.list(video_id)))
            snippets = first_available.fetch()
        pieces = [snippet.text for snippet in snippets]
        return f"YouTube transcript for {url}:\n\n{' '.join(pieces)}"
    except TranscriptsDisabled:
        return f"[youtube_transcript] transcripts are disabled for {url}"
    except VideoUnavailable:
        return f"[youtube_transcript] video unavailable: {url}"
    except NoTranscriptFound:
        return f"[youtube_transcript] no transcript found for {url}"
    except Exception as exc:
        # Anything else is reported as text instead of raising out of the tool.
        return f"[youtube_transcript] failed: {exc}"