"""Web search and fetching tools: DuckDuckGo, Tavily, Wikipedia, Arxiv, webpage fetch, YouTube transcripts.""" import re from datetime import datetime import requests import trafilatura import wikipedia from bs4 import BeautifulSoup from langchain_community.tools import DuckDuckGoSearchRun from langchain_community.tools.tavily_search import TavilySearchResults from langchain_community.document_loaders import WikipediaLoader, ArxivLoader from langchain_core.tools import tool from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable from gaia.utils import extract_youtube_id, load_config, download_task_file # Wikipedia blocks/throttles requests with the default `wikipedia` package UA, which # causes the API to return a non-JSON body and `requests.json()` to raise a # `JSONDecodeError: Expecting value: line 1 column 1 (char 0)`. Setting an identifying # UA per Wikipedia's policy fixes this for both `wiki_search` and `wikipedia_page_fetch`. _USER_AGENT = "gaia-agent/0.1 (https://huggingface.co/spaces/KPatelis/Agents_Course_Assignment)" wikipedia.set_user_agent(_USER_AGENT) _ddg_search = None _tavily_search = None def _get_ddg(): global _ddg_search if _ddg_search is None: _ddg_search = DuckDuckGoSearchRun() return _ddg_search def _get_tavily(): global _tavily_search if _tavily_search is None: _tavily_search = TavilySearchResults(max_results=3) return _tavily_search @tool def duck_web_search(query: str) -> str: """Use DuckDuckGo to search the web. Args: query: The search query. """ try: search = _get_ddg().invoke(input=query) return {"duckduckgo_web_search": search} except Exception as e: return f"[duck_web_search] failed: {type(e).__name__}: {e}" @tool def wiki_search(query: str) -> str: """Search Wikipedia for a query and return up to 3 distinct articles. Args: query: The search query.""" try: documents = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=20000).load() # Deduplicate by article title seen_titles = set() unique_documents = [] for d in documents: title = d.metadata.get("title", "") if title and title not in seen_titles: seen_titles.add(title) unique_documents.append(d) processed_documents = "\n\n---\n\n".join( [ f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}' for document in unique_documents ]) return {"wiki_results": processed_documents} except Exception as e: return f"[wiki_search] failed: {type(e).__name__}: {e}" _NAVBOX_MIN_CHARS = 200 # ignore navboxes with less than this many chars of text _NAVBOX_MAX_CHARS = 15000 # cap navbox text to avoid blowing up context on huge pages def _extract_navbox_text(html: str) -> str: """Pull a flat-text dump of every ``.navbox`` div on a Wikipedia page. Navboxes are the cross-link tables Wikipedia puts at the bottom of articles. We collect every navbox on the page, flatten whitespace, and join with blank lines. Returns ``""`` if no meaningful navbox content is present. """ soup = BeautifulSoup(html, "html.parser") parts = [] for nb in soup.find_all("div", class_="navbox"): text = re.sub(r"\s+", " ", nb.get_text(" ", strip=True)) if text: parts.append(text) joined = "\n\n".join(parts).strip() if len(joined) < _NAVBOX_MIN_CHARS: return "" return joined[:_NAVBOX_MAX_CHARS] @tool def wikipedia_page_fetch(title: str) -> str: """Fetch a Wikipedia page by title and return its body + navbox text. Args: title: The exact Wikipedia page title, optionally with a namespace prefix (e.g. ``"Wikipedia:Featured article candidates/Featured log/November 2016"``). Returns: On success: a multi-line string starting with ``"Wikipedia: "``, a ``URL:`` line, a blank line, the extracted body, and (if present) a ``--- Related (navbox) ---`` block. On failure: a string starting with ``[wikipedia_page_fetch] …`` describing the failure (page not found, disambiguation page, search fallback exhausted). """ def _render(page, resolved_from=None): suffix = f" (resolved from '{resolved_from}')" if resolved_from else "" header = f"Wikipedia: {page.title}{suffix}\nURL: {page.url}" # Body: prefer trafilatura (preserves lists and tables — critical for # counting-style questions). Fall back to page.content on failure. body = None downloaded = trafilatura.fetch_url(page.url) if downloaded is not None: body = trafilatura.extract(downloaded, include_tables=True, include_links=False) if not body: body = page.content # Navbox: append the cross-link tables that body extractors strip. navbox_section = "" try: navbox_text = _extract_navbox_text(page.html()) if navbox_text: navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}" except Exception: pass return f"{header}\n\n{body}{navbox_section}" try: page = wikipedia.page(title, auto_suggest=False) return _render(page) except wikipedia.exceptions.DisambiguationError as e: return f"[wikipedia_page_fetch] '{title}' is a disambiguation page. Options: {e.options[:10]}" except wikipedia.exceptions.PageError: # Recover from case-sensitivity / slight title mismatches by searching once and # fetching the top hit. try: hits = wikipedia.search(title, results=1) except Exception as e: return f"[wikipedia_page_fetch] page not found: '{title}'; search fallback failed: {e}" if not hits: return f"[wikipedia_page_fetch] page not found: '{title}' and no search hits." resolved = hits[0] if resolved == title: return f"[wikipedia_page_fetch] page not found: '{title}'. Try wiki_search to find the correct title." try: page = wikipedia.page(resolved, auto_suggest=False) except Exception as e: return f"[wikipedia_page_fetch] resolved title '{resolved}' but fetch failed: {e}" return _render(page, resolved_from=title) except Exception as e: return f"[wikipedia_page_fetch] failed: {e}" _WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php" def _resolve_revision_at(title: str, iso_timestamp: str) -> tuple[int | None, str | None, str | None]: """Look up the Wikipedia revision id active for ``title`` at ``iso_timestamp``. """ params = { "action": "query", "format": "json", "prop": "revisions", "titles": title, "rvprop": "ids|timestamp", "rvlimit": 1, "rvdir": "older", "rvstart": iso_timestamp, } try: r = requests.get( _WIKI_API_ENDPOINT, params=params, headers={"User-Agent": _USER_AGENT}, timeout=30, ) r.raise_for_status() data = r.json() except Exception as e: return None, None, f"API request failed: {type(e).__name__}: {e}" pages = data.get("query", {}).get("pages", {}) if not pages: return None, None, "API returned no pages" page = next(iter(pages.values())) if "missing" in page: return None, None, f"page not found: '{title}'" revisions = page.get("revisions") or [] if not revisions: return None, None, f"no revisions for '{title}' on or before {iso_timestamp}" return revisions[0]["revid"], page.get("title", title), None @tool def wikipedia_page_as_of(title: str, date: str) -> str: """Fetch a Wikipedia page as it existed at end of day UTC on a specific date. Args: title: Wikipedia page title (e.g. ``"Taishō Tamai"``, ``"Hokkaido Nippon-Ham Fighters"``, ``"1928 Summer Olympics"``). date: Target date in ISO ``"YYYY-MM-DD"`` format (e.g. ``"2023-07-31"``). The page is fetched as it appeared at 23:59:59 UTC on that day. Returns: On success: a multi-line string ``"Wikipedia: (as of <date>, revid <id>) / URL: <oldid URL> / <body> / --- Related (navbox) ---"``. On failure: a string starting with ``[wikipedia_page_as_of] …`` describing the failure (invalid date, page not found, revision lookup failure, rendered-HTML fetch failure). """ try: dt = datetime.strptime(date, "%Y-%m-%d") except ValueError: return f"[wikipedia_page_as_of] invalid date '{date}'; expected YYYY-MM-DD." iso_ts = dt.strftime("%Y-%m-%dT23:59:59Z") revid, resolved_title, err = _resolve_revision_at(title, iso_ts) if err and err.startswith("page not found"): # Case-/spelling-tolerant fallback: search and retry the top hit. try: hits = wikipedia.search(title, results=1) except Exception as e: return f"[wikipedia_page_as_of] page not found and search failed: {e}" if not hits or hits[0] == title: return f"[wikipedia_page_as_of] page not found: '{title}'" revid, resolved_title, err = _resolve_revision_at(hits[0], iso_ts) if err: return f"[wikipedia_page_as_of] {err}" url = f"https://en.wikipedia.org/w/index.php?oldid={revid}" try: resp = requests.get(url, headers={"User-Agent": _USER_AGENT}, timeout=30) resp.raise_for_status() html = resp.text except Exception as e: return f"[wikipedia_page_as_of] could not fetch revision URL {url}: {type(e).__name__}: {e}" body = trafilatura.extract(html, include_tables=True, include_links=False) if not body: return f"[wikipedia_page_as_of] no body extracted from {url}" navbox_section = "" try: navbox_text = _extract_navbox_text(html) if navbox_text: navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}" except Exception: pass header = f"Wikipedia: {resolved_title} (as of {date}, revid {revid})\nURL: {url}" return f"{header}\n\n{body}{navbox_section}" @tool def arxiv_search(query: str) -> str: """Search Arxiv for a query and return maximum 3 result. Args: query: The search query.""" try: documents = ArxivLoader(query=query, load_max_docs=3).load() processed_documents = "\n\n---\n\n".join( [ f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}' for document in documents ]) return {"arxiv_results": processed_documents} except Exception as e: return f"[arxiv_search] failed: {type(e).__name__}: {e}" @tool def tavily_web_search(query: str) -> str: """Search the web using Tavily for a query and return maximum 3 results. Args: query: The search query.""" try: search_documents = _get_tavily().invoke(input=query) web_results = "\n\n---\n\n".join( [ f'Document title: {document["title"]}. Contents: {document["content"]}. Relevance Score: {document["score"]}' for document in search_documents ]) return {"web_results": web_results} except Exception as e: return f"[tavily_web_search] failed: {type(e).__name__}: {e}" @tool def fetch_webpage(url: str) -> str: """ Fetch and extract the main text content from a webpage. Use this when a search result points to a specific URL you need to read in full. Args: url: The full URL of the page to fetch. Returns: The extracted text content of the page. """ try: downloaded = trafilatura.fetch_url(url) if downloaded is None: return f"[fetch_webpage] could not fetch {url}" text = trafilatura.extract(downloaded, include_tables=True, include_links=False) if text is None: return f"[fetch_webpage] could not extract content from {url}" return f"Page content from {url}:\n\n{text}" except Exception as e: return f"[fetch_webpage] failed: {e}" @tool def retry_file_download(task_id: str, file_name: str) -> str: """Retry downloading the task file from the GAIA scoring API. Args: task_id: The task ID for the current question. file_name: The original file name from the question metadata. Returns: Local filesystem path to the downloaded file, or an error description. """ cfg = load_config() local_path, err = download_task_file( task_id=task_id, file_name=file_name, base_url=cfg["api"]["base_url"], files_dir=cfg["api"]["files_dir"], ) if local_path: return local_path return f"[retry_file_download] {err}" @tool def youtube_transcript(url: str) -> str: """Fetch the transcript (captions) of a YouTube video as plain text. Args: url: The full YouTube URL (watch, youtu.be, embed, shorts) or a bare 11-char video ID. Returns: The concatenated transcript text, or an error string starting with `[youtube_transcript]`. """ video_id = extract_youtube_id(url) if not video_id: return f"[youtube_transcript] could not parse video ID from: {url}" try: ytt_api = YouTubeTranscriptApi() try: fetched = ytt_api.fetch(video_id, languages=['en']) except NoTranscriptFound: transcript_list = ytt_api.list(video_id) transcript = next(iter(transcript_list)) fetched = transcript.fetch() text = " ".join(snippet.text for snippet in fetched) return f"YouTube transcript for {url}:\n\n{text}" except TranscriptsDisabled: return f"[youtube_transcript] transcripts are disabled for {url}" except VideoUnavailable: return f"[youtube_transcript] video unavailable: {url}" except NoTranscriptFound: return f"[youtube_transcript] no transcript found for {url}" except Exception as e: return f"[youtube_transcript] failed: {e}"