Agents_Course_Assignment

Sleeping

App Files Files Community

Agents_Course_Assignment / gaia /tools /web.py

KPatelis

Upload 26 files

dfd1417 verified about 2 months ago

Raw

History Blame Contribute Delete

14.6 kB

	"""Web search and fetching tools: DuckDuckGo, Tavily, Wikipedia, Arxiv, webpage fetch, YouTube transcripts."""
	import re
	from datetime import datetime

	import requests
	import trafilatura
	import wikipedia
	from bs4 import BeautifulSoup
	from langchain_community.tools import DuckDuckGoSearchRun
	from langchain_community.tools.tavily_search import TavilySearchResults
	from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
	from langchain_core.tools import tool
	from youtube_transcript_api import YouTubeTranscriptApi
	from youtube_transcript_api._errors import TranscriptsDisabled, NoTranscriptFound, VideoUnavailable

	from gaia.utils import extract_youtube_id, load_config, download_task_file

	# Wikipedia blocks/throttles requests with the default `wikipedia` package UA, which
	# causes the API to return a non-JSON body and `requests.json()` to raise a
	# `JSONDecodeError: Expecting value: line 1 column 1 (char 0)`. Setting an identifying
	# UA per Wikipedia's policy fixes this for both `wiki_search` and `wikipedia_page_fetch`.
	# The same UA string is also sent explicitly on the direct `requests.get` calls made
	# elsewhere in this module.
	_USER_AGENT = "gaia-agent/0.1 (https://huggingface.co/spaces/KPatelis/Agents_Course_Assignment)"
	wikipedia.set_user_agent(_USER_AGENT)


	# Lazily-created search tool instances. They stay `None` until `_get_ddg()` /
	# `_get_tavily()` is first called, so importing this module does not construct them.
	_ddg_search = None
	_tavily_search = None


	def _get_ddg():
	    """Return the shared ``DuckDuckGoSearchRun`` instance, creating it on first use."""
	    global _ddg_search
	    if _ddg_search is not None:
	        return _ddg_search
	    # First call: build the tool once and cache it in the module-level slot.
	    _ddg_search = DuckDuckGoSearchRun()
	    return _ddg_search


	def _get_tavily():
	    """Return the shared ``TavilySearchResults`` instance (max 3 results), creating it on first use."""
	    global _tavily_search
	    if _tavily_search is not None:
	        return _tavily_search
	    # First call: build the tool once and cache it in the module-level slot.
	    _tavily_search = TavilySearchResults(max_results=3)
	    return _tavily_search


	@tool
	def duck_web_search(query: str) -> dict[str, str] | str:
	    """Use DuckDuckGo to search the web.

	    Args:
	        query: The search query.
	    """
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description, so its wording is intentionally left as-is.
	    #
	    # Returns ``{"duckduckgo_web_search": <search output>}`` on success and a
	    # ``"[duck_web_search] failed: ..."`` string on any exception; the return
	    # annotation now reflects both shapes (it previously claimed ``str`` only).
	    try:
	        search = _get_ddg().invoke(input=query)
	    except Exception as e:
	        # Report the failure as text so the calling agent can react instead of crashing.
	        return f"[duck_web_search] failed: {type(e).__name__}: {e}"
	    return {"duckduckgo_web_search": search}


	@tool
	def wiki_search(query: str) -> dict[str, str] | str:
	    """Search Wikipedia for a query and return up to 3 distinct articles.

	    Args:
	        query: The search query."""
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description, so its wording is intentionally left as-is.
	    #
	    # Returns ``{"wiki_results": <joined article text>}`` on success and a
	    # ``"[wiki_search] failed: ..."`` string on any exception; the return annotation
	    # now reflects both shapes (it previously claimed ``str`` only).
	    try:
	        documents = WikipediaLoader(query=query, load_max_docs=3, doc_content_chars_max=20000).load()
	        # Deduplicate by article title, keeping the first document seen for each title.
	        # Documents with an empty/missing title are skipped.
	        first_by_title = {}
	        for doc in documents:
	            title = doc.metadata.get("title", "")
	            if title:
	                first_by_title.setdefault(title, doc)
	        processed_documents = "\n\n---\n\n".join(
	            f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
	            for document in first_by_title.values()
	        )
	        return {"wiki_results": processed_documents}
	    except Exception as e:
	        # Report the failure as text so the calling agent can react instead of crashing.
	        return f"[wiki_search] failed: {type(e).__name__}: {e}"


	# Length thresholds used by `_extract_navbox_text`; both are applied to the combined
	# text of all navboxes on a page, not to each navbox individually.
	_NAVBOX_MIN_CHARS = 200 # ignore navboxes with less than this many chars of text
	_NAVBOX_MAX_CHARS = 15000 # cap navbox text to avoid blowing up context on huge pages


	def _extract_navbox_text(html: str) -> str:
	    """Return the text of all ``div.navbox`` elements in ``html`` as one flat string.

	    Navboxes are the cross-link tables Wikipedia places at the bottom of articles.
	    Each navbox is reduced to single-spaced text; non-empty navboxes are joined with
	    blank lines.

	    Args:
	        html: Rendered HTML of a Wikipedia page.

	    Returns:
	        The combined navbox text truncated to ``_NAVBOX_MAX_CHARS`` characters, or
	        ``""`` when the combined text is shorter than ``_NAVBOX_MIN_CHARS``.
	    """
	    soup = BeautifulSoup(html, "html.parser")
	    # Collapse every run of whitespace inside a navbox into a single space.
	    flattened = (
	        re.sub(r"\s+", " ", box.get_text(" ", strip=True))
	        for box in soup.find_all("div", class_="navbox")
	    )
	    combined = "\n\n".join(chunk for chunk in flattened if chunk).strip()
	    if len(combined) >= _NAVBOX_MIN_CHARS:
	        return combined[:_NAVBOX_MAX_CHARS]
	    # Too little text to be a meaningful navbox dump.
	    return ""


	@tool
	def wikipedia_page_fetch(title: str) -> str:
	    """Fetch a Wikipedia page by title and return its body + navbox text.
	    Args:
	        title: The exact Wikipedia page title, optionally with a namespace prefix
	            (e.g. ``"Wikipedia:Featured article candidates/Featured log/November 2016"``).

	    Returns:
	        On success: a multi-line string starting with ``"Wikipedia: <resolved title>"``,
	        a ``URL:`` line, a blank line, the extracted body, and (if present) a
	        ``--- Related (navbox) ---`` block.
	        On failure: a string starting with ``[wikipedia_page_fetch] …`` describing
	        the failure (page not found, disambiguation page, search fallback exhausted).
	    """
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description, so its wording is intentionally left as-is.

	    def _render(page, resolved_from=None):
	        """Build the success string (header, body, optional navbox block) for ``page``."""
	        suffix = f" (resolved from '{resolved_from}')" if resolved_from else ""
	        header = f"Wikipedia: {page.title}{suffix}\nURL: {page.url}"

	        # Body: prefer trafilatura (preserves lists and tables — critical for
	        # counting-style questions). Fall back to page.content on failure.
	        body = None
	        downloaded = trafilatura.fetch_url(page.url)
	        if downloaded is not None:
	            body = trafilatura.extract(downloaded, include_tables=True, include_links=False)
	        if not body:
	            body = page.content

	        # Navbox: append the cross-link tables that body extractors strip.
	        # Deliberately best-effort: a navbox failure must not lose the page body.
	        navbox_section = ""
	        try:
	            navbox_text = _extract_navbox_text(page.html())
	            if navbox_text:
	                navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}"
	        except Exception:
	            pass

	        return f"{header}\n\n{body}{navbox_section}"

	    try:
	        page = wikipedia.page(title, auto_suggest=False)
	        return _render(page)
	    except wikipedia.exceptions.DisambiguationError as e:
	        return f"[wikipedia_page_fetch] '{title}' is a disambiguation page. Options: {e.options[:10]}"
	    except wikipedia.exceptions.PageError:
	        # Recover from case-sensitivity / slight title mismatches by searching once and
	        # fetching the top hit.
	        try:
	            hits = wikipedia.search(title, results=1)
	        except Exception as e:
	            return f"[wikipedia_page_fetch] page not found: '{title}'; search fallback failed: {e}"
	        if not hits:
	            return f"[wikipedia_page_fetch] page not found: '{title}' and no search hits."
	        resolved = hits[0]
	        if resolved == title:
	            return f"[wikipedia_page_fetch] page not found: '{title}'. Try wiki_search to find the correct title."
	        # Rendering happens inside this handler, where the outer ``except Exception``
	        # no longer applies; keep it in the guarded region so a download/extraction
	        # error is reported as a failure string instead of escaping the tool.
	        try:
	            page = wikipedia.page(resolved, auto_suggest=False)
	            return _render(page, resolved_from=title)
	        except Exception as e:
	            return f"[wikipedia_page_fetch] resolved title '{resolved}' but fetch failed: {e}"
	    except Exception as e:
	        return f"[wikipedia_page_fetch] failed: {e}"


	# English Wikipedia `api.php` endpoint; queried by `_resolve_revision_at`.
	_WIKI_API_ENDPOINT = "https://en.wikipedia.org/w/api.php"


	def _resolve_revision_at(title: str, iso_timestamp: str) -> tuple[int | None, str | None, str | None]:
	    """Look up the Wikipedia revision id active for ``title`` at ``iso_timestamp``.

	    Requests a single revision from the API, enumerating from ``rvstart`` towards
	    older revisions (``rvdir=older``, ``rvlimit=1``), i.e. the newest revision at or
	    before the timestamp.

	    Args:
	        title: Page title to look up.
	        iso_timestamp: Timestamp passed as ``rvstart`` (e.g. ``"2023-07-31T23:59:59Z"``).

	    Returns:
	        ``(revid, resolved_title, None)`` on success, or ``(None, None, message)`` on
	        failure. The message starts with ``"page not found"`` when the API flags the
	        page as missing; ``wikipedia_page_as_of`` keys its search fallback on that prefix.
	    """
	    params = {
	        "action": "query",
	        "format": "json",
	        "prop": "revisions",
	        "titles": title,
	        # Plain pipe separator: an escaped ``\|`` would send a literal backslash.
	        "rvprop": "ids|timestamp",
	        "rvlimit": 1,
	        "rvdir": "older",
	        "rvstart": iso_timestamp,
	    }
	    try:
	        r = requests.get(
	            _WIKI_API_ENDPOINT,
	            params=params,
	            headers={"User-Agent": _USER_AGENT},
	            timeout=30,
	        )
	        r.raise_for_status()
	        data = r.json()
	    except Exception as e:
	        return None, None, f"API request failed: {type(e).__name__}: {e}"

	    # The API can answer HTTP 200 with an ``error`` object (e.g. a bad parameter);
	    # surface that instead of the less helpful "API returned no pages".
	    api_error = data.get("error")
	    if api_error:
	        info = api_error.get("info", api_error) if isinstance(api_error, dict) else api_error
	        return None, None, f"API error: {info}"

	    pages = data.get("query", {}).get("pages", {})
	    if not pages:
	        return None, None, "API returned no pages"
	    page = next(iter(pages.values()))
	    if "missing" in page:
	        return None, None, f"page not found: '{title}'"
	    if "invalid" in page:
	        # Malformed titles are flagged ``invalid`` rather than ``missing``.
	        reason = page.get("invalidreason", "invalid title")
	        return None, None, f"invalid title '{title}': {reason}"
	    revisions = page.get("revisions") or []
	    if not revisions:
	        return None, None, f"no revisions for '{title}' on or before {iso_timestamp}"
	    return revisions[0]["revid"], page.get("title", title), None


	@tool
	def wikipedia_page_as_of(title: str, date: str) -> str:
	    """Fetch a Wikipedia page as it existed at end of day UTC on a specific date.
	    Args:
	        title: Wikipedia page title (e.g. ``"Taishō Tamai"``,
	            ``"Hokkaido Nippon-Ham Fighters"``, ``"1928 Summer Olympics"``).
	        date: Target date in ISO ``"YYYY-MM-DD"`` format (e.g. ``"2023-07-31"``).
	            The page is fetched as it appeared at 23:59:59 UTC on that day.

	    Returns:
	        On success: a multi-line string ``"Wikipedia: <title> (as of <date>, revid <id>) / URL: <oldid URL> / <body> / --- Related (navbox) ---"``.
	        On failure: a string starting with ``[wikipedia_page_as_of] …`` describing
	        the failure (invalid date, page not found, revision lookup failure,
	        rendered-HTML fetch failure).
	    """
	    # Validate the date first; everything downstream needs the end-of-day timestamp.
	    try:
	        day = datetime.strptime(date, "%Y-%m-%d")
	    except ValueError:
	        return f"[wikipedia_page_as_of] invalid date '{date}'; expected YYYY-MM-DD."
	    end_of_day = f"{day:%Y-%m-%d}T23:59:59Z"

	    revid, resolved_title, err = _resolve_revision_at(title, end_of_day)
	    page_missing = bool(err) and err.startswith("page not found")
	    if page_missing:
	        # Case-/spelling-tolerant fallback: search and retry the top hit.
	        try:
	            candidates = wikipedia.search(title, results=1)
	        except Exception as e:
	            return f"[wikipedia_page_as_of] page not found and search failed: {e}"
	        if not candidates or candidates[0] == title:
	            return f"[wikipedia_page_as_of] page not found: '{title}'"
	        revid, resolved_title, err = _resolve_revision_at(candidates[0], end_of_day)
	    if err:
	        return f"[wikipedia_page_as_of] {err}"

	    # Fetch the rendered HTML of that exact revision through its permanent oldid URL.
	    url = f"https://en.wikipedia.org/w/index.php?oldid={revid}"
	    request_headers = {"User-Agent": _USER_AGENT}
	    try:
	        resp = requests.get(url, headers=request_headers, timeout=30)
	        resp.raise_for_status()
	        html = resp.text
	    except Exception as e:
	        return f"[wikipedia_page_as_of] could not fetch revision URL {url}: {type(e).__name__}: {e}"

	    body = trafilatura.extract(html, include_tables=True, include_links=False)
	    if not body:
	        return f"[wikipedia_page_as_of] no body extracted from {url}"

	    # Navbox extraction is best-effort: on any error the page is returned without it.
	    try:
	        navbox_text = _extract_navbox_text(html)
	    except Exception:
	        navbox_text = ""
	    navbox_section = f"\n\n--- Related (navbox) ---\n{navbox_text}" if navbox_text else ""

	    header = f"Wikipedia: {resolved_title} (as of {date}, revid {revid})\nURL: {url}"
	    return f"{header}\n\n{body}{navbox_section}"


	@tool
	def arxiv_search(query: str) -> dict[str, str] | str:
	    """Search Arxiv for a query and return maximum 3 results.

	    Args:
	        query: The search query."""
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description; only its grammar ("3 result" -> "3 results") was corrected.
	    #
	    # Returns ``{"arxiv_results": <joined paper text>}`` on success and an
	    # ``"[arxiv_search] failed: ..."`` string on any exception; the return annotation
	    # now reflects both shapes (it previously claimed ``str`` only).
	    try:
	        documents = ArxivLoader(query=query, load_max_docs=3).load()
	        processed_documents = "\n\n---\n\n".join(
	            f'Document title: {document.metadata.get("title", "")}. Summary: {document.metadata.get("summary", "")}. Documents details: {document.page_content}'
	            for document in documents
	        )
	        return {"arxiv_results": processed_documents}
	    except Exception as e:
	        # Report the failure as text so the calling agent can react instead of crashing.
	        return f"[arxiv_search] failed: {type(e).__name__}: {e}"


	@tool
	def tavily_web_search(query: str) -> dict[str, str] | str:
	    """Search the web using Tavily for a query and return maximum 3 results.

	    Args:
	        query: The search query."""
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description, so its wording is intentionally left as-is.
	    #
	    # Returns ``{"web_results": <joined result text>}`` on success and a
	    # ``"[tavily_web_search] failed: ..."`` string on failure; the return annotation
	    # now reflects both shapes (it previously claimed ``str`` only).
	    try:
	        search_documents = _get_tavily().invoke(input=query)
	        if isinstance(search_documents, str):
	            # The Tavily tool may hand back an error description as a plain string
	            # instead of raising. Iterating it would index characters and mask the
	            # real problem behind a TypeError, so report it directly.
	            return f"[tavily_web_search] failed: {search_documents}"

	        formatted = []
	        for document in search_documents:
	            if not isinstance(document, dict):
	                # Unexpected item shape: keep whatever text it carries.
	                formatted.append(str(document))
	                continue
	            # ``.get`` so one result lacking a field does not discard all results.
	            # The URL is included so a follow-up ``fetch_webpage`` call has a target.
	            formatted.append(
	                f'Document title: {document.get("title", "")}. '
	                f'URL: {document.get("url", "")}. '
	                f'Contents: {document.get("content", "")}. '
	                f'Relevance Score: {document.get("score", "")}'
	            )
	        web_results = "\n\n---\n\n".join(formatted)
	        return {"web_results": web_results}
	    except Exception as e:
	        # Report the failure as text so the calling agent can react instead of crashing.
	        return f"[tavily_web_search] failed: {type(e).__name__}: {e}"


	@tool
	def fetch_webpage(url: str) -> str:
	    """
	    Fetch and extract the main text content from a webpage.
	    Use this when a search result points to a specific URL you need to read in full.

	    Args:
	        url: The full URL of the page to fetch.

	    Returns:
	        The extracted text content of the page.
	    """
	    # Every failure mode (download, extraction, unexpected exception) is reported as
	    # a "[fetch_webpage] ..." string rather than raised.
	    try:
	        page_source = trafilatura.fetch_url(url)
	        if page_source is not None:
	            extracted = trafilatura.extract(page_source, include_tables=True, include_links=False)
	            if extracted is not None:
	                return f"Page content from {url}:\n\n{extracted}"
	            return f"[fetch_webpage] could not extract content from {url}"
	        return f"[fetch_webpage] could not fetch {url}"
	    except Exception as e:
	        return f"[fetch_webpage] failed: {e}"


	@tool
	def retry_file_download(task_id: str, file_name: str) -> str:
	    """Retry downloading the task file from the GAIA scoring API.
	    Args:
	        task_id: The task ID for the current question.
	        file_name: The original file name from the question metadata.

	    Returns:
	        Local filesystem path to the downloaded file, or an error description.
	    """
	    # Base URL and target directory both come from the ``api`` section of the config.
	    api_settings = load_config()["api"]
	    downloaded_path, failure = download_task_file(
	        task_id=task_id,
	        file_name=file_name,
	        base_url=api_settings["base_url"],
	        files_dir=api_settings["files_dir"],
	    )
	    if not downloaded_path:
	        return f"[retry_file_download] {failure}"
	    return downloaded_path


	@tool
	def youtube_transcript(url: str) -> str:
	    """Fetch the transcript (captions) of a YouTube video as plain text.
	    Args:
	        url: The full YouTube URL (watch, youtu.be, embed, shorts) or a bare 11-char video ID.

	    Returns:
	        The concatenated transcript text, or an error string starting with `[youtube_transcript]`.
	    """
	    # NOTE: with LangChain's ``@tool`` the docstring above doubles as the model-facing
	    # tool description, so its wording is intentionally left as-is.

	    video_id = extract_youtube_id(url)
	    if not video_id:
	        return f"[youtube_transcript] could not parse video ID from: {url}"

	    try:
	        ytt_api = YouTubeTranscriptApi()
	        try:
	            fetched = ytt_api.fetch(video_id, languages=['en'])
	        except NoTranscriptFound:
	            # No English track: fall back to the first transcript listed, whatever
	            # its language. A default of ``None`` avoids a bare StopIteration, which
	            # would otherwise surface as "failed: " with an empty message.
	            transcript = next(iter(ytt_api.list(video_id)), None)
	            if transcript is None:
	                return f"[youtube_transcript] no transcript found for {url}"
	            fetched = transcript.fetch()

	        text = " ".join(snippet.text for snippet in fetched)
	        return f"YouTube transcript for {url}:\n\n{text}"
	    except TranscriptsDisabled:
	        return f"[youtube_transcript] transcripts are disabled for {url}"
	    except VideoUnavailable:
	        return f"[youtube_transcript] video unavailable: {url}"
	    except NoTranscriptFound:
	        return f"[youtube_transcript] no transcript found for {url}"
	    except Exception as e:
	        return f"[youtube_transcript] failed: {e}"