# gaia_unit4_space/tools/web_tools.py
# Commit f1ad045 (hawkdev): Fix YouTube transcript API; add deterministic GAIA shortcuts
import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
# duckduckgo_search is optional: when it is absent, web_search degrades to
# returning an error string instead of crashing the module at import time.
try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None  # type: ignore
# Browser-like User-Agent so simple bot filters don't reject our requests;
# the URL identifies the agent per polite-crawler convention.
DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)"
)
# Hard cap (~1.5 MB) on bytes downloaded per fetch_url call.
MAX_FETCH_BYTES = 1_500_000
def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search."""
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    try:
        # Materialize the results while the DDGS session is still open.
        with DDGS() as session:
            hits = list(session.text(query, max_results=max_results))
    except Exception as e:
        return f"Search error: {e}"
    if not hits:
        return "No results."
    formatted = [
        f"{rank}. {hit.get('title') or ''}\n {(hit.get('body') or '')[:400]}\n URL: {hit.get('href') or ''}"
        for rank, hit in enumerate(hits, start=1)
    ]
    return "\n\n".join(formatted)
def _visible_text(html: str) -> str:
    """Extract human-visible text from an HTML document.

    Removes <script>/<style>/<noscript> subtrees, decodes HTML entities,
    and collapses runs of 3+ newlines into a single blank line.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # bs4 raises FeatureNotFound (a ValueError) when lxml isn't
        # installed; fall back to the always-available stdlib parser so the
        # tool keeps working without the optional speedup.
        soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    # Collapse excessive vertical whitespace left behind by stripped tags.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Args:
        url: http(s) URL to retrieve.
        max_chars: maximum number of characters of extracted text returned.

    Returns:
        The page's visible text (HTML stripped via ``_visible_text``),
        truncated to ``max_chars``; or an ``"Error: ..."`` /
        ``"Fetch error: ..."`` string on failure.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # The context manager guarantees the streamed connection is closed
        # even when we return early or break out of iter_content at the size
        # cap — the previous version leaked the socket in those paths.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()
            # Reject PDFs before downloading the body (up to 1.5 MB wasted
            # otherwise) — we cannot parse the binary here anyway.
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                return (
                    "Error: PDF binary not parsed here. "
                    "Search for an HTML abstract page or use web_search instead."
                )
            chunks: list[bytes] = []
            total = 0
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break
        raw = b"".join(chunks)
        text = raw.decode("utf-8", errors="replace")
        # Only run the HTML extractor when the payload looks like HTML.
        plain = _visible_text(text) if "<html" in text.lower() else text
        plain = plain[:max_chars]
        return plain if plain.strip() else "(empty body after parse)"
    except Exception as e:
        return f"Fetch error: {e}"
def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Accepts watch, share (youtu.be), embed, and shorts URL forms. Returns an
    ``"Error: ..."`` / ``"No transcript available: ..."`` string on failure.
    """
    # Validate the URL before the optional-dependency check so malformed
    # input is reported as a parse error, not a missing package. The regex
    # also accepts /embed/ and /shorts/ paths (backward-compatible widening).
    m = re.search(
        r"(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
        r"([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."
    try:
        # youtube-transcript-api >= 1.0 API: instantiate, then fetch(); the
        # fetched transcript iterates snippet objects exposing `.text`.
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [snippet.text for snippet in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    # Cap output so a very long transcript cannot blow up the agent context.
    return "\n".join(lines)[:50_000]