# gaia_unit4_space/tools/web_tools.py
# Commit f1ad045 (hawkdev): Fix YouTube transcript API; add deterministic GAIA shortcuts
import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
# duckduckgo_search is optional: when it is absent, web_search degrades to
# returning an error string instead of crashing the module at import time.
try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None  # type: ignore
# Browser-like User-Agent so simple bot filters don't reject our requests;
# the URL identifies the agent per polite-crawler convention.
DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)"
)
# Hard cap (~1.5 MB) on bytes downloaded per fetch_url call.
MAX_FETCH_BYTES = 1_500_000
def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search."""
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    try:
        # Materialize the results while the DDGS session is still open.
        with DDGS() as session:
            hits = list(session.text(query, max_results=max_results))
    except Exception as e:
        return f"Search error: {e}"
    if not hits:
        return "No results."
    formatted = [
        f"{rank}. {hit.get('title') or ''}\n {(hit.get('body') or '')[:400]}\n URL: {hit.get('href') or ''}"
        for rank, hit in enumerate(hits, start=1)
    ]
    return "\n\n".join(formatted)
def _visible_text(html: str) -> str:
    """Extract human-visible text from an HTML document.

    Removes <script>/<style>/<noscript> subtrees, decodes HTML entities,
    and collapses runs of 3+ newlines into a single blank line.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # bs4 raises FeatureNotFound (a ValueError) when lxml isn't
        # installed; fall back to the always-available stdlib parser so the
        # tool keeps working without the optional speedup.
        soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    # Collapse excessive vertical whitespace left behind by stripped tags.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Args:
        url: http(s) URL to retrieve.
        max_chars: maximum number of characters of extracted text returned.

    Returns:
        The page's visible text (HTML stripped via ``_visible_text``),
        truncated to ``max_chars``; or an ``"Error: ..."`` /
        ``"Fetch error: ..."`` string on failure.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # The context manager guarantees the streamed connection is closed
        # even when we return early or break out of iter_content at the size
        # cap — the previous version leaked the socket in those paths.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()
            # Reject PDFs before downloading the body (up to 1.5 MB wasted
            # otherwise) — we cannot parse the binary here anyway.
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                return (
                    "Error: PDF binary not parsed here. "
                    "Search for an HTML abstract page or use web_search instead."
                )
            chunks: list[bytes] = []
            total = 0
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break
        raw = b"".join(chunks)
        text = raw.decode("utf-8", errors="replace")
        # Only run the HTML extractor when the payload looks like HTML.
        plain = _visible_text(text) if "<html" in text.lower() else text
        plain = plain[:max_chars]
        return plain if plain.strip() else "(empty body after parse)"
    except Exception as e:
        return f"Fetch error: {e}"
def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Accepts watch, share (youtu.be), embed, and shorts URL forms. Returns an
    ``"Error: ..."`` / ``"No transcript available: ..."`` string on failure.
    """
    # Validate the URL before the optional-dependency check so malformed
    # input is reported as a parse error, not a missing package. The regex
    # also accepts /embed/ and /shorts/ paths (backward-compatible widening).
    m = re.search(
        r"(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
        r"([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."
    try:
        # youtube-transcript-api >= 1.0 API: instantiate, then fetch(); the
        # fetched transcript iterates snippet objects exposing `.text`.
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [snippet.text for snippet in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    # Cap output so a very long transcript cannot blow up the agent context.
    return "\n".join(lines)[:50_000]