import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None  # type: ignore

# Identify ourselves; some sites reject requests with no User-Agent.
DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)"
)
# Hard cap on downloaded bytes so a single huge page cannot exhaust memory.
MAX_FETCH_BYTES = 1_500_000


def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search.

    Args:
        query: Free-text search query.
        max_results: Maximum number of results to include.

    Returns:
        A numbered, newline-separated list of title/snippet/URL entries,
        or a human-readable error string (this function never raises).
    """
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    lines: list[str] = []
    try:
        with DDGS() as ddgs:
            for i, r in enumerate(ddgs.text(query, max_results=max_results)):
                # Result keys may be missing or None; normalize to "".
                title = r.get("title") or ""
                body = r.get("body") or ""
                href = r.get("href") or ""
                lines.append(f"{i + 1}. {title}\n {body[:400]}\n URL: {href}")
    except Exception as e:
        # Search is best-effort: report the failure as text for the agent.
        return f"Search error: {e}"
    if not lines:
        return "No results."
    return "\n\n".join(lines)


def _visible_text(html: str) -> str:
    """Extract human-visible text from an HTML document.

    Strips script/style/noscript content, unescapes HTML entities, and
    collapses runs of 3+ blank lines down to a single blank line.
    """
    soup = BeautifulSoup(html, "lxml")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Streams the response and stops after MAX_FETCH_BYTES; only http(s)
    schemes are allowed. PDFs are rejected with a hint to search for an
    HTML version instead. Never raises: all failures are returned as
    "Error: ..." / "Fetch error: ..." strings.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # Context manager ensures the streamed connection is released even
        # when we break out of the download loop early at the byte cap.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            chunks: list[bytes] = []
            total = 0
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break
            raw = b"".join(chunks)
            ctype = r.headers.get("Content-Type", "").lower()
        if "pdf" in ctype or url.lower().endswith(".pdf"):
            return (
                "Error: PDF binary not parsed here. "
                "Search for an HTML abstract page or use web_search instead."
            )
        text = raw.decode("utf-8", errors="replace")
        plain = _visible_text(text)
        # NOTE(review): the truncation/return tail below is reconstructed —
        # the original text was garbled at this point; confirm the exact
        # truncation marker against the project's other tools.
        if len(plain) > max_chars:
            plain = plain[:max_chars] + "\n...[truncated]"
        return plain or "No visible text extracted."
    except Exception as e:
        return f"Fetch error: {e}"


def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Accepts both youtube.com/watch?v=... and youtu.be/... URLs. The result
    is capped at 50,000 characters. Never raises: missing dependency, an
    unparseable URL, or an unavailable transcript all yield error strings.
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."
    m = re.search(
        r"(?:youtube\.com/watch\?v=|youtu\.be/)([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)
    try:
        # v1.x instance API: fetch() returns an iterable of snippet objects.
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [s.text for s in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    return "\n".join(lines)[:50_000]