Spaces:
Sleeping
Sleeping
| import re | |
| from html import unescape | |
| from typing import Optional | |
| from urllib.parse import urlparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
| try: | |
| from duckduckgo_search import DDGS | |
| except ImportError: | |
| DDGS = None # type: ignore | |
| DEFAULT_UA = ( | |
| "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)" | |
| ) | |
| MAX_FETCH_BYTES = 1_500_000 | |
def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search."""
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    # Materialize the results first so any search failure is reported
    # before we spend time formatting.
    try:
        with DDGS() as ddgs:
            hits = list(ddgs.text(query, max_results=max_results))
    except Exception as e:
        return f"Search error: {e}"
    formatted = [
        f"{rank}. {hit.get('title') or ''}\n {(hit.get('body') or '')[:400]}\n URL: {hit.get('href') or ''}"
        for rank, hit in enumerate(hits, start=1)
    ]
    if not formatted:
        return "No results."
    return "\n\n".join(formatted)
def _visible_text(html: str) -> str:
    """Extract readable plain text from an HTML document.

    Removes script/style/noscript elements, unescapes any remaining HTML
    entities, and collapses runs of three-plus newlines to a single blank
    line.

    Args:
        html: Raw HTML markup.

    Returns:
        The stripped visible text.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # lxml is an optional dependency; BeautifulSoup raises
        # FeatureNotFound when it is missing. Fall back to the slower
        # but always-available stdlib parser instead of erroring out.
        soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Args:
        url: An http(s) URL to retrieve.
        max_chars: Maximum number of characters to return.

    Returns:
        Plain text on success, or a human-readable "Error: ..." /
        "Fetch error: ..." string on failure. Never raises.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # Use the response as a context manager: with stream=True the
        # connection is NOT returned to the pool until closed, so the
        # original code leaked a connection per call.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()
            # Check for PDF *before* downloading the body — the original
            # streamed up to MAX_FETCH_BYTES only to discard it.
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                return (
                    "Error: PDF binary not parsed here. "
                    "Search for an HTML abstract page or use web_search instead."
                )
            chunks: list[bytes] = []
            total = 0
            # Stream with a hard byte cap so a huge page cannot exhaust memory.
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break
        raw = b"".join(chunks)
        text = raw.decode("utf-8", errors="replace")
        # Only run the HTML extractor when the body looks like HTML;
        # plain-text responses are passed through as-is.
        plain = _visible_text(text) if "<html" in text.lower() else text
        plain = plain[:max_chars]
        return plain if plain.strip() else "(empty body after parse)"
    except Exception as e:
        return f"Fetch error: {e}"
def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Args:
        video_url: A YouTube URL in any common form — watch, youtu.be,
            embed, shorts, or live.

    Returns:
        Transcript text (capped at 50,000 characters), or an
        "Error: ..." / "No transcript available: ..." string on failure.
        Never raises.
    """
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."
    # Accept embed/, shorts/, live/ paths and watch URLs where v= is not
    # the first query parameter, in addition to the classic forms.
    m = re.search(
        r"(?:youtube\.com/(?:watch\?(?:[^#\s]*?[?&])?v=|embed/|shorts/|live/)"
        r"|youtu\.be/)([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)
    try:
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [s.text for s in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    return "\n".join(lines)[:50_000]