File size: 3,559 Bytes
ac299d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ad045
 
 
ac299d5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None  # type: ignore

DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)"
)
MAX_FETCH_BYTES = 1_500_000


def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search."""
    # Guard clauses: reject blank queries and a missing optional dependency
    # with the same diagnostic strings callers already match on.
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    entries: list[str] = []
    try:
        with DDGS() as ddgs:
            hits = ddgs.text(query, max_results=max_results)
            for rank, hit in enumerate(hits, start=1):
                title = hit.get("title") or ""
                snippet = (hit.get("body") or "")[:400]
                link = hit.get("href") or ""
                entries.append(f"{rank}. {title}\n   {snippet}\n   URL: {link}")
    except Exception as e:
        # Network / library failures are reported as text, never raised.
        return f"Search error: {e}"
    return "\n\n".join(entries) if entries else "No results."


def _visible_text(html: str) -> str:
    """Extract human-visible text from an HTML document.

    Strips <script>/<style>/<noscript> content, collapses runs of 3+ blank
    lines, and unescapes HTML entities.
    """
    try:
        # lxml is fastest, but it is an optional third-party parser.
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Fall back to the stdlib parser so the tool keeps working when
        # lxml is not installed (bs4 raises FeatureNotFound in that case).
        soup = BeautifulSoup(html, "html.parser")
    # Remove elements whose text is never rendered to the reader.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Only http(s) URLs are accepted. At most MAX_FETCH_BYTES of the body are
    downloaded; HTML is reduced to visible text, other content is returned
    as decoded text. All failures are reported as an error string.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # Use the response as a context manager: with stream=True the
        # connection stays open until explicitly closed, and the original
        # code leaked the socket whenever the byte cap broke out early.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()
            # Reject PDFs from the headers, BEFORE downloading the body —
            # no point streaming up to MAX_FETCH_BYTES of a file we refuse.
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                return (
                    "Error: PDF binary not parsed here. "
                    "Search for an HTML abstract page or use web_search instead."
                )
            chunks: list[bytes] = []
            total = 0
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break  # cap memory use on huge pages
        raw = b"".join(chunks)
        text = raw.decode("utf-8", errors="replace")
        # Heuristic: run the HTML extractor only when the payload looks
        # like an HTML document; otherwise return the raw decoded text.
        plain = _visible_text(text) if "<html" in text.lower() else text
        plain = plain[:max_chars]
        return plain if plain.strip() else "(empty body after parse)"
    except Exception as e:
        return f"Fetch error: {e}"


def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Accepts watch, youtu.be, embed, and shorts URL forms. Output is capped
    at 50,000 characters; failures are reported as an error string.
    """
    # Parse the video id first so an unparseable URL fails fast with the
    # right diagnostic, even when the optional dependency is missing.
    m = re.search(
        r"(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
        r"([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)

    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."

    try:
        # v1.x API: instantiate, then fetch; snippets expose .text.
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [s.text for s in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    return "\n".join(lines)[:50_000]