File size: 3,559 Bytes
ac299d5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1ad045
 
 
ac299d5
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

try:
    from duckduckgo_search import DDGS
except ImportError:
    DDGS = None  # type: ignore

DEFAULT_UA = (
    "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co/spaces)"
)
MAX_FETCH_BYTES = 1_500_000


def web_search(query: str, max_results: int = 8) -> str:
    """Return short snippets and URLs from DuckDuckGo text search."""
    # Guard clauses: reject blank queries and a missing optional dependency
    # with the same diagnostic strings callers already match on.
    if not query.strip():
        return "Error: empty query."
    if DDGS is None:
        return "Error: duckduckgo_search is not installed."
    entries: list[str] = []
    try:
        with DDGS() as ddgs:
            hits = ddgs.text(query, max_results=max_results)
            for rank, hit in enumerate(hits, start=1):
                title = hit.get("title") or ""
                snippet = (hit.get("body") or "")[:400]
                link = hit.get("href") or ""
                entries.append(f"{rank}. {title}\n   {snippet}\n   URL: {link}")
    except Exception as e:
        # Network / library failures are reported as text, never raised.
        return f"Search error: {e}"
    return "\n\n".join(entries) if entries else "No results."


def _visible_text(html: str) -> str:
    """Extract human-visible text from an HTML document.

    Strips <script>/<style>/<noscript> content, collapses runs of 3+ blank
    lines, and unescapes HTML entities.
    """
    try:
        # lxml is fastest, but it is an optional third-party parser.
        soup = BeautifulSoup(html, "lxml")
    except Exception:
        # Fall back to the stdlib parser so the tool keeps working when
        # lxml is not installed (bs4 raises FeatureNotFound in that case).
        soup = BeautifulSoup(html, "html.parser")
    # Remove elements whose text is never rendered to the reader.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    text = unescape(text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def fetch_url(url: str, max_chars: int = 25_000) -> str:
    """Fetch a URL and return extracted plain text (truncated).

    Only http(s) URLs are accepted. At most MAX_FETCH_BYTES of the body are
    downloaded; HTML is reduced to visible text, other content is returned
    as decoded text. All failures are reported as an error string.
    """
    if not url.strip():
        return "Error: empty URL."
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return "Error: only http(s) URLs are allowed."
    try:
        # Use the response as a context manager: with stream=True the
        # connection stays open until explicitly closed, and the original
        # code leaked the socket whenever the byte cap broke out early.
        with requests.get(
            url,
            timeout=45,
            headers={"User-Agent": DEFAULT_UA},
            stream=True,
        ) as r:
            r.raise_for_status()
            ctype = r.headers.get("Content-Type", "").lower()
            # Reject PDFs from the headers, BEFORE downloading the body —
            # no point streaming up to MAX_FETCH_BYTES of a file we refuse.
            if "pdf" in ctype or url.lower().endswith(".pdf"):
                return (
                    "Error: PDF binary not parsed here. "
                    "Search for an HTML abstract page or use web_search instead."
                )
            chunks: list[bytes] = []
            total = 0
            for chunk in r.iter_content(chunk_size=65536):
                if not chunk:
                    continue
                chunks.append(chunk)
                total += len(chunk)
                if total >= MAX_FETCH_BYTES:
                    break  # cap memory use on huge pages
        raw = b"".join(chunks)
        text = raw.decode("utf-8", errors="replace")
        # Heuristic: run the HTML extractor only when the payload looks
        # like an HTML document; otherwise return the raw decoded text.
        plain = _visible_text(text) if "<html" in text.lower() else text
        plain = plain[:max_chars]
        return plain if plain.strip() else "(empty body after parse)"
    except Exception as e:
        return f"Fetch error: {e}"


def youtube_transcript(video_url: str) -> str:
    """Return transcript text when the video exposes captions (unofficial API).

    Accepts watch, youtu.be, embed, and shorts URL forms. Output is capped
    at 50,000 characters; failures are reported as an error string.
    """
    # Parse the video id first so an unparseable URL fails fast with the
    # right diagnostic, even when the optional dependency is missing.
    m = re.search(
        r"(?:youtube\.com/(?:watch\?v=|embed/|shorts/)|youtu\.be/)"
        r"([a-zA-Z0-9_-]{6,})",
        video_url,
    )
    if not m:
        return "Error: could not parse YouTube video id from URL."
    vid = m.group(1)

    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        return "Error: youtube_transcript_api not installed."

    try:
        # v1.x API: instantiate, then fetch; snippets expose .text.
        api = YouTubeTranscriptApi()
        fetched = api.fetch(vid)
        lines = [s.text for s in fetched]
    except Exception as e:
        return f"No transcript available: {e}"
    return "\n".join(lines)[:50_000]