chore: mod to webscrape

app.py CHANGED
@@ -1,86 +1,265 @@
[removed: the original 86-line app.py, largely unrecoverable in this view; the surviving fragments show it registered its handler as the MCP tool "web_search" (with queue=False, # optional: reduces MCP progress overhead) and launched with demo.launch(mcp_server=True, server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))]
+import re
+import socket
+import ipaddress
+from urllib.parse import urlparse, urljoin
+
+import httpx
 import gradio as gr
+from bs4 import BeautifulSoup
+
+try:
+    # Optional but recommended for cleaner article-style extraction
+    from readability import Document
+    HAS_READABILITY = True
+except Exception:
+    HAS_READABILITY = False
+
+
+# ----------------------------
+# Security / validation helpers
+# ----------------------------
+def _is_public_hostname(hostname: str) -> bool:
+    """
+    Resolve hostname and block private/loopback/link-local/reserved ranges.
+    Mitigates SSRF against internal networks via DNS.
+    """
+    if not hostname:
+        return False
+
+    try:
+        # Disallow obvious local hostnames
+        hn = hostname.strip().lower()
+        if hn in {"localhost", "localhost.localdomain"}:
+            return False
+
+        infos = socket.getaddrinfo(hostname, None)
+        ips = {info[4][0] for info in infos}
+
+        for ip_str in ips:
+            ip = ipaddress.ip_address(ip_str)
+            if (
+                ip.is_private
+                or ip.is_loopback
+                or ip.is_link_local
+                or ip.is_reserved
+                or ip.is_multicast
+            ):
+                return False
+        return True
+    except Exception:
+        return False
+
+
+def _validate_url(url: str) -> str:
+    url = (url or "").strip()
+    if not url:
+        raise ValueError("URL is required.")
+
+    parsed = urlparse(url)
+    if parsed.scheme not in {"http", "https"}:
+        raise ValueError("Only http:// and https:// URLs are allowed.")
+
+    if not parsed.netloc:
+        raise ValueError("Invalid URL (missing hostname).")
+
+    # Block credentials in URL (e.g., http://user:pass@host)
+    if parsed.username or parsed.password:
+        raise ValueError("URLs containing credentials are not allowed.")

+    if not _is_public_hostname(parsed.hostname):
+        raise ValueError("Blocked hostname/IP (possible local/private network).")

+    return url
+
+
+# ----------------------------
+# Extraction helpers
+# ----------------------------
+def _strip_text(text: str) -> str:
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = re.sub(r"[ \t]{2,}", " ", text)
+    return text.strip()
+
+
+def _extract_with_bs4(html: str, base_url: str):
+    soup = BeautifulSoup(html, "html.parser")
+
+    # Remove noisy tags
+    for tag in soup(["script", "style", "noscript", "iframe"]):
+        tag.decompose()
+
+    title = (soup.title.string.strip() if soup.title and soup.title.string else "")[:300]
+
+    # Basic meta
+    meta = {}
+    for m in soup.find_all("meta"):
+        name = (m.get("name") or m.get("property") or "").strip()
+        content = (m.get("content") or "").strip()
+        if name and content and name.lower() in {
+            "description",
+            "og:title",
+            "og:description",
+            "og:url",
+            "twitter:title",
+            "twitter:description",
+        }:
+            meta[name] = content[:500]
+
+    text = _strip_text(soup.get_text("\n"))
+
+    # Links
+    links = []
+    for a in soup.find_all("a", href=True):
+        href = a.get("href", "").strip()
+        if not href:
+            continue
+        abs_url = urljoin(base_url, href)
+        # keep only http(s)
+        if urlparse(abs_url).scheme in {"http", "https"}:
+            label = _strip_text(a.get_text(" "))[:200]
+            links.append({"text": label, "url": abs_url})
+
+    return title, meta, text, links
+
+
+def _extract_readable(html: str, base_url: str):
+    """
+    Use readability-lxml if available; fallback to BeautifulSoup extraction.
+    """
+    if not HAS_READABILITY:
+        return _extract_with_bs4(html, base_url)
+
+    doc = Document(html)
+    title = (doc.short_title() or "")[:300]
+    content_html = doc.summary(html_partial=True)
+    # Keep readability's title: the summary fragment has no <title> tag for bs4 to find
+    _, meta, text, links = _extract_with_bs4(content_html, base_url)
+    return title, meta, text, links
+
+
+# ----------------------------
+# Fetcher (with redirect checks)
+# ----------------------------
+def _fetch_html(url: str, timeout_s: float, max_bytes: int, user_agent: str, max_redirects: int = 5):
+    headers = {"User-Agent": user_agent}
+    limits = httpx.Limits(max_keepalive_connections=5, max_connections=10)
+
+    with httpx.Client(timeout=timeout_s, headers=headers, limits=limits, follow_redirects=False) as client:
+        current = url
+        for _ in range(max_redirects + 1):
+            r = client.get(current)
+            # Handle redirects manually so we can validate each hop
+            if 300 <= r.status_code < 400 and "location" in r.headers:
+                nxt = urljoin(current, r.headers["location"])
+                _validate_url(nxt)
+                current = nxt
+                continue
+
+            r.raise_for_status()
+
+            ctype = (r.headers.get("content-type") or "").lower()
+            if "text/html" not in ctype and "application/xhtml+xml" not in ctype:
+                raise ValueError(f"Unsupported content-type: {ctype or 'unknown'} (expected HTML)")
+
+            content = r.content
+            if len(content) > max_bytes:
+                raise ValueError(f"Response too large ({len(content)} bytes). Limit is {max_bytes} bytes.")
+
+            # Best-effort decode
+            try:
+                html = content.decode(r.encoding or "utf-8", errors="replace")
+            except Exception:
+                html = content.decode("utf-8", errors="replace")
+
+            return current, html
+
+        raise ValueError("Too many redirects.")
+
+
+# ----------------------------
+# MCP tool + UI function
+# ----------------------------
+def scrape_url(
+    url: str,
+    include_links: bool = True,
+    max_chars: int = 12000,
+    timeout_seconds: float = 15.0,
+    max_kb: int = 1024,
+    user_agent: str = "Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)",
 ):
     """
+    Scrape a single web page and return clean text + metadata.

     Args:
+        url (str): The http(s) URL to fetch.
+        include_links (bool): If true, include extracted hyperlinks.
+        max_chars (int): Maximum number of characters returned for the main text.
+        timeout_seconds (float): Network timeout in seconds.
+        max_kb (int): Maximum HTML response size in kilobytes.
+        user_agent (str): User-Agent header to send.

     Returns:
+        dict: {final_url, title, meta, text, links}
     """
+    url = _validate_url(url)
+    max_bytes = int(max_kb) * 1024
+
+    final_url, html = _fetch_html(
+        url=url,
+        timeout_s=float(timeout_seconds),
+        max_bytes=max_bytes,
+        user_agent=user_agent,
+    )
+
+    title, meta, text, links = _extract_readable(html, final_url)
+
+    text = text[: max(0, int(max_chars))]
+    if not include_links:
+        links = []
+
+    return {
+        "final_url": final_url,
+        "title": title,
+        "meta": meta,
+        "text": text,
+        "links": links[:200],  # cap link count
+        "note": "readability-lxml enabled" if HAS_READABILITY else "readability-lxml not installed; using basic extraction",
+    }
+
+
+# ----------------------------
+# Gradio UI
+# ----------------------------
+with gr.Blocks(title="URL Scraper (MCP)") as demo:
gr.Markdown(
|
| 233 |
"""
|
| 234 |
+
# URL Scraper (MCP-compatible)
|
| 235 |
+
|
| 236 |
+
- Paste a URL → get extracted text, title, metadata, and links.
|
| 237 |
+
- This Space also exposes the scraper as an **MCP tool**.
|
| 238 |
+
|
| 239 |
+
**MCP endpoint (after deploy):** `https://<your-space>.hf.space/gradio_api/mcp/`
|
| 240 |
+
"""
|
| 241 |
)
|
| 242 |
|
| 243 |
with gr.Row():
|
| 244 |
+
url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
|
| 245 |
with gr.Row():
|
| 246 |
+
include_links_in = gr.Checkbox(label="Include links", value=True)
|
| 247 |
+
max_chars_in = gr.Slider(1000, 50000, value=12000, step=500, label="Max returned characters")
|
| 248 |
+
with gr.Accordion("Advanced", open=False):
|
| 249 |
+
timeout_in = gr.Slider(5, 60, value=15, step=1, label="Timeout (seconds)")
|
| 250 |
+
max_kb_in = gr.Slider(128, 4096, value=1024, step=128, label="Max HTML size (KB)")
|
| 251 |
+
ua_in = gr.Textbox(label="User-Agent", value="Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)")
|
| 252 |
+
|
| 253 |
+
scrape_btn = gr.Button("Scrape")
|
| 254 |
+
|
| 255 |
+
out = gr.JSON(label="Result")
|
| 256 |
+
|
| 257 |
+
scrape_btn.click(
|
| 258 |
+
fn=scrape_url,
|
| 259 |
+
inputs=[url_in, include_links_in, max_chars_in, timeout_in, max_kb_in, ua_in],
|
| 260 |
+
outputs=[out],
|
| 261 |
+
api_name="scrape_url", # tool name in Gradio API (and MCP)
|
|
|
|
|
|
|
| 262 |
)
|
| 263 |
|
| 264 |
if __name__ == "__main__":
|
| 265 |
+
demo.launch(mcp_server=True)
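Note: judging from the new imports, the Space's requirements.txt needs at least gradio, httpx, and beautifulsoup4, plus readability-lxml if the optional article-style extraction path is wanted (package names inferred from the import statements; exact pins are not part of this commit).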
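Because the handler is registered with api_name="scrape_url", it is callable over the plain Gradio API as well as MCP. A minimal sketch with gradio_client, assuming the Space is public; "your-name/url-scraper" is a placeholder Space id:

# pip install gradio_client
from gradio_client import Client

client = Client("your-name/url-scraper")  # hypothetical Space id
result = client.predict(
    "https://example.com/article",  # url
    True,                           # include_links
    12000,                          # max_chars
    15,                             # timeout_seconds
    1024,                           # max_kb
    "Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)",  # user_agent
    api_name="/scrape_url",
)
print(result["title"], result["final_url"])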
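The _validate_url / _is_public_hostname pair is the SSRF guard applied to the initial URL and to every redirect hop. A quick local sanity check, assuming app.py and its dependencies are importable from the working directory (it needs DNS access, since hostnames are resolved):

# Sanity-check the SSRF guard outside the UI.
from app import _validate_url

print(_validate_url("https://example.com/article"))  # public host: returned unchanged
try:
    _validate_url("http://127.0.0.1:8080/admin")      # loopback address: rejected
except ValueError as err:
    print(err)  # "Blocked hostname/IP (possible local/private network)."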