0xmoose committed on
Commit 20f9d00 · verified · 1 Parent(s): 3bd24c5

Update app.py

Files changed (1)
  1. app.py +159 -200
app.py CHANGED
@@ -1,265 +1,224 @@
+import os
 import re
-import socket
-import ipaddress
-from urllib.parse import urlparse, urljoin
+import json
+import asyncio
+from urllib.parse import urljoin, urlparse
 
 import httpx
 import gradio as gr
 from bs4 import BeautifulSoup
 
-try:
-    # Optional but recommended for cleaner article-style extraction
-    from readability import Document
-    HAS_READABILITY = True
-except Exception:
-    HAS_READABILITY = False
-
-
-# ----------------------------
-# Security / validation helpers
-# ----------------------------
-def _is_public_hostname(hostname: str) -> bool:
-    """
-    Resolve hostname and block private/loopback/link-local/reserved ranges.
-    Mitigates SSRF against internal networks via DNS.
-    """
-    if not hostname:
-        return False
-
+
+# --- Scraper core helpers ---
+
+def _is_valid_url(url: str) -> bool:
     try:
-        # Disallow obvious local hostnames
-        hn = hostname.strip().lower()
-        if hn in {"localhost", "localhost.localdomain"}:
-            return False
-
-        infos = socket.getaddrinfo(hostname, None)
-        ips = {info[4][0] for info in infos}
-
-        for ip_str in ips:
-            ip = ipaddress.ip_address(ip_str)
-            if (
-                ip.is_private
-                or ip.is_loopback
-                or ip.is_link_local
-                or ip.is_reserved
-                or ip.is_multicast
-            ):
-                return False
-        return True
+        u = urlparse(url.strip())
+        return u.scheme in {"http", "https"} and bool(u.netloc)
     except Exception:
         return False
 
 
-def _validate_url(url: str) -> str:
-    url = (url or "").strip()
-    if not url:
-        raise ValueError("URL is required.")
-
-    parsed = urlparse(url)
-    if parsed.scheme not in {"http", "https"}:
-        raise ValueError("Only http:// and https:// URLs are allowed.")
-
-    if not parsed.netloc:
-        raise ValueError("Invalid URL (missing hostname).")
-
-    # Block credentials in URL (e.g., http://user:pass@host)
-    if parsed.username or parsed.password:
-        raise ValueError("URLs containing credentials are not allowed.")
-
-    if not _is_public_hostname(parsed.hostname):
-        raise ValueError("Blocked hostname/IP (possible local/private network).")
-
-    return url
-
-
-# ----------------------------
-# Extraction helpers
-# ----------------------------
-def _strip_text(text: str) -> str:
-    text = re.sub(r"\n{3,}", "\n\n", text)
-    text = re.sub(r"[ \t]{2,}", " ", text)
-    return text.strip()
-
-
-def _extract_with_bs4(html: str, base_url: str):
-    soup = BeautifulSoup(html, "html.parser")
-
-    # Remove noisy tags
-    for tag in soup(["script", "style", "noscript", "iframe"]):
-        tag.decompose()
-
-    title = (soup.title.string.strip() if soup.title and soup.title.string else "")[:300]
-
-    # Basic meta
-    meta = {}
-    for m in soup.find_all("meta"):
-        name = (m.get("name") or m.get("property") or "").strip()
-        content = (m.get("content") or "").strip()
-        if name and content and name.lower() in {
-            "description",
-            "og:title",
-            "og:description",
-            "og:url",
-            "twitter:title",
-            "twitter:description",
-        }:
-            meta[name] = content[:500]
-
-    text = _strip_text(soup.get_text("\n"))
-
-    # Links
+def _clean_text(s: str) -> str:
+    s = re.sub(r"\s+", " ", s or "").strip()
+    return s
+
+
+def _extract_main_text(html: str) -> str:
+    """
+    Lightweight "main text" extraction (no heavy ML deps):
+    - remove script/style/nav/footer/header/aside
+    - prefer <main> or <article>, otherwise body
+    """
+    soup = BeautifulSoup(html, "lxml")
+
+    for tag in soup(["script", "style", "noscript"]):
+        tag.decompose()
+
+    for selector in ["nav", "footer", "header", "aside"]:
+        for tag in soup.select(selector):
+            tag.decompose()
+
+    container = soup.find("main") or soup.find("article") or soup.body or soup
+    text = container.get_text(" ", strip=True)
+    return _clean_text(text)
+
+
+def _extract_title(html: str) -> str:
+    soup = BeautifulSoup(html, "lxml")
+    if soup.title and soup.title.string:
+        return _clean_text(soup.title.string)
+    h1 = soup.find("h1")
+    return _clean_text(h1.get_text(strip=True)) if h1 else ""
+
+
+def _extract_links(base_url: str, html: str, limit: int = 50) -> list[dict]:
+    soup = BeautifulSoup(html, "lxml")
     links = []
+    seen = set()
+
     for a in soup.find_all("a", href=True):
         href = a.get("href", "").strip()
         if not href:
             continue
         abs_url = urljoin(base_url, href)
-        # keep only http(s)
-        if urlparse(abs_url).scheme in {"http", "https"}:
-            label = _strip_text(a.get_text(" "))[:200]
-            links.append({"text": label, "url": abs_url})
-
-    return title, meta, text, links
-
-
-def _extract_readable(html: str, base_url: str):
-    """
-    Use readability-lxml if available; fallback to BeautifulSoup extraction.
-    """
-    if not HAS_READABILITY:
-        return _extract_with_bs4(html, base_url)
-
-    doc = Document(html)
-    title = (doc.short_title() or "")[:300]
-    content_html = doc.summary(html_partial=True)
-    return _extract_with_bs4(content_html, base_url)
-
-
-# ----------------------------
-# Fetcher (with redirect checks)
-# ----------------------------
-def _fetch_html(url: str, timeout_s: float, max_bytes: int, user_agent: str, max_redirects: int = 5):
+        abs_url = abs_url.split("#", 1)[0]
+
+        if not _is_valid_url(abs_url):
+            continue
+        if abs_url in seen:
+            continue
+
+        seen.add(abs_url)
+        links.append(
+            {
+                "url": abs_url,
+                "text": _clean_text(a.get_text(" ", strip=True))[:200],
+            }
+        )
+        if len(links) >= limit:
+            break
+
+    return links
+
+
+def _safe_truncate(s: str, max_chars: int) -> str:
+    if len(s) <= max_chars:
+        return s
+    return s[: max_chars - 3] + "..."
+
+
+# --- MCP-exposed tool functions (type hints + docstrings help MCP clients) ---
+
+def scrape_url(
+    url: str,
+    # parameters stay positional so Gradio's click() can pass them in order
+    mode: str = "text",
+    timeout_s: int = 20,
+    max_chars: int = 12000,
+    follow_redirects: bool = True,
+    user_agent: str = "Mozilla/5.0 (compatible; GradioMCPUrlScraper/1.0)",
+) -> dict:
+    """
+    Fetch and scrape a URL.
+
+    Parameters:
+        url: The http(s) URL to fetch.
+        mode: One of:
+            - "text": returns title + extracted main text
+            - "html": returns raw HTML (truncated)
+            - "links": returns list of outgoing links (url + anchor text)
+            - "all": returns title + text + links + html (truncated)
+        timeout_s: Request timeout in seconds.
+        max_chars: Maximum characters returned for large fields.
+        follow_redirects: Whether to follow redirects.
+        user_agent: Custom User-Agent header.
+
+    Returns:
+        A JSON-serializable dict with fields depending on mode.
+    """
+    url = (url or "").strip()
+    if not _is_valid_url(url):
+        return {"ok": False, "error": "Invalid URL. Must start with http:// or https://", "url": url}
+
     headers = {"User-Agent": user_agent}
-    limits = httpx.Limits(max_keepalive_connections=5, max_connections=10)
-
-    with httpx.Client(timeout=timeout_s, headers=headers, limits=limits, follow_redirects=False) as client:
-        current = url
-        for _ in range(max_redirects + 1):
-            r = client.get(current)
-            # Handle redirects manually so we can validate each hop
-            if 300 <= r.status_code < 400 and "location" in r.headers:
-                nxt = urljoin(current, r.headers["location"])
-                _validate_url(nxt)
-                current = nxt
-                continue
-
-            r.raise_for_status()
-
-            ctype = (r.headers.get("content-type") or "").lower()
-            if "text/html" not in ctype and "application/xhtml+xml" not in ctype:
-                raise ValueError(f"Unsupported content-type: {ctype or 'unknown'} (expected HTML)")
-
-            content = r.content
-            if len(content) > max_bytes:
-                raise ValueError(f"Response too large ({len(content)} bytes). Limit is {max_bytes} bytes.")
-
-            # Best-effort decode
-            try:
-                html = content.decode(r.encoding or "utf-8", errors="replace")
-            except Exception:
-                html = content.decode("utf-8", errors="replace")
-
-            return current, html
-
-    raise ValueError("Too many redirects.")
-
-
-# ----------------------------
-# MCP tool + UI function
-# ----------------------------
-def scrape_url(
-    url: str,
-    include_links: bool = True,
-    max_chars: int = 12000,
-    timeout_seconds: float = 15.0,
-    max_kb: int = 1024,
-    user_agent: str = "Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)"
-):
+    try:
+        with httpx.Client(headers=headers, timeout=timeout_s, follow_redirects=follow_redirects) as client:
+            r = client.get(url)
+            content_type = (r.headers.get("content-type") or "").lower()
+            html = r.text  # httpx decodes the body using the detected charset
+
+            out: dict = {
+                "ok": True,
+                "url": str(r.url),
+                "status_code": r.status_code,
+                "content_type": content_type,
+            }
+
+            # Always compute title if HTML-ish
+            title = _extract_title(html)
+            if title:
+                out["title"] = title
+
+            mode = (mode or "text").strip().lower()
+            if mode not in {"text", "html", "links", "all"}:
+                return {"ok": False, "error": f"Invalid mode '{mode}'. Use text|html|links|all.", "url": url}
+
+            if mode in {"text", "all"}:
+                text = _extract_main_text(html)
+                out["text"] = _safe_truncate(text, max_chars)
+
+            if mode in {"links", "all"}:
+                out["links"] = _extract_links(str(r.url), html, limit=50)
+
+            if mode in {"html", "all"}:
+                out["html"] = _safe_truncate(html, max_chars)
+
+            return out
+
+    except httpx.HTTPError as e:
+        return {"ok": False, "error": f"HTTP error: {type(e).__name__}: {str(e)}", "url": url}
+    except Exception as e:
+        return {"ok": False, "error": f"Unexpected error: {type(e).__name__}: {str(e)}", "url": url}
+
+
+def scrape_many(urls_json: str, mode: str = "text") -> list[dict]:
     """
-    Scrape a single web page and return clean text + metadata.
+    Scrape multiple URLs in one call.
 
-    Args:
-        url (str): The http(s) URL to fetch.
-        include_links (bool): If true, include extracted hyperlinks.
-        max_chars (int): Maximum number of characters returned for the main text.
-        timeout_seconds (float): Network timeout in seconds.
-        max_kb (int): Maximum HTML response size in kilobytes.
-        user_agent (str): User-Agent header to send.
+    Parameters:
+        urls_json: JSON array of URLs, e.g. ["https://example.com", "https://example.org"]
+        mode: text|html|links|all
 
     Returns:
-        dict: {final_url, title, meta, text, links}
+        List of scrape_url() results.
     """
-    url = _validate_url(url)
-    max_bytes = int(max_kb) * 1024
-
-    final_url, html = _fetch_html(
-        url=url,
-        timeout_s=float(timeout_seconds),
-        max_bytes=max_bytes,
-        user_agent=user_agent,
-    )
-
-    title, meta, text, links = _extract_readable(html, final_url)
-
-    text = text[: max(0, int(max_chars))]
-    if not include_links:
-        links = []
-
-    return {
-        "final_url": final_url,
-        "title": title,
-        "meta": meta,
-        "text": text,
-        "links": links[:200],  # cap link count
-        "note": "readability-lxml enabled" if HAS_READABILITY else "readability-lxml not installed; using basic extraction",
-    }
-
-
-# ----------------------------
-# Gradio UI
-# ----------------------------
-with gr.Blocks(title="URL Scraper (MCP)") as demo:
+    try:
+        urls = json.loads(urls_json)
+        if not isinstance(urls, list):
+            raise ValueError("urls_json must be a JSON array")
+    except Exception as e:
+        return [{"ok": False, "error": f"Invalid JSON array: {str(e)}", "url": ""}]
+
+    results = []
+    for u in urls[:25]:  # prevent abuse
+        results.append(scrape_url(str(u), mode=mode))
+    return results
+
+
+# --- Gradio UI ---
+
+with gr.Blocks(title="MCP URL Scraper") as demo:
     gr.Markdown(
         """
-        # URL Scraper (MCP-compatible)
-
-        - Paste a URL to get extracted text, title, metadata, and links.
-        - This Space also exposes the scraper as an **MCP tool**.
-
-        **MCP endpoint (after deploy):** `https://<your-space>.hf.space/gradio_api/mcp/`
+        # MCP URL Scraper (Gradio + Hugging Face Spaces)
+        - Use the UI to scrape a single URL
+        - Or connect as an MCP server (tools: `scrape_url`, `scrape_many`)
         """
     )
 
     with gr.Row():
-        url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
-    with gr.Row():
-        include_links_in = gr.Checkbox(label="Include links", value=True)
-        max_chars_in = gr.Slider(1000, 50000, value=12000, step=500, label="Max returned characters")
-    with gr.Accordion("Advanced", open=False):
-        timeout_in = gr.Slider(5, 60, value=15, step=1, label="Timeout (seconds)")
-        max_kb_in = gr.Slider(128, 4096, value=1024, step=128, label="Max HTML size (KB)")
-        ua_in = gr.Textbox(label="User-Agent", value="Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)")
-
-    scrape_btn = gr.Button("Scrape")
-
-    out = gr.JSON(label="Result")
-
-    scrape_btn.click(
+        url_in = gr.Textbox(label="URL", placeholder="https://example.com", scale=3)
+        mode_in = gr.Dropdown(["text", "links", "html", "all"], value="text", label="Mode", scale=1)
+
+    with gr.Row():
+        timeout_in = gr.Slider(5, 60, value=20, step=1, label="Timeout (s)")
+        maxchars_in = gr.Slider(1000, 50000, value=12000, step=1000, label="Max chars returned")
+
+    run_btn = gr.Button("Scrape")
+    out_json = gr.JSON(label="Result")
+
+    run_btn.click(
         fn=scrape_url,
-        inputs=[url_in, include_links_in, max_chars_in, timeout_in, max_kb_in, ua_in],
-        outputs=[out],
-        api_name="scrape_url",  # tool name in Gradio API (and MCP)
+        inputs=[url_in, mode_in, timeout_in, maxchars_in],
+        outputs=[out_json],
     )
 
 if __name__ == "__main__":
-    demo.launch(mcp_server=True)
+    # ssr_mode=False is a workaround users have reported for SSR-related quirks on Spaces
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=int(os.getenv("PORT", "7860")),
+        ssr_mode=False,
+        mcp_server=True,  # enables the MCP endpoints
+    )
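
For reference, a minimal smoke test of the two tools added in this commit. This is a sketch, not part of the commit: it assumes the file is importable as `app.py`, that `httpx`, `beautifulsoup4`, and `lxml` are installed, that network access is available, and it uses `example.com`/`example.org` as placeholder targets.

```python
# Hypothetical local check of the new tool functions; not part of commit 20f9d00.
from app import scrape_url, scrape_many

# Single URL: mode="all" returns title, text, links, and truncated html on success.
result = scrape_url("https://example.com", mode="all", timeout_s=10, max_chars=2000)
print(result["ok"], result.get("status_code"), result.get("title"))

# Batch helper: takes a JSON-encoded array of URLs, capped at 25 per call.
batch = scrape_many('["https://example.com", "https://example.org"]', mode="text")
print([(r.get("url"), r["ok"]) for r in batch])
```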
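And a small illustration of the link-extraction rules in `_extract_links` (fragment stripping, http(s)-only filtering, de-duplication, and the 50-link cap), again a sketch assuming the `lxml` parser is available:

```python
# Hypothetical check of _extract_links behavior; not part of commit 20f9d00.
from app import _extract_links

html = '<a href="/a#frag">One</a> <a href="/a">Dup</a> <a href="mailto:hi@example.com">Mail</a>'
print(_extract_links("https://example.com", html))
# Expected: [{'url': 'https://example.com/a', 'text': 'One'}]
# "/a#frag" and "/a" collapse to one entry once the fragment is stripped,
# and the mailto: link fails _is_valid_url, so it is skipped.
```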