import os
import re
import json
from urllib.parse import urljoin, urlparse
import httpx
import gradio as gr
from bs4 import BeautifulSoup
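# MCP URL Scraper: a small Gradio app that doubles as an MCP server, exposing
# scrape_url / scrape_many as tools (see demo.launch(mcp_server=True) below).
# Runtime deps: gradio, httpx, beautifulsoup4, lxml (used as the soup parser).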
# --- Scraper core helpers ---
def _is_valid_url(url: str) -> bool:
try:
u = urlparse(url.strip())
return u.scheme in {"http", "https"} and bool(u.netloc)
except Exception:
return False
def _clean_text(s: str) -> str:
s = re.sub(r"\s+", " ", s or "").strip()
return s
def _extract_main_text(html: str) -> str:
    """
    Lightweight "main text" extraction (no heavy ML deps):
    - remove script/style/noscript, then nav/footer/header/aside
    - prefer <main> or <article>, otherwise fall back to <body>
    """
soup = BeautifulSoup(html, "lxml")
for tag in soup(["script", "style", "noscript"]):
tag.decompose()
for selector in ["nav", "footer", "header", "aside"]:
for tag in soup.select(selector):
tag.decompose()
container = soup.find("main") or soup.find("article") or soup.body or soup
text = container.get_text(" ", strip=True)
return _clean_text(text)
def _extract_title(html: str) -> str:
soup = BeautifulSoup(html, "lxml")
if soup.title and soup.title.string:
return _clean_text(soup.title.string)
h1 = soup.find("h1")
return _clean_text(h1.get_text(strip=True)) if h1 else ""
def _extract_links(base_url: str, html: str, limit: int = 50) -> list[dict]:
soup = BeautifulSoup(html, "lxml")
links = []
seen = set()
for a in soup.find_all("a", href=True):
href = a.get("href", "").strip()
if not href:
continue
abs_url = urljoin(base_url, href)
abs_url = abs_url.split("#", 1)[0]
if not _is_valid_url(abs_url):
continue
if abs_url in seen:
continue
seen.add(abs_url)
links.append(
{
"url": abs_url,
"text": _clean_text(a.get_text(" ", strip=True))[:200],
}
)
if len(links) >= limit:
break
return links
def _safe_truncate(s: str, max_chars: int) -> str:
if len(s) <= max_chars:
return s
return s[: max_chars - 3] + "..."
# --- MCP-exposed tool functions (type hints + docstrings help MCP clients) ---
def scrape_url(
url: str,
    # note: no keyword-only marker here; Gradio passes UI inputs positionally
mode: str = "text",
timeout_s: int = 20,
max_chars: int = 12000,
follow_redirects: bool = True,
user_agent: str = "Mozilla/5.0 (compatible; GradioMCPUrlScraper/1.0)",
) -> dict:
"""
Fetch and scrape a URL.
Parameters:
url: The http(s) URL to fetch.
mode: One of:
- "text": returns title + extracted main text
- "html": returns raw HTML (truncated)
- "links": returns list of outgoing links (url + anchor text)
- "all": returns title + text + links + html (truncated)
timeout_s: Request timeout in seconds.
max_chars: Maximum characters returned for large fields.
follow_redirects: Whether to follow redirects.
user_agent: Custom User-Agent header.
Returns:
A JSON-serializable dict with fields depending on mode.
"""
url = (url or "").strip()
if not _is_valid_url(url):
return {"ok": False, "error": "Invalid URL. Must start with http:// or https://", "url": url}
    mode = (mode or "text").strip().lower()
    if mode not in {"text", "html", "links", "all"}:
        return {"ok": False, "error": f"Invalid mode '{mode}'. Use text|html|links|all.", "url": url}
    # Gradio sliders may deliver floats; coerce before arithmetic and slicing.
    timeout_s = float(timeout_s)
    max_chars = int(max_chars)
    headers = {"User-Agent": user_agent}
try:
with httpx.Client(headers=headers, timeout=timeout_s, follow_redirects=follow_redirects) as client:
r = client.get(url)
content_type = (r.headers.get("content-type") or "").lower()
            html = r.text  # httpx decodes the body using the response charset
out: dict = {
"ok": True,
"url": str(r.url),
"status_code": r.status_code,
"content_type": content_type,
}
            # Try to extract a title regardless of mode; omitted when none found
title = _extract_title(html)
if title:
out["title"] = title
if mode in {"text", "all"}:
text = _extract_main_text(html)
out["text"] = _safe_truncate(text, max_chars)
if mode in {"links", "all"}:
out["links"] = _extract_links(str(r.url), html, limit=50)
if mode in {"html", "all"}:
out["html"] = _safe_truncate(html, max_chars)
return out
except httpx.HTTPError as e:
return {"ok": False, "error": f"HTTP error: {type(e).__name__}: {str(e)}", "url": url}
except Exception as e:
return {"ok": False, "error": f"Unexpected error: {type(e).__name__}: {str(e)}", "url": url}
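# Illustrative only: a successful call returns a dict shaped roughly like
#   scrape_url("https://example.com", mode="text")
#   -> {"ok": True, "url": "...", "status_code": 200,
#       "content_type": "text/html", "title": "Example Domain", "text": "..."}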
def scrape_many(urls_json: str, mode: str = "text") -> list[dict]:
"""
Scrape multiple URLs in one call.
Parameters:
urls_json: JSON array of URLs, e.g. ["https://example.com", "https://example.org"]
mode: text|html|links|all
Returns:
List of scrape_url() results.
"""
try:
urls = json.loads(urls_json)
if not isinstance(urls, list):
raise ValueError("urls_json must be a JSON array")
except Exception as e:
return [{"ok": False, "error": f"Invalid JSON array: {str(e)}", "url": ""}]
results = []
for u in urls[:25]: # prevent abuse
results.append(scrape_url(str(u), mode=mode))
return results
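# Illustrative only: batch call with a JSON-encoded list of URLs, e.g.
#   scrape_many('["https://example.com", "https://example.org"]', mode="links")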
# --- Gradio UI ---
with gr.Blocks(title="MCP URL Scraper") as demo:
gr.Markdown(
"""
# MCP URL Scraper (Gradio + Hugging Face Spaces)
- Use the UI to scrape a single URL
- Or connect as an MCP server (tools: `scrape_url`, `scrape_many`)
"""
)
with gr.Row():
url_in = gr.Textbox(label="URL", placeholder="https://example.com", scale=3)
mode_in = gr.Dropdown(["text", "links", "html", "all"], value="text", label="Mode", scale=1)
with gr.Row():
timeout_in = gr.Slider(5, 60, value=20, step=1, label="Timeout (s)")
maxchars_in = gr.Slider(1000, 50000, value=12000, step=1000, label="Max chars returned")
run_btn = gr.Button("Scrape")
out_json = gr.JSON(label="Result")
run_btn.click(
fn=scrape_url,
inputs=[url_in, mode_in, timeout_in, maxchars_in],
outputs=[out_json],
)
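    # Assumption: register scrape_many as an API-only endpoint so the MCP server
    # also lists it as a tool; without this, only functions wired to UI events
    # are exposed. gr.api is available in recent Gradio 5 releases.
    gr.api(scrape_many)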
if __name__ == "__main__":
    # ssr_mode=False works around SSR-related rendering issues some users have hit on Spaces.
demo.launch(
server_name="0.0.0.0",
server_port=int(os.getenv("PORT", "7860")),
ssr_mode=False,
        mcp_server=True,  # expose the app's endpoints as MCP tools
)
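# Per Gradio's MCP docs, clients connect over SSE at a path like
# http://localhost:7860/gradio_api/mcp/sse (verify against your Gradio version).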