0xmoose committed
Commit db419e8 · verified · 1 Parent(s): 2f7c08a

chore: mod to webscrape

Files changed (1):
  1. app.py +246 -67
app.py CHANGED
@@ -1,86 +1,265 @@
- import os
  import gradio as gr
- from duckduckgo_search import DDGS


- def web_search(
-     query: str,
-     max_results: int = 5,
-     region: str = "wt-wt",
-     safesearch: str = "moderate",
-     timelimit: str | None = None,
  ):
      """
-     Search the web using DuckDuckGo and return the top results.

      Args:
-         query: Search query string.
-         max_results: Maximum number of results to return (1-25 recommended).
-         region: DDG region (e.g. "wt-wt" global, "us-en", "ca-en", "uk-en").
-         safesearch: "off", "moderate", or "strict".
-         timelimit: Optional time filter: "d" (day), "w" (week), "m" (month), "y" (year), or None.

      Returns:
-         A list of results, each with title, url, and snippet.
      """
-     q = (query or "").strip()
-     if not q:
-         return []
-
-     max_results = int(max(1, min(max_results, 25)))
-
-     results = []
-     # DDGS returns dicts with keys like: title, href, body
-     with DDGS() as ddgs:
-         for r in ddgs.text(
-             q,
-             region=region,
-             safesearch=safesearch,
-             timelimit=timelimit,
-             max_results=max_results,
-         ):
-             results.append(
-                 {
-                     "title": r.get("title"),
-                     "url": r.get("href"),
-                     "snippet": r.get("body"),
-                 }
-             )
-     return results
-
-
- with gr.Blocks(title="Web Search (MCP-ready)") as demo:
      gr.Markdown(
          """
-         # Web Search (DuckDuckGo) — UI + MCP Tool
-         - Use the UI below, or connect as an MCP server from your agent/editor.
-         - The MCP endpoint appears in the Space footer **View API** → **MCP**.
-         """
      )

      with gr.Row():
-         query = gr.Textbox(label="Query", placeholder="e.g. gradio mcp server hugging face spaces")
      with gr.Row():
-         max_results = gr.Slider(1, 25, value=5, step=1, label="Max results")
-         region = gr.Dropdown(
-             choices=["wt-wt", "us-en", "ca-en", "uk-en", "au-en", "de-de", "fr-fr", "es-es", "it-it"],
-             value="wt-wt",
-             label="Region",
-         )
-         safesearch = gr.Dropdown(choices=["off", "moderate", "strict"], value="moderate", label="SafeSearch")
-         timelimit = gr.Dropdown(choices=[None, "d", "w", "m", "y"], value=None, label="Time limit")
-
-     run_btn = gr.Button("Search")
-     out = gr.JSON(label="Results (title/url/snippet)")
-
-     run_btn.click(
-         fn=web_search,
-         inputs=[query, max_results, region, safesearch, timelimit],
-         outputs=out,
-         api_name="web_search", # nicer MCP tool name
-         queue=False, # optional: reduces MCP progress overhead
      )

  if __name__ == "__main__":
-     # Gradio will expose MCP endpoints when mcp_server=True
-     demo.launch(mcp_server=True, server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
+ import re
+ import socket
+ import ipaddress
+ from urllib.parse import urlparse, urljoin
+
+ import httpx
  import gradio as gr
+ from bs4 import BeautifulSoup
+
+ try:
+     # Optional but recommended for cleaner article-style extraction
+     from readability import Document
+     HAS_READABILITY = True
+ except Exception:
+     HAS_READABILITY = False
+
+
+ # ----------------------------
+ # Security / validation helpers
+ # ----------------------------
+ def _is_public_hostname(hostname: str) -> bool:
+     """
+     Resolve hostname and block private/loopback/link-local/reserved ranges.
+     Mitigates SSRF against internal networks via DNS.
+     """
+     if not hostname:
+         return False
+
+     try:
+         # Disallow obvious local hostnames
+         hn = hostname.strip().lower()
+         if hn in {"localhost", "localhost.localdomain"}:
+             return False
+
+         infos = socket.getaddrinfo(hostname, None)
+         ips = {info[4][0] for info in infos}
+
+         for ip_str in ips:
+             ip = ipaddress.ip_address(ip_str)
+             if (
+                 ip.is_private
+                 or ip.is_loopback
+                 or ip.is_link_local
+                 or ip.is_reserved
+                 or ip.is_multicast
+             ):
+                 return False
+         return True
+     except Exception:
+         return False
+
+
+ def _validate_url(url: str) -> str:
+     url = (url or "").strip()
+     if not url:
+         raise ValueError("URL is required.")
+
+     parsed = urlparse(url)
+     if parsed.scheme not in {"http", "https"}:
+         raise ValueError("Only http:// and https:// URLs are allowed.")
+
+     if not parsed.netloc:
+         raise ValueError("Invalid URL (missing hostname).")
+
+     # Block credentials in URL (e.g., http://user:pass@host)
+     if parsed.username or parsed.password:
+         raise ValueError("URLs containing credentials are not allowed.")

+     if not _is_public_hostname(parsed.hostname):
+         raise ValueError("Blocked hostname/IP (possible local/private network).")

+     return url
+
+
+ # ----------------------------
+ # Extraction helpers
+ # ----------------------------
+ def _strip_text(text: str) -> str:
+     text = re.sub(r"\n{3,}", "\n\n", text)
+     text = re.sub(r"[ \t]{2,}", " ", text)
+     return text.strip()
+
+
+ def _extract_with_bs4(html: str, base_url: str):
+     soup = BeautifulSoup(html, "html.parser")
+
+     # Remove noisy tags
+     for tag in soup(["script", "style", "noscript", "iframe"]):
+         tag.decompose()
+
+     title = (soup.title.string.strip() if soup.title and soup.title.string else "")[:300]
+
+     # Basic meta
+     meta = {}
+     for m in soup.find_all("meta"):
+         name = (m.get("name") or m.get("property") or "").strip()
+         content = (m.get("content") or "").strip()
+         if name and content and name.lower() in {
+             "description",
+             "og:title",
+             "og:description",
+             "og:url",
+             "twitter:title",
+             "twitter:description",
+         }:
+             meta[name] = content[:500]
+
+     text = _strip_text(soup.get_text("\n"))
+
+     # Links
+     links = []
+     for a in soup.find_all("a", href=True):
+         href = a.get("href", "").strip()
+         if not href:
+             continue
+         abs_url = urljoin(base_url, href)
+         # keep only http(s)
+         if urlparse(abs_url).scheme in {"http", "https"}:
+             label = _strip_text(a.get_text(" "))[:200]
+             links.append({"text": label, "url": abs_url})
+
+     return title, meta, text, links
+
+
+ def _extract_readable(html: str, base_url: str):
+     """
+     Use readability-lxml if available; fall back to BeautifulSoup extraction.
+     """
+     if not HAS_READABILITY:
+         return _extract_with_bs4(html, base_url)
+
+     doc = Document(html)
+     title = (doc.short_title() or "")[:300]
+     content_html = doc.summary(html_partial=True)
+     # Extract from the readability-cleaned HTML, but keep readability's title:
+     # the partial summary usually carries no <title> tag of its own.
+     _, meta, text, links = _extract_with_bs4(content_html, base_url)
+     return title, meta, text, links
+
+
+ # ----------------------------
+ # Fetcher (with redirect checks)
+ # ----------------------------
+ def _fetch_html(url: str, timeout_s: float, max_bytes: int, user_agent: str, max_redirects: int = 5):
+     headers = {"User-Agent": user_agent}
+     limits = httpx.Limits(max_keepalive_connections=5, max_connections=10)
+
+     with httpx.Client(timeout=timeout_s, headers=headers, limits=limits, follow_redirects=False) as client:
+         current = url
+         for _ in range(max_redirects + 1):
+             r = client.get(current)
+             # Handle redirects manually so we can validate each hop
+             if 300 <= r.status_code < 400 and "location" in r.headers:
+                 nxt = urljoin(current, r.headers["location"])
+                 _validate_url(nxt)
+                 current = nxt
+                 continue
+
+             r.raise_for_status()
+
+             ctype = (r.headers.get("content-type") or "").lower()
+             if "text/html" not in ctype and "application/xhtml+xml" not in ctype:
+                 raise ValueError(f"Unsupported content-type: {ctype or 'unknown'} (expected HTML)")
+
+             content = r.content
+             if len(content) > max_bytes:
+                 raise ValueError(f"Response too large ({len(content)} bytes). Limit is {max_bytes} bytes.")
+
+             # Best-effort decode
+             try:
+                 html = content.decode(r.encoding or "utf-8", errors="replace")
+             except Exception:
+                 html = content.decode("utf-8", errors="replace")
+
+             return current, html
+
+     raise ValueError("Too many redirects.")
+
+
+ # ----------------------------
+ # MCP tool + UI function
+ # ----------------------------
+ def scrape_url(
+     url: str,
+     include_links: bool = True,
+     max_chars: int = 12000,
+     timeout_seconds: float = 15.0,
+     max_kb: int = 1024,
+     user_agent: str = "Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)"
  ):
      """
+     Scrape a single web page and return clean text + metadata.

      Args:
+         url (str): The http(s) URL to fetch.
+         include_links (bool): If true, include extracted hyperlinks.
+         max_chars (int): Maximum number of characters returned for the main text.
+         timeout_seconds (float): Network timeout in seconds.
+         max_kb (int): Maximum HTML response size in kilobytes.
+         user_agent (str): User-Agent header to send.

      Returns:
+         dict: {final_url, title, meta, text, links}
      """
+     url = _validate_url(url)
+     max_bytes = int(max_kb) * 1024
+
+     final_url, html = _fetch_html(
+         url=url,
+         timeout_s=float(timeout_seconds),
+         max_bytes=max_bytes,
+         user_agent=user_agent,
+     )
+
+     title, meta, text, links = _extract_readable(html, final_url)
+
+     text = text[: max(0, int(max_chars))]
+     if not include_links:
+         links = []
+
+     return {
+         "final_url": final_url,
+         "title": title,
+         "meta": meta,
+         "text": text,
+         "links": links[:200], # cap link count
+         "note": "readability-lxml enabled" if HAS_READABILITY else "readability-lxml not installed; using basic extraction",
+     }
+
+
+ # ----------------------------
+ # Gradio UI
+ # ----------------------------
+ with gr.Blocks(title="URL Scraper (MCP)") as demo:
      gr.Markdown(
          """
+         # URL Scraper (MCP-compatible)
+
+         - Paste a URL to get extracted text, title, metadata, and links.
+         - This Space also exposes the scraper as an **MCP tool**.
+
+         **MCP endpoint (after deploy):** `https://<your-space>.hf.space/gradio_api/mcp/`
+         """
      )

      with gr.Row():
+         url_in = gr.Textbox(label="URL", placeholder="https://example.com/article")
      with gr.Row():
+         include_links_in = gr.Checkbox(label="Include links", value=True)
+         max_chars_in = gr.Slider(1000, 50000, value=12000, step=500, label="Max returned characters")
+     with gr.Accordion("Advanced", open=False):
+         timeout_in = gr.Slider(5, 60, value=15, step=1, label="Timeout (seconds)")
+         max_kb_in = gr.Slider(128, 4096, value=1024, step=128, label="Max HTML size (KB)")
+         ua_in = gr.Textbox(label="User-Agent", value="Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)")
+
+     scrape_btn = gr.Button("Scrape")
+
+     out = gr.JSON(label="Result")
+
+     scrape_btn.click(
+         fn=scrape_url,
+         inputs=[url_in, include_links_in, max_chars_in, timeout_in, max_kb_in, ua_in],
+         outputs=[out],
+         api_name="scrape_url", # tool name in Gradio API (and MCP)
      )

  if __name__ == "__main__":
+     demo.launch(mcp_server=True)
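
For reference, a minimal sketch of calling the committed `scrape_url` endpoint through the regular Gradio API once the Space is running. The Space id below is a placeholder (not part of this commit), and the exact return shape from a `gr.JSON` output can vary by `gradio_client` version:

```python
# Hypothetical client-side check of the new endpoint; "owner/url-scraper" is a
# placeholder Space id, not something defined in this commit.
from gradio_client import Client

client = Client("owner/url-scraper")

# Positional arguments follow the scrape_url(...) signature in app.py:
# url, include_links, max_chars, timeout_seconds, max_kb, user_agent.
result = client.predict(
    "https://example.com/article",
    True,
    12000,
    15,
    1024,
    "Mozilla/5.0 (compatible; HFSpaceScraper/1.0; +https://huggingface.co/spaces)",
    api_name="/scrape_url",
)

print(result["final_url"], result["title"])
```

MCP-capable agents would instead connect to the `/gradio_api/mcp/` endpoint that the in-app Markdown points to, since `demo.launch(mcp_server=True)` exposes the same function as an MCP tool.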