CultriX committed on
Commit 5458065 · verified · 1 Parent(s): 20dc7c9

Update app.py

Files changed (1)
  1. app.py +221 -100
app.py CHANGED
@@ -1,43 +1,81 @@
 # app.py
 from __future__ import annotations
 
-import os, csv, json, re, subprocess, tempfile
+import os
+import csv
+import json
+import re
+import subprocess
+import tempfile
 from typing import Optional, Tuple, Literal
-from typing_extensions import Annotated, Doc
 
 import gradio as gr
 import markdown_pdf
+from typing_extensions import Annotated, Doc
+
+from pydantic import BaseModel, Field, conint
 
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
 
-# Cache dir for HF Spaces
+# -----------------------------
+# Environment (HF cache dir)
+# -----------------------------
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 
-# ---------- helpers ----------
+
+# -----------------------------
+# Helper utilities
+# -----------------------------
 def check_repomix_installed() -> bool:
     """Return True if `repomix` is available on PATH."""
     try:
-        r = subprocess.run(["repomix", "--version"], capture_output=True, text=True, check=False)
+        r = subprocess.run(
+            ["repomix", "--version"],
+            capture_output=True,
+            text=True,
+            check=False,
+        )
         return r.returncode == 0
     except Exception:
         return False
 
-def run_repomix(repo_url_or_id: str, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, Optional[str]]:
+
+def run_repomix(
+    repo_url_or_id: str,
+    progress: gr.Progress = gr.Progress(track_tqdm=True),
+) -> Tuple[str, Optional[str]]:
     """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
     progress(0, desc="Starting Repomix…")
     try:
         with tempfile.TemporaryDirectory() as td:
             out_path = os.path.join(td, "repomix-output.md")
-            repo_url = f"https://github.com/{repo_url_or_id}" if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http")) else repo_url_or_id
-            cmd = ["repomix", "--remote", repo_url, "--output", out_path, "--style", "markdown", "--compress"]
-            p = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding="utf-8")
+            repo_url = (
+                f"https://github.com/{repo_url_or_id}"
+                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
+                else repo_url_or_id
+            )
+            cmd = [
+                "repomix",
+                "--remote",
+                repo_url,
+                "--output",
+                out_path,
+                "--style",
+                "markdown",
+                "--compress",
+            ]
+            p = subprocess.run(
+                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
+            )
             progress(0.8, desc="Repomix done.")
             if p.returncode != 0:
-                err = f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
+                err = (
+                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
+                )
                 return f"Error running Repomix:\n{err}", None
             if os.path.exists(out_path):
                 with open(out_path, "r", encoding="utf-8") as f:
@@ -47,7 +85,12 @@ def run_repomix(repo_url_or_id: str, progress: gr.Progress = gr.Progress(track_t
         progress(1, desc="Error")
         return f"Error processing GitHub repository: {e}", None
 
-def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, str]:
+
+def scrape_and_convert_website(
+    url: str,
+    depth: int,
+    progress: gr.Progress = gr.Progress(track_tqdm=True),
+) -> Tuple[str, str]:
     """Recursively scrape a website and convert visited pages to Markdown."""
     progress(0, desc=f"Scraping {url}…")
     visited = set()
@@ -61,11 +104,21 @@ def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.
             html = Scraper.fetch_html(u)
         except Exception as e:
             return f"Error fetching {u}: {e}\n"
-        md = f"## Extracted from: {u}\n\n" + Converter.html_to_markdown(html=html, base_url=u, parser_features="html.parser", ignore_links=True) + "\n\n"
+        md = (
+            f"## Extracted from: {u}\n\n"
+            + Converter.html_to_markdown(
+                html=html, base_url=u, parser_features="html.parser", ignore_links=True
+            )
+            + "\n\n"
+        )
         if d > 0:
             try:
                 links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
-                valid = [l for l in links if URLUtils.is_internal(l, u) and l not in visited]
+                valid = [
+                    l
+                    for l in links
+                    if URLUtils.is_internal(l, u) and l not in visited
+                ]
                 for j, nxt in enumerate(valid):
                     md += rec(nxt, d - 1, len(valid), j)
             except Exception as e:
@@ -73,23 +126,30 @@ def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.
         return md
 
     all_md = rec(url, depth)
-    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp:
+    with tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, suffix=".md", encoding="utf-8"
+    ) as tmp:
         tmp.write(all_md)
     return all_md, tmp.name
 
+
 def convert_to_json(markdown_content: str, source: str) -> str:
     """Wrap Markdown in a tiny JSON schema."""
     return json.dumps({"source": source, "content": markdown_content}, indent=2)
 
+
 def convert_to_csv(markdown_content: str, source: str) -> str:
     """Write a simple 2-column CSV and return its path."""
-    f = tempfile.NamedTemporaryFile(mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8")
+    f = tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
+    )
     w = csv.writer(f)
     w.writerow(["source", "content"])
     w.writerow([source, markdown_content])
     f.close()
     return f.name
 
+
 def save_output_to_file(content: str, fmt: str, source: str) -> str:
     """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
     if fmt == "JSON":
@@ -111,133 +171,174 @@ def save_output_to_file(content: str, fmt: str, source: str) -> str:
     else:
         data, suffix = content, ".md"
 
-    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp:
+    with tempfile.NamedTemporaryFile(
+        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
+    ) as tmp:
         tmp.write(data)
     return tmp.name
 
-# ---------- MCP-exposed tool ----------
-from typing import Optional, Tuple, Literal
-from typing_extensions import Annotated, Doc
-import os
-import gradio as gr
 
+# -----------------------------
+# Core UI-bound function
+# -----------------------------
 def process_input_updated(
-    url_or_id: Annotated[
-        str,
-        Doc("For webpages: full URL (e.g. https://example.com). For GitHub: owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
-    ],
-    source_type: Annotated[
-        Literal["Webpage", "GitHub Repository"],
-        Doc('Choose source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
-    ],
-    depth: Annotated[
-        int,
-        Doc("Crawl depth for webpages (0–3). 0 = main page only. Ignored for GitHub repositories."),
-    ],
-    output_format_selection: Annotated[
-        Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
-        Doc("Output format for the processed content."),
-    ],
+    url_or_id: str,
+    source_type: Literal["Webpage", "GitHub Repository"],
+    depth: int,
+    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
-    Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
-
-    Parameters
-    ----------
-    url_or_id : str
-        For webpages: full URL (e.g. ``https://example.com``).
-        For GitHub: either ``owner/repo`` or a full GitHub URL
-        (e.g. ``https://github.com/owner/repo``).
-    source_type : {"Webpage", "GitHub Repository"}
-        Choose the content source. Use **Webpage** to crawl HTML; use
-        **GitHub Repository** to run Repomix.
-    depth : int
-        Crawl depth for webpages in the range 0–3 where 0 = only the main page.
-        Ignored when ``source_type`` is ``"GitHub Repository"``.
-    output_format_selection : {"Markdown", "JSON", "CSV", "Text", "PDF"}
-        Desired output format for the processed content.
-    progress : gr.Progress, optional
-        (UI only) Gradio progress tracker. MCP callers can omit this.
-
-    Returns
-    -------
-    (status, preview, file_path) : tuple[str, str, Optional[str]]
-        - **status**: Human-readable status line.
-        - **preview**: Text preview (full Markdown/JSON/Text, or a short note for CSV/PDF).
-        - **file_path**: Path to the generated artifact, or ``None`` on error.
+    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
+    then export as Markdown/JSON/CSV/Text/PDF.
     """
     progress(0, desc="Initializing…")
-    raw_content = ""
-    output_file_path: Optional[str] = None
+    out_path: Optional[str] = None
 
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            return "Repomix is not installed or not accessible. Please install it.", "", None
-        raw_content, _ = run_repomix(url_or_id, progress=progress)
-        if raw_content.startswith("Error"):
-            return raw_content, "", None
+            return "Repomix is not installed or not accessible.", "", None
+        raw, _ = run_repomix(url_or_id, progress=progress)
+        if raw.startswith("Error"):
+            return raw, "", None
     elif source_type == "Webpage":
-        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw_content.startswith("Error"):
-            return raw_content, "", None
+        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+        if raw.startswith("Error"):
+            return raw, "", None
     else:
         return "Invalid source type selected.", "", None
 
     try:
         progress(0.9, desc=f"Converting to {output_format_selection}…")
-        output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)
+        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
 
-        preview_content = raw_content
+        preview = raw
         if output_format_selection == "JSON":
-            preview_content = convert_to_json(raw_content, url_or_id)
-        elif output_format_selection == "CSV" and output_file_path:
+            preview = convert_to_json(raw, url_or_id)
+        elif output_format_selection == "CSV":
             try:
-                with open(output_file_path, "r", encoding="utf-8") as f_csv:
-                    csv_preview_lines = [next(f_csv) for _ in range(5)]
-                preview_content = "".join(csv_preview_lines) or "[CSV content is empty or very short]"
+                with open(out_path, "r", encoding="utf-8") as f:
+                    first_lines = [next(f) for _ in range(5)]
+                preview = "".join(first_lines) or "[CSV content is empty or very short]"
             except StopIteration:
-                with open(output_file_path, "r", encoding="utf-8") as f_csv:
-                    preview_content = f_csv.read() or "[CSV content is empty]"
-            except Exception as e_csv_preview:
-                preview_content = f"[Error reading CSV for preview: {e_csv_preview}]"
-        elif output_format_selection == "CSV" and not output_file_path:
-            preview_content = "[CSV file path not available for preview]"
+                with open(out_path, "r", encoding="utf-8") as f:
+                    preview = f.read() or "[CSV content is empty]"
+            except Exception as e:
+                preview = f"[Error reading CSV for preview: {e}]"
         elif output_format_selection == "PDF":
-            preview_content = (
+            from os.path import basename
+
+            preview = (
                 f"[PDF generated. Download to view: "
-                f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
+                f"{basename(out_path) if out_path else 'file.pdf'}]"
             )
 
-        progress(1, desc="Processing complete.")
-        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
+        progress(1, desc="Done.")
+        return f"Successfully processed: {url_or_id}", preview, out_path
 
     except Exception as e:
-        return f"Error during file conversion/saving: {e}", raw_content, None
+        return f"Error during conversion: {e}", "", None
 
 
-# ---------- UI ----------
-with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
+# -----------------------------
+# Pydantic models for MCP tool
+# -----------------------------
+class ProcessArgs(BaseModel):
+    url_or_id: str = Field(
+        ...,
+        description=(
+            "For webpages, a full URL (e.g., https://example.com). "
+            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
+        ),
+    )
+    source_type: Literal["Webpage", "GitHub Repository"] = Field(
+        ...,
+        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
+    )
+    depth: conint(ge=0, le=3) = Field(
+        ...,
+        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
+    )
+    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
+        ...,
+        description="Desired output format for the processed content.",
+    )
+
+
+class ProcessResult(BaseModel):
+    status: str = Field(..., description="Human-readable status line.")
+    preview: str = Field(
+        ...,
+        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
+    )
+    file_path: Optional[str] = Field(
+        None, description="Temp file path for the artifact, or null if not created."
+    )
+
+
+def process_input_mcp(args: ProcessArgs) -> ProcessResult:
+    """
+    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
+    """
+    status, preview, path = process_input_updated(
+        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
+    )
+    return ProcessResult(status=status, preview=preview, file_path=path)
+
+
+# -----------------------------
+# Gradio UI
+# -----------------------------
+with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
     gr.Markdown("# RAG-Ready Content Scraper")
-    gr.Markdown("Scrape webpage content or GitHub repositories to generate RAG-ready datasets.")
+    gr.Markdown(
+        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
+    )
 
     with gr.Row():
         with gr.Column(scale=2):
-            url_input = gr.Textbox(label="Enter URL or GitHub Repository ID", placeholder="https://example.com or owner/repo")
-            source_type_input = gr.Radio(choices=["Webpage", "GitHub Repository"], value="Webpage", label="Select Source Type")
-            depth_input = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Scraping Depth (for Webpages)", info="0 = only main page. Ignored for GitHub.")
-            output_format_input = gr.Dropdown(choices=["Markdown", "JSON", "CSV", "Text", "PDF"], value="Markdown", label="Select Output Format")
+            url_input = gr.Textbox(
+                label="Enter URL or GitHub Repository ID",
+                placeholder="https://example.com or owner/repo",
+            )
+            source_type_input = gr.Radio(
+                choices=["Webpage", "GitHub Repository"],
+                value="Webpage",
+                label="Select Source Type",
+            )
+            depth_input = gr.Slider(
+                minimum=0,
+                maximum=3,
+                step=1,
+                value=0,
+                label="Scraping Depth (for Webpages)",
+                info="0 = only main page. Ignored for GitHub.",
+            )
+            output_format_input = gr.Dropdown(
+                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
+                value="Markdown",
+                label="Select Output Format",
+            )
             submit_button = gr.Button("Process Content", variant="primary")
         with gr.Column(scale=3):
             status_output = gr.Textbox(label="Status", interactive=False)
-            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
-            file_download_output = gr.File(label="Download Processed File", interactive=False)
+            preview_output = gr.Code(
+                label="Preview Content", language="markdown", interactive=False
+            )
+            file_download_output = gr.File(
+                label="Download Processed File", interactive=False
+            )
 
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
             ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
-            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
+            [
+                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
+                "Webpage",
+                0,
+                "JSON",
+            ],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],
         outputs=[status_output, preview_output, file_download_output],
@@ -251,6 +352,26 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
         outputs=[status_output, preview_output, file_download_output],
     )
 
+# -----------------------------
+# MCP-only Interface (Pydantic tool)
+# -----------------------------
+# We expose a second interface whose *function signature* uses Pydantic models.
+# MCP reads this signature to build a JSON Schema with rich field descriptions.
+mcp_iface = gr.Interface(
+    fn=process_input_mcp,
+    # Components are placeholders; MCP ignores them and reads the Python types.
+    # Keep them simple so the tab is usable if someone clicks it.
+    inputs=gr.JSON(label="ProcessArgs (JSON)"),
+    outputs=gr.JSON(label="ProcessResult (JSON)"),
+    title="MCP Tool: process_input_mcp",
+    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
+    allow_flagging="never",
+)
+
+# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
+app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
+
+
 if __name__ == "__main__":
     # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
-    iface.queue().launch(share=True, mcp_server=True)
+    app.queue().launch(share=True, mcp_server=True)
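
The central addition in this commit is the Pydantic-typed `process_input_mcp` entry point. A minimal in-process smoke test sketch, assuming the file is importable as a module named `app` and that its dependencies (gradio, pydantic, rag_scraper) are installed; the module name and example URL are illustrative assumptions, not part of the commit:

```python
# Hypothetical smoke test; assumes app.py is importable as `app`.
from app import ProcessArgs, process_input_mcp

args = ProcessArgs(
    url_or_id="https://example.com",       # full URL, since source_type is "Webpage"
    source_type="Webpage",
    depth=0,                               # 0 = main page only
    output_format_selection="Markdown",
)
result = process_input_mcp(args)           # returns a ProcessResult model
print(result.status)                       # e.g. "Successfully processed: ..."
print(result.file_path)                    # temp-file path to the artifact, or None
```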
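Because `depth` is declared as `conint(ge=0, le=3)`, out-of-range values are rejected at the model boundary before any scraping starts. A short sketch of that behavior, under the same import assumption as above:

```python
from pydantic import ValidationError

from app import ProcessArgs  # hypothetical import, as above

try:
    ProcessArgs(
        url_or_id="https://example.com",
        source_type="Webpage",
        depth=7,  # outside the 0-3 bound enforced by conint(ge=0, le=3)
        output_format_selection="JSON",
    )
except ValidationError as exc:
    print(exc)  # pydantic reports the constraint violation; no scraping happens
```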
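Once the Space is running with `mcp_server=True`, the endpoint noted in the launch comment (`/gradio_api/mcp/sse`) can be reached with any MCP client. A sketch using the `mcp` Python SDK's SSE client; the Space hostname is a placeholder, and the exact client API may shift between SDK versions:

```python
import asyncio

from mcp import ClientSession
from mcp.client.sse import sse_client

async def main() -> None:
    # Placeholder hostname; substitute the real Space URL.
    url = "https://YOUR-SPACE.hf.space/gradio_api/mcp/sse"
    async with sse_client(url) as (read, write):
        async with ClientSession(read, write) as session:
            await session.initialize()
            tools = await session.list_tools()
            print([t.name for t in tools.tools])  # should include the exposed tool

asyncio.run(main())
```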