CultriX committed
Commit 7af8fe7 · verified · 1 parent: 315dac2

Update app.py

Files changed (1):
  1. app.py +125 -354

app.py CHANGED
@@ -1,51 +1,8 @@
  # app.py
- """
- RAG-Ready Content Scraper — Gradio + MCP (SSE)
-
- Exposes an MCP SSE endpoint on Hugging Face Spaces at:
-     /gradio_api/mcp/sse
-
- Example MCP configs:
-
- 1) Direct SSE (Cursor, Windsurf, Cline, etc.)
- {
-   "mcpServers": {
-     "gradio": {
-       "url": "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse"
-     }
-   }
- }
-
- 2) Experimental stdio via Node:
- {
-   "mcpServers": {
-     "gradio": {
-       "command": "npx",
-       "args": [
-         "mcp-remote",
-         "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse",
-         "--transport",
-         "sse-only"
-       ]
-     }
-   }
- }
- """
-
  from __future__ import annotations

- import os
- os.environ["HF_HOME"] = "/tmp/hf_cache"
- os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-
- import csv
- import json
- import re
- import subprocess
- import tempfile
+ import os, csv, json, re, subprocess, tempfile
  from typing import Optional, Tuple, Literal
-
- # NEW: use Annotated+Doc so MCP can show per-parameter descriptions
  from typing_extensions import Annotated, Doc

  import gradio as gr
@@ -56,229 +13,121 @@ from rag_scraper.converter import Converter
  from rag_scraper.link_extractor import LinkExtractor, LinkType
  from rag_scraper.utils import URLUtils

+ # Cache dir for HF Spaces
+ os.environ["HF_HOME"] = "/tmp/hf_cache"
+ os.makedirs(os.environ["HF_HOME"], exist_ok=True)

- # -----------------------------
- # Helper utilities
- # -----------------------------
-
- def is_github_repo(url_or_id: str) -> bool:
-     """Return True if the string looks like a GitHub repository reference."""
-     if "github.com" in url_or_id:
-         return True
-     return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
-
-
+ # ---------- helpers ----------
  def check_repomix_installed() -> bool:
-     """Check if the `repomix` CLI is available on PATH."""
+     """Return True if `repomix` is available on PATH."""
      try:
-         result = subprocess.run(
-             ["repomix", "--version"],
-             capture_output=True,
-             text=True,
-             check=False,
-         )
-         return result.returncode == 0
+         r = subprocess.run(["repomix", "--version"], capture_output=True, text=True, check=False)
+         return r.returncode == 0
      except Exception:
          return False

-
- def run_repomix(
-     repo_url_or_id: str,
-     progress: gr.Progress = gr.Progress(track_tqdm=True),
- ) -> Tuple[str, Optional[str]]:
-     """Run Repomix on a GitHub repository and return combined Markdown."""
-     progress(0, desc="Starting Repomix processing...")
+ def run_repomix(repo_url_or_id: str, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, Optional[str]]:
+     """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
+     progress(0, desc="Starting Repomix…")
      try:
-         with tempfile.TemporaryDirectory() as temp_dir:
-             output_file_name = "repomix-output.md"
-             output_file_path = os.path.join(temp_dir, output_file_name)
-
-             if "/" in repo_url_or_id and not repo_url_or_id.startswith("http"):
-                 repo_url = f"https://github.com/{repo_url_or_id}"
-             else:
-                 repo_url = repo_url_or_id
-
-             progress(0.2, desc=f"Running Repomix on {repo_url}...")
-             cmd = [
-                 "repomix",
-                 "--remote", repo_url,
-                 "--output", output_file_path,
-                 "--style", "markdown",
-                 "--compress",
-             ]
-             process = subprocess.run(
-                 cmd, capture_output=True, text=True, check=False, encoding="utf-8"
-             )
-             progress(0.8, desc="Repomix command executed.")
-
-             if process.returncode != 0:
-                 error_details = (
-                     f"Return Code: {process.returncode}\n"
-                     f"Stderr: {process.stderr}\n"
-                     f"Stdout: {process.stdout}"
-                 )
-                 return f"Error running Repomix:\n{error_details}", None
-
-             if os.path.exists(output_file_path):
-                 with open(output_file_path, "r", encoding="utf-8") as f:
-                     content = f.read()
-                 progress(1, desc="Repomix output processed.")
-                 return content, output_file_path
-
-             error_details = (
-                 f"Return Code: {process.returncode}\n"
-                 f"Stderr: {process.stderr}\n"
-                 f"Stdout: {process.stdout}"
-             )
-             return (
-                 f"Error: Repomix did not generate an output file at '{output_file_path}'.\n"
-                 f"Repomix Output:\n{error_details}",
-                 None,
-             )
-
+         with tempfile.TemporaryDirectory() as td:
+             out_path = os.path.join(td, "repomix-output.md")
+             repo_url = f"https://github.com/{repo_url_or_id}" if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http")) else repo_url_or_id
+             cmd = ["repomix", "--remote", repo_url, "--output", out_path, "--style", "markdown", "--compress"]
+             p = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding="utf-8")
+             progress(0.8, desc="Repomix done.")
+             if p.returncode != 0:
+                 err = f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
+                 return f"Error running Repomix:\n{err}", None
+             if os.path.exists(out_path):
+                 with open(out_path, "r", encoding="utf-8") as f:
+                     return f.read(), out_path
+             return "Error: Repomix did not produce an output file.", None
      except Exception as e:
-         progress(1, desc="Error during Repomix processing.")
-         return f"Error processing GitHub repository: {str(e)}", None
-
+         progress(1, desc="Error")
+         return f"Error processing GitHub repository: {e}", None

- def scrape_and_convert_website(
-     url: str,
-     depth: int,
-     progress: gr.Progress = gr.Progress(track_tqdm=True),
- ) -> Tuple[str, str]:
-     """Recursively scrape a website and convert pages to Markdown."""
-     progress(0, desc=f"Starting web scrape for {url}...")
-     visited_urls = set()
+ def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, str]:
+     """Recursively scrape a website and convert visited pages to Markdown."""
+     progress(0, desc=f"Scraping {url}…")
+     visited = set()

-     def recursive_scrape(
-         current_url: str,
-         current_depth: int,
-         total_links_estimate: int = 1,
-         link_index: int = 0,
-     ) -> str:
-         if current_url in visited_urls or current_depth < 0:
+     def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
+         if u in visited or d < 0:
              return ""
-
-         visited_urls.add(current_url)
-
+         visited.add(u)
          try:
-             progress_val = (
-                 link_index / total_links_estimate if total_links_estimate > 0 else 0
-             )
-             progress(
-                 progress_val,
-                 desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})",
-             )
-             html_content = Scraper.fetch_html(current_url)
+             progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
+             html = Scraper.fetch_html(u)
          except Exception as e:
-             return f"Error fetching {current_url}: {str(e)}\n"
-
-         markdown_content = f"## Extracted from: {current_url}\n\n"
-         markdown_content += Converter.html_to_markdown(
-             html=html_content,
-             base_url=current_url,
-             parser_features="html.parser",
-             ignore_links=True,
-         )
-         page_content = markdown_content + "\n\n"
-
-         if current_depth > 0:
+             return f"Error fetching {u}: {e}\n"
+         md = f"## Extracted from: {u}\n\n" + Converter.html_to_markdown(html=html, base_url=u, parser_features="html.parser", ignore_links=True) + "\n\n"
+         if d > 0:
              try:
-                 links = LinkExtractor.scrape_url(current_url, link_type=LinkType.INTERNAL)
-                 valid_links = [
-                     link for link in links
-                     if URLUtils.is_internal(link, current_url) and link not in visited_urls
-                 ]
-                 num_links = len(valid_links)
-                 for i, link_url in enumerate(valid_links):
-                     page_content += recursive_scrape(
-                         link_url, current_depth - 1, num_links, i
-                     )
+                 links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
+                 valid = [l for l in links if URLUtils.is_internal(l, u) and l not in visited]
+                 for j, nxt in enumerate(valid):
+                     md += rec(nxt, d - 1, len(valid), j)
              except Exception as e:
-                 page_content += f"Error extracting links from {current_url}: {str(e)}\n"
-
-         return page_content
-
-     all_markdown_content = recursive_scrape(url, depth)
-     progress(1, desc="Web scraping complete.")
-
-     with tempfile.NamedTemporaryFile(
-         mode="w+", delete=False, suffix=".md", encoding="utf-8"
-     ) as tmp_file:
-         tmp_file.write(all_markdown_content)
-         return all_markdown_content, tmp_file.name
-
-
- def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
-     """Wrap Markdown text in a JSON object with `source` and `content` keys."""
-     data = {"source": source_url_or_id, "content": markdown_content}
-     return json.dumps(data, indent=2)
-
-
- def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
-     """Persist Markdown as a simple CSV with two columns: `source`, `content`."""
-     output = tempfile.NamedTemporaryFile(
-         mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
-     )
-     writer = csv.writer(output)
-     writer.writerow(["source", "content"])
-     writer.writerow([source_url_or_id, markdown_content])
-     output.close()
-     return output.name
-
-
- def save_output_to_file(
-     content: str,
-     output_format: str,
-     source_url_or_id: str,
- ) -> str:
-     """Save processed content in the selected format and return a file path."""
-     processed_content = content  # default for Markdown/Text
-
-     if output_format == "JSON":
+                 md += f"Error extracting links from {u}: {e}\n"
+         return md
+
+     all_md = rec(url, depth)
+     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp:
+         tmp.write(all_md)
+     return all_md, tmp.name
+
+ def convert_to_json(markdown_content: str, source: str) -> str:
+     """Wrap Markdown in a tiny JSON schema."""
+     return json.dumps({"source": source, "content": markdown_content}, indent=2)
+
+ def convert_to_csv(markdown_content: str, source: str) -> str:
+     """Write a simple 2-column CSV and return its path."""
+     f = tempfile.NamedTemporaryFile(mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8")
+     w = csv.writer(f)
+     w.writerow(["source", "content"])
+     w.writerow([source, markdown_content])
+     f.close()
+     return f.name
+
+ def save_output_to_file(content: str, fmt: str, source: str) -> str:
+     """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
+     if fmt == "JSON":
+         data = convert_to_json(content, source)
          suffix = ".json"
-         processed_content = convert_to_json(content, source_url_or_id)
-     elif output_format == "CSV":
-         return convert_to_csv(content, source_url_or_id)
-     elif output_format == "Text":
-         suffix = ".txt"
-     elif output_format == "PDF":
+     elif fmt == "CSV":
+         return convert_to_csv(content, source)
+     elif fmt == "Text":
+         data, suffix = content, ".txt"
+     elif fmt == "PDF":
          try:
              with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
-                 pdf_output_path = tmp_pdf.name
-             md_pdf = markdown_pdf.MarkdownPdf(toc_level=2)
-             md_pdf.convert_from_string(content, pdf_output_path)
-             return pdf_output_path
+                 path = tmp_pdf.name
+             markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
+             return path
          except Exception as e:
-             # Fallback: persist as Markdown with .pdf.md suffix.
              print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
-             suffix = ".pdf.md"
+             data, suffix = content, ".pdf.md"
      else:
-         suffix = ".md"
+         data, suffix = content, ".md"

-     with tempfile.NamedTemporaryFile(
-         mode="w+", delete=False, suffix=suffix, encoding="utf-8"
-     ) as tmp_file:
-         tmp_file.write(processed_content)
-         return tmp_file.name
-
-
- # ----------------------------------------------------------
- # Main tool function (exposed to MCP via SSE)
- # ----------------------------------------------------------
+     with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp:
+         tmp.write(data)
+         return tmp.name

+ # ---------- MCP-exposed tool ----------
  def process_input_updated(
      url_or_id: Annotated[
          str,
-         Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
+         Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or the full GitHub URL."),
      ],
      source_type: Annotated[
          Literal["Webpage", "GitHub Repository"],
-         Doc('Select the content source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
+         Doc('Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
      ],
      depth: Annotated[
          int,
-         Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub repositories."),
+         Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub."),
      ],
      output_format_selection: Annotated[
          Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
@@ -287,131 +136,77 @@ def process_input_updated(
      progress: gr.Progress = gr.Progress(track_tqdm=True),
  ) -> Tuple[str, str, Optional[str]]:
      """
-     Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
+     Scrape a webpage (with configurable depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.

      Returns:
-         Tuple[str, str, Optional[str]]: (status, preview, file_path)
+         (status, preview, file_path)
      """
-     progress(0, desc="Initializing...")
-     raw_content = ""
-     error_message = ""
-     output_file_path: Optional[str] = None
+     progress(0, desc="Initializing")
+     raw, err = "", ""
+     out_path: Optional[str] = None

      if source_type == "GitHub Repository":
          if not check_repomix_installed():
-             error_message = (
-                 "Repomix is not installed or not accessible. "
-                 "Please ensure it's installed globally."
-             )
-             return error_message, "", None
-         raw_content, _ = run_repomix(url_or_id, progress=progress)
-         if raw_content.startswith("Error"):
-             error_message = raw_content
-             raw_content = ""
+             return "Repomix is not installed or not accessible.", "", None
+         raw, _ = run_repomix(url_or_id, progress=progress)
+         if raw.startswith("Error"):
+             return raw, "", None
      elif source_type == "Webpage":
-         raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-         if raw_content.startswith("Error"):
-             error_message = raw_content
-             raw_content = ""
+         raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+         if raw.startswith("Error"):
+             return raw, "", None
      else:
-         error_message = "Invalid source type selected."
-         return error_message, "", None
-
-     if error_message:
-         return error_message, "", None
+         return "Invalid source type selected.", "", None

      try:
-         progress(0.9, desc=f"Converting to {output_format_selection}...")
-         output_file_path = save_output_to_file(
-             raw_content, output_format_selection, url_or_id
-         )
+         progress(0.9, desc=f"Converting to {output_format_selection}")
+         out_path = save_output_to_file(raw, output_format_selection, url_or_id)

-         # Prepare preview content
-         preview_content = raw_content
+         preview = raw
          if output_format_selection == "JSON":
-             preview_content = convert_to_json(raw_content, url_or_id)
-         elif output_format_selection == "CSV" and output_file_path:
-             # Show a small preview of the CSV
+             preview = convert_to_json(raw, url_or_id)
+         elif output_format_selection == "CSV":
              try:
-                 with open(output_file_path, "r", encoding="utf-8") as f_csv:
-                     csv_preview_lines = [next(f_csv) for _ in range(5)]
-                 preview_content = "".join(csv_preview_lines) or "[CSV content is empty or very short]"
+                 with open(out_path, "r", encoding="utf-8") as f:
+                     first_lines = [next(f) for _ in range(5)]
+                 preview = "".join(first_lines) or "[CSV content is empty or very short]"
              except StopIteration:
-                 with open(output_file_path, "r", encoding="utf-8") as f_csv:
-                     preview_content = f_csv.read() or "[CSV content is empty]"
-             except Exception as e_csv_preview:
-                 preview_content = f"[Error reading CSV for preview: {str(e_csv_preview)}]"
-         elif output_format_selection == "CSV" and not output_file_path:
-             preview_content = "[CSV file path not available for preview]"
+                 with open(out_path, "r", encoding="utf-8") as f:
+                     preview = f.read() or "[CSV content is empty]"
+             except Exception as e:
+                 preview = f"[Error reading CSV for preview: {e}]"
          elif output_format_selection == "PDF":
-             # Can't render PDF in text preview
-             preview_content = (
-                 f"[PDF generated. Download to view: "
-                 f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
-             )
+             from os.path import basename
+             preview = f"[PDF generated. Download to view: {basename(out_path) if out_path else 'file.pdf'}]"

-         progress(1, desc="Processing complete.")
-         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
+         progress(1, desc="Done.")
+         return f"Successfully processed: {url_or_id}", preview, out_path

      except Exception as e:
-         return f"Error during file conversion/saving: {str(e)}", raw_content, None
-
-
- # -----------------------------
- # Gradio UI
- # -----------------------------
+         return f"Error during conversion: {e}", raw, None

+ # ---------- UI ----------
  with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
      gr.Markdown("# RAG-Ready Content Scraper")
-     gr.Markdown(
-         "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
-     )
+     gr.Markdown("Scrape webpage content or GitHub repositories to generate RAG-ready datasets.")

      with gr.Row():
          with gr.Column(scale=2):
-             url_input = gr.Textbox(
-                 label="Enter URL or GitHub Repository ID",
-                 placeholder="e.g., https://example.com OR username/repo",
-             )
-             source_type_input = gr.Radio(
-                 choices=["Webpage", "GitHub Repository"],
-                 value="Webpage",
-                 label="Select Source Type",
-             )
-             depth_input = gr.Slider(
-                 minimum=0,
-                 maximum=3,
-                 step=1,
-                 value=0,
-                 label="Scraping Depth (for Webpages)",
-                 info="0: Only main page. Ignored for GitHub repos.",
-             )
-             output_format_input = gr.Dropdown(
-                 choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
-                 value="Markdown",
-                 label="Select Output Format",
-             )
+             url_input = gr.Textbox(label="Enter URL or GitHub Repository ID", placeholder="https://example.com or owner/repo")
+             source_type_input = gr.Radio(choices=["Webpage", "GitHub Repository"], value="Webpage", label="Select Source Type")
+             depth_input = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Scraping Depth (for Webpages)", info="0 = only main page. Ignored for GitHub.")
+             output_format_input = gr.Dropdown(choices=["Markdown", "JSON", "CSV", "Text", "PDF"], value="Markdown", label="Select Output Format")
              submit_button = gr.Button("Process Content", variant="primary")
-
          with gr.Column(scale=3):
              status_output = gr.Textbox(label="Status", interactive=False)
-             preview_output = gr.Code(
-                 label="Preview Content", language="markdown", interactive=False
-             )
-             file_download_output = gr.File(
-                 label="Download Processed File", interactive=False
-             )
+             preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
+             file_download_output = gr.File(label="Download Processed File", interactive=False)

      gr.Examples(
          examples=[
              ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
              ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
-             [
-                 "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
-                 "Webpage",
-                 0,
-                 "JSON",
-             ],
+             ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
          ],
          inputs=[url_input, source_type_input, depth_input, output_format_input],
          outputs=[status_output, preview_output, file_download_output],
@@ -419,30 +214,6 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
          cache_examples=False,
      )

-     with gr.Accordion("How it Works & More Info", open=False):
-         gr.Markdown(
-             """
- **Webpage Scraping**
- 1. Enter a full URL (e.g., `https://example.com`).
- 2. Select "Webpage" as the source type.
- 3. Set the desired scraping depth.
- 4. Choose your output format.
-
- **GitHub Repository Processing**
- 1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
- 2. Select "GitHub Repository". (Depth is ignored.)
- 3. Choose your output format. Uses **Repomix**.
-
- **Output Formats**
- Markdown, JSON, CSV, Text, PDF.
-
- **Notes**
- - PDF generation requires the `markdown-pdf` library.
- - Designed for Docker/Hugging Face Spaces.
- - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
- """
-         )
-
      submit_button.click(
          fn=process_input_updated,
          inputs=[url_input, source_type_input, depth_input, output_format_input],
@@ -450,5 +221,5 @@ Markdown, JSON, CSV, Text, PDF.
      )

  if __name__ == "__main__":
-     # Enable queuing for concurrency; Spaces generally manage hosting.
-     iface.queue().launch(share=True)
+     # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
+     iface.queue().launch(share=True, mcp_server=True)
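With `mcp_server=True` passed to `launch()`, the Space should expose the MCP SSE endpoint at `/gradio_api/mcp/sse`. A direct-SSE client configuration (for Cursor, Windsurf, Cline, etc., as given in the module docstring this commit removes) would be:

```json
{
  "mcpServers": {
    "gradio": {
      "url": "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse"
    }
  }
}
```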
 
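For a quick local check of the refactored tool function, a minimal sketch (hypothetical usage, not part of the commit; it assumes `app.py` is importable with the `rag_scraper` package installed, and the GitHub path additionally needs the `repomix` CLI on PATH):

```python
# Hypothetical local smoke test for process_input_updated; not part of this commit.
# Outside a Gradio event, the default gr.Progress is expected to no-op.
from app import process_input_updated

status, preview, file_path = process_input_updated(
    url_or_id="https://example.com",
    source_type="Webpage",
    depth=0,  # 0 = scrape only the main page
    output_format_selection="Markdown",
)
print(status)     # e.g. "Successfully processed: https://example.com"
print(file_path)  # temp-file path of the exported Markdown
```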