CultriX committed on
Commit
36555b1
·
verified ·
1 Parent(s): 675ca61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -396
app.py CHANGED
@@ -12,6 +12,7 @@ from typing import Optional, Tuple, Literal
12
  import gradio as gr
13
  import markdown_pdf
14
  from typing_extensions import Annotated, Doc
 
15
  from pydantic import BaseModel, Field, conint
16
 
17
  from rag_scraper.scraper import Scraper
@@ -25,206 +26,6 @@ from rag_scraper.utils import URLUtils
25
  os.environ["HF_HOME"] = "/tmp/hf_cache"
26
  os.makedirs(os.environ["HF_HOME"], exist_ok=True)
27
 
28
- # -----------------------------
29
- # Custom CSS for modern UI
30
- # -----------------------------
31
- custom_css = """
32
- .gradio-container {
33
- font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
34
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
35
- min-height: 100vh;
36
- }
37
-
38
- .main-container {
39
- background: rgba(255, 255, 255, 0.95) !important;
40
- border-radius: 20px !important;
41
- box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3) !important;
42
- margin: 20px auto !important;
43
- max-width: 1400px !important;
44
- padding: 30px !important;
45
- backdrop-filter: blur(10px) !important;
46
- }
47
-
48
- .title-container {
49
- text-align: center;
50
- margin-bottom: 30px;
51
- padding: 20px;
52
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
53
- border-radius: 15px;
54
- color: white;
55
- box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4);
56
- }
57
-
58
- .title-container h1 {
59
- font-size: 2.5rem !important;
60
- font-weight: 700 !important;
61
- margin: 0 !important;
62
- text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
63
- }
64
-
65
- .title-container p {
66
- font-size: 1.1rem !important;
67
- margin: 10px 0 0 0 !important;
68
- opacity: 0.95;
69
- }
70
-
71
- .input-panel {
72
- background: white !important;
73
- border-radius: 15px !important;
74
- padding: 25px !important;
75
- box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
76
- border: 1px solid rgba(102, 126, 234, 0.1) !important;
77
- }
78
-
79
- .output-panel {
80
- background: white !important;
81
- border-radius: 15px !important;
82
- padding: 25px !important;
83
- box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
84
- border: 1px solid rgba(102, 126, 234, 0.1) !important;
85
- }
86
-
87
- .gradio-button {
88
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
89
- border: none !important;
90
- border-radius: 10px !important;
91
- color: white !important;
92
- font-weight: 600 !important;
93
- padding: 12px 30px !important;
94
- transition: all 0.3s ease !important;
95
- box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
96
- }
97
-
98
- .gradio-button:hover {
99
- transform: translateY(-2px) !important;
100
- box-shadow: 0 6px 20px rgba(102, 126, 234, 0.4) !important;
101
- }
102
-
103
- .gradio-textbox, .gradio-dropdown, .gradio-slider, .gradio-radio {
104
- border-radius: 10px !important;
105
- border: 2px solid #e5e7eb !important;
106
- transition: all 0.3s ease !important;
107
- }
108
-
109
- .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-slider:focus {
110
- border-color: #667eea !important;
111
- box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
112
- }
113
-
114
- .gradio-radio label {
115
- padding: 8px 16px !important;
116
- border-radius: 8px !important;
117
- margin: 4px !important;
118
- transition: all 0.3s ease !important;
119
- }
120
-
121
- .gradio-radio label:hover {
122
- background: rgba(102, 126, 234, 0.1) !important;
123
- }
124
-
125
- .gradio-code {
126
- border-radius: 10px !important;
127
- font-family: 'Fira Code', monospace !important;
128
- box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.06) !important;
129
- }
130
-
131
- .gradio-file {
132
- border-radius: 10px !important;
133
- border: 2px dashed #667eea !important;
134
- background: rgba(102, 126, 234, 0.05) !important;
135
- }
136
-
137
- .progress-bar {
138
- background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
139
- border-radius: 10px !important;
140
- }
141
-
142
- .examples-container {
143
- background: rgba(102, 126, 234, 0.05) !important;
144
- border-radius: 15px !important;
145
- padding: 20px !important;
146
- margin-top: 20px !important;
147
- border: 1px solid rgba(102, 126, 234, 0.2) !important;
148
- }
149
-
150
- .status-box {
151
- background: linear-gradient(135deg, #10b981 0%, #059669 100%) !important;
152
- color: white !important;
153
- border-radius: 10px !important;
154
- padding: 15px !important;
155
- font-weight: 500 !important;
156
- }
157
-
158
- .error-box {
159
- background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important;
160
- color: white !important;
161
- border-radius: 10px !important;
162
- padding: 15px !important;
163
- font-weight: 500 !important;
164
- }
165
-
166
- .info-box {
167
- background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
168
- color: white !important;
169
- border-radius: 10px !important;
170
- padding: 15px !important;
171
- font-weight: 500 !important;
172
- }
173
-
174
- .feature-card {
175
- background: white !important;
176
- border-radius: 12px !important;
177
- padding: 20px !important;
178
- margin: 10px 0 !important;
179
- box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08) !important;
180
- border-left: 4px solid #667eea !important;
181
- transition: transform 0.3s ease !important;
182
- }
183
-
184
- .feature-card:hover {
185
- transform: translateX(5px) !important;
186
- }
187
-
188
- .tab-nav {
189
- background: rgba(102, 126, 234, 0.1) !important;
190
- border-radius: 10px !important;
191
- padding: 5px !important;
192
- }
193
-
194
- .tab-nav button {
195
- border-radius: 8px !important;
196
- margin: 2px !important;
197
- transition: all 0.3s ease !important;
198
- }
199
-
200
- .tab-nav button:hover {
201
- background: rgba(102, 126, 234, 0.2) !important;
202
- }
203
-
204
- .footer {
205
- text-align: center;
206
- margin-top: 30px;
207
- padding: 20px;
208
- color: white;
209
- font-size: 0.9rem;
210
- }
211
-
212
- .footer a {
213
- color: white;
214
- text-decoration: underline;
215
- }
216
-
217
- /* Animation for loading */
218
- @keyframes pulse {
219
- 0% { opacity: 1; }
220
- 50% { opacity: 0.5; }
221
- 100% { opacity: 1; }
222
- }
223
-
224
- .loading {
225
- animation: pulse 1.5s ease-in-out infinite;
226
- }
227
- """
228
 
229
  # -----------------------------
230
  # Helper utilities
@@ -248,7 +49,7 @@ def run_repomix(
248
  progress: gr.Progress = gr.Progress(track_tqdm=True),
249
  ) -> Tuple[str, Optional[str]]:
250
  """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
251
- progress(0, desc="🚀 Starting Repomix…")
252
  try:
253
  with tempfile.TemporaryDirectory() as td:
254
  out_path = os.path.join(td, "repomix-output.md")
@@ -270,19 +71,19 @@ def run_repomix(
270
  p = subprocess.run(
271
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
272
  )
273
- progress(0.8, desc="Repomix completed.")
274
  if p.returncode != 0:
275
  err = (
276
  f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
277
  )
278
- return f"Error running Repomix:\n{err}", None
279
  if os.path.exists(out_path):
280
  with open(out_path, "r", encoding="utf-8") as f:
281
  return f.read(), out_path
282
- return "Error: Repomix did not produce an output file.", None
283
  except Exception as e:
284
- progress(1, desc="Error")
285
- return f"Error processing GitHub repository: {e}", None
286
 
287
 
288
  def scrape_and_convert_website(
@@ -291,7 +92,7 @@ def scrape_and_convert_website(
291
  progress: gr.Progress = gr.Progress(track_tqdm=True),
292
  ) -> Tuple[str, str]:
293
  """Recursively scrape a website and convert visited pages to Markdown."""
294
- progress(0, desc=f"🔍 Scraping {url}…")
295
  visited = set()
296
 
297
  def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
@@ -299,12 +100,12 @@ def scrape_and_convert_website(
299
  return ""
300
  visited.add(u)
301
  try:
302
- progress(i / n if n > 0 else 0, desc=f"🌐 Scraping: {u}")
303
  html = Scraper.fetch_html(u)
304
  except Exception as e:
305
- return f"Error fetching {u}: {e}\n"
306
  md = (
307
- f"## 📄 Extracted from: {u}\n\n"
308
  + Converter.html_to_markdown(
309
  html=html, base_url=u, parser_features="html.parser", ignore_links=True
310
  )
@@ -321,7 +122,7 @@ def scrape_and_convert_website(
321
  for j, nxt in enumerate(valid):
322
  md += rec(nxt, d - 1, len(valid), j)
323
  except Exception as e:
324
- md += f"Error extracting links from {u}: {e}\n"
325
  return md
326
 
327
  all_md = rec(url, depth)
@@ -391,28 +192,24 @@ def process_input_updated(
391
  UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
392
  then export as Markdown/JSON/CSV/Text/PDF.
393
  """
394
- progress(0, desc="🚀 Initializing…")
395
  out_path: Optional[str] = None
396
 
397
  if source_type == "GitHub Repository":
398
  if not check_repomix_installed():
399
- return (
400
- "❌ Repomix is not installed or not accessible.",
401
- "",
402
- None
403
- )
404
  raw, _ = run_repomix(url_or_id, progress=progress)
405
- if raw.startswith("Error"):
406
  return raw, "", None
407
  elif source_type == "Webpage":
408
  raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
409
- if raw.startswith("Error"):
410
  return raw, "", None
411
  else:
412
- return "Invalid source type selected.", "", None
413
 
414
  try:
415
- progress(0.9, desc=f"📄 Converting to {output_format_selection}…")
416
  out_path = save_output_to_file(raw, output_format_selection, url_or_id)
417
 
418
  preview = raw
@@ -432,15 +229,15 @@ def process_input_updated(
432
  from os.path import basename
433
 
434
  preview = (
435
- f"📄 PDF generated. Download to view: "
436
- f"{basename(out_path) if out_path else 'file.pdf'}"
437
  )
438
 
439
- progress(1, desc="Done!")
440
- return f"Successfully processed: {url_or_id}", preview, out_path
441
 
442
  except Exception as e:
443
- return f"Error during conversion: {e}", "", None
444
 
445
 
446
  # -----------------------------
@@ -475,7 +272,7 @@ class ProcessResult(BaseModel):
475
  description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
476
  )
477
  file_path: Optional[str] = Field(
478
- None, description="Temp file path for the artifact, or null if not created.",
479
  )
480
 
481
 
@@ -490,193 +287,91 @@ def process_input_mcp(args: ProcessArgs) -> ProcessResult:
490
 
491
 
492
  # -----------------------------
493
- # Gradio UI with Modern Design
494
  # -----------------------------
495
- with gr.Blocks(
496
- title="RAG-Ready Content Scraper",
497
- theme=gr.themes.Soft(),
498
- css=custom_css
499
- ) as ui_iface:
500
-
501
- # Header
502
- with gr.Column(elem_classes=["main-container"]):
503
- with gr.Column(elem_classes=["title-container"]):
504
- gr.HTML("""
505
- <h1>🚀 RAG-Ready Content Scraper</h1>
506
- <p>Transform web content and GitHub repositories into structured datasets for AI applications</p>
507
- <p style="font-size: 0.9rem; opacity: 0.8;">
508
- Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: white;">anycoder</a>
509
- </p>
510
- """)
511
-
512
- # Feature cards
513
- with gr.Row():
514
- with gr.Column(scale=1):
515
- with gr.Column(elem_classes=["feature-card"]):
516
- gr.HTML("""
517
- <h3>🌐 Web Scraping</h3>
518
- <p>Extract clean content from websites with recursive depth control</p>
519
- """)
520
- with gr.Column(scale=1):
521
- with gr.Column(elem_classes=["feature-card"]):
522
- gr.HTML("""
523
- <h3>📦 GitHub Processing</h3>
524
- <p>Process entire repositories using Repomix for AI-friendly output</p>
525
- """)
526
- with gr.Column(scale=1):
527
- with gr.Column(elem_classes=["feature-card"]):
528
- gr.HTML("""
529
- <h3>📄 Multiple Formats</h3>
530
- <p>Export as Markdown, JSON, CSV, Text, or PDF</p>
531
- """)
532
-
533
- # Main content area
534
- with gr.Row():
535
- # Input panel
536
- with gr.Column(scale=1, elem_classes=["input-panel"]):
537
- gr.HTML("<h2>⚙️ Configuration</h2>")
538
-
539
- url_input = gr.Textbox(
540
- label="🔗 URL or GitHub Repository",
541
- placeholder="https://example.com or owner/repo",
542
- lines=1,
543
- max_lines=1
544
- )
545
-
546
- source_type_input = gr.Radio(
547
- choices=["🌐 Webpage", "📦 GitHub Repository"],
548
- value="🌐 Webpage",
549
- label="📂 Source Type",
550
- interactive=True
551
- )
552
-
553
- with gr.Group(visible=True) as webpage_options:
554
- depth_input = gr.Slider(
555
- minimum=0,
556
- maximum=3,
557
- step=1,
558
- value=0,
559
- label="🔍 Scraping Depth",
560
- info="0 = main page only | 1-3 = follow internal links"
561
- )
562
-
563
- output_format_input = gr.Dropdown(
564
- choices=["📝 Markdown", "📄 JSON", "📊 CSV", "📃 Text", "📑 PDF"],
565
- value="📝 Markdown",
566
- label="💾 Output Format"
567
- )
568
-
569
- submit_button = gr.Button(
570
- "🚀 Process Content",
571
- variant="primary",
572
- size="lg"
573
- )
574
-
575
- # Examples section
576
- with gr.Accordion("📚 Quick Examples", open=False):
577
- gr.HTML("""
578
- <p style="margin-bottom: 15px;">Click any example to load the configuration:</p>
579
- """)
580
- example1 = gr.Button("📖 Gradio Docs (Depth 1)")
581
- example2 = gr.Button("📦 Gradio Repository")
582
- example3 = gr.Button("📄 Wikipedia RAG Article")
583
-
584
- # Example handlers
585
- example1.click(
586
- fn=lambda: ("https://gradio.app/docs/js", "🌐 Webpage", 1, "📝 Markdown"),
587
- outputs=[url_input, source_type_input, depth_input, output_format_input]
588
- )
589
- example2.click(
590
- fn=lambda: ("gradio-app/gradio", "📦 GitHub Repository", 0, "📃 Text"),
591
- outputs=[url_input, source_type_input, depth_input, output_format_input]
592
- )
593
- example3.click(
594
- fn=lambda: ("https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "🌐 Webpage", 0, "📄 JSON"),
595
- outputs=[url_input, source_type_input, depth_input, output_format_input]
596
- )
597
-
598
- # Output panel
599
- with gr.Column(scale=2, elem_classes=["output-panel"]):
600
- gr.HTML("<h2>📊 Results</h2>")
601
-
602
- with gr.Tabs():
603
- with gr.TabItem("📈 Status"):
604
- status_output = gr.HTML(
605
- value='<div class="info-box">Ready to process your content...</div>',
606
- label="Status"
607
- )
608
-
609
- with gr.TabItem("👁️ Preview"):
610
- preview_output = gr.Code(
611
- label="Content Preview",
612
- language="markdown",
613
- interactive=False,
614
- lines=15,
615
- max_lines=30
616
- )
617
-
618
- with gr.TabItem("⬇️ Download"):
619
- file_download_output = gr.File(
620
- label="Download Processed File",
621
- interactive=False
622
- )
623
-
624
- # Footer
625
- gr.HTML("""
626
- <div class="footer">
627
- <p>Powered by Gradio • Docker • Repomix • BeautifulSoup4</p>
628
- <p style="font-size: 0.8rem; opacity: 0.7;">
629
- MIT License •
630
- <a href="https://huggingface.co/spaces/CultriX/RAG-Scraper" target="_blank">Source Code</a>
631
- </p>
632
- </div>
633
- """)
634
-
635
- # Toggle depth slider based on source type
636
- def toggle_depth(source_type):
637
- if source_type == "🌐 Webpage":
638
- return gr.Group(visible=True)
639
- else:
640
- return gr.Group(visible=False)
641
-
642
- source_type_input.change(
643
- fn=toggle_depth,
644
- inputs=[source_type_input],
645
- outputs=[webpage_options]
646
  )
647
-
648
- # Main processing function
649
- def process_with_emoji_fix(url, source, depth, fmt):
650
- # Remove emojis from inputs for processing
651
- clean_source = source.replace("🌐 ", "").replace("📦 ", "")
652
- clean_fmt = fmt.replace("📝 ", "").replace("📄 ", "").replace("📊 ", "").replace("📃 ", "").replace("📑 ", "")
653
- return process_input_updated(url, clean_source, depth, clean_fmt)
654
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
655
  submit_button.click(
656
- fn=process_with_emoji_fix,
657
  inputs=[url_input, source_type_input, depth_input, output_format_input],
658
- outputs=[status_output, preview_output, file_download_output]
659
  )
660
 
661
  # -----------------------------
662
  # MCP-only Interface (Pydantic tool)
663
  # -----------------------------
 
 
664
  mcp_iface = gr.Interface(
665
  fn=process_input_mcp,
 
 
666
  inputs=gr.JSON(label="ProcessArgs (JSON)"),
667
  outputs=gr.JSON(label="ProcessResult (JSON)"),
668
  title="MCP Tool: process_input_mcp",
669
  description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
670
  allow_flagging="never",
671
- css=custom_css
672
  )
673
 
674
- # Combine the user UI and the MCP tool as two tabs
675
- app = gr.TabbedInterface(
676
- [ui_iface, mcp_iface],
677
- tab_names=["🚀 App", "🔧 MCP"],
678
- css=custom_css
679
- )
680
 
681
  if __name__ == "__main__":
 
682
  app.queue().launch(share=True, mcp_server=True)
 
12
  import gradio as gr
13
  import markdown_pdf
14
  from typing_extensions import Annotated, Doc
15
+
16
  from pydantic import BaseModel, Field, conint
17
 
18
  from rag_scraper.scraper import Scraper
 
26
  os.environ["HF_HOME"] = "/tmp/hf_cache"
27
  os.makedirs(os.environ["HF_HOME"], exist_ok=True)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # -----------------------------
31
  # Helper utilities
 
49
  progress: gr.Progress = gr.Progress(track_tqdm=True),
50
  ) -> Tuple[str, Optional[str]]:
51
  """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
52
+ progress(0, desc="Starting Repomix…")
53
  try:
54
  with tempfile.TemporaryDirectory() as td:
55
  out_path = os.path.join(td, "repomix-output.md")
 
71
  p = subprocess.run(
72
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
73
  )
74
+ progress(0.8, desc="Repomix done.")
75
  if p.returncode != 0:
76
  err = (
77
  f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
78
  )
79
+ return f"Error running Repomix:\n{err}", None
80
  if os.path.exists(out_path):
81
  with open(out_path, "r", encoding="utf-8") as f:
82
  return f.read(), out_path
83
+ return "Error: Repomix did not produce an output file.", None
84
  except Exception as e:
85
+ progress(1, desc="Error")
86
+ return f"Error processing GitHub repository: {e}", None
87
 
88
 
89
  def scrape_and_convert_website(
 
92
  progress: gr.Progress = gr.Progress(track_tqdm=True),
93
  ) -> Tuple[str, str]:
94
  """Recursively scrape a website and convert visited pages to Markdown."""
95
+ progress(0, desc=f"Scraping {url}…")
96
  visited = set()
97
 
98
  def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
 
100
  return ""
101
  visited.add(u)
102
  try:
103
+ progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
104
  html = Scraper.fetch_html(u)
105
  except Exception as e:
106
+ return f"Error fetching {u}: {e}\n"
107
  md = (
108
+ f"## Extracted from: {u}\n\n"
109
  + Converter.html_to_markdown(
110
  html=html, base_url=u, parser_features="html.parser", ignore_links=True
111
  )
 
122
  for j, nxt in enumerate(valid):
123
  md += rec(nxt, d - 1, len(valid), j)
124
  except Exception as e:
125
+ md += f"Error extracting links from {u}: {e}\n"
126
  return md
127
 
128
  all_md = rec(url, depth)
 
192
  UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
193
  then export as Markdown/JSON/CSV/Text/PDF.
194
  """
195
+ progress(0, desc="Initializing…")
196
  out_path: Optional[str] = None
197
 
198
  if source_type == "GitHub Repository":
199
  if not check_repomix_installed():
200
+ return "Repomix is not installed or not accessible.", "", None
 
 
 
 
201
  raw, _ = run_repomix(url_or_id, progress=progress)
202
+ if raw.startswith("Error"):
203
  return raw, "", None
204
  elif source_type == "Webpage":
205
  raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
206
+ if raw.startswith("Error"):
207
  return raw, "", None
208
  else:
209
+ return "Invalid source type selected.", "", None
210
 
211
  try:
212
+ progress(0.9, desc=f"Converting to {output_format_selection}…")
213
  out_path = save_output_to_file(raw, output_format_selection, url_or_id)
214
 
215
  preview = raw
 
229
  from os.path import basename
230
 
231
  preview = (
232
+ f"[PDF generated. Download to view: "
233
+ f"{basename(out_path) if out_path else 'file.pdf'}]"
234
  )
235
 
236
+ progress(1, desc="Done.")
237
+ return f"Successfully processed: {url_or_id}", preview, out_path
238
 
239
  except Exception as e:
240
+ return f"Error during conversion: {e}", "", None
241
 
242
 
243
  # -----------------------------
 
272
  description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
273
  )
274
  file_path: Optional[str] = Field(
275
+ None, description="Temp file path for the artifact, or null if not created."
276
  )
277
 
278
 
 
287
 
288
 
289
  # -----------------------------
290
+ # Gradio UI
291
  # -----------------------------
292
+ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
293
+ gr.Markdown("# RAG-Ready Content Scraper")
294
+ gr.Markdown(
295
+ "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  )
297
+
298
+ with gr.Row():
299
+ with gr.Column(scale=2):
300
+ url_input = gr.Textbox(
301
+ label="Enter URL or GitHub Repository ID",
302
+ placeholder="https://example.com or owner/repo",
303
+ )
304
+ source_type_input = gr.Radio(
305
+ choices=["Webpage", "GitHub Repository"],
306
+ value="Webpage",
307
+ label="Select Source Type",
308
+ )
309
+ depth_input = gr.Slider(
310
+ minimum=0,
311
+ maximum=3,
312
+ step=1,
313
+ value=0,
314
+ label="Scraping Depth (for Webpages)",
315
+ info="0 = only main page. Ignored for GitHub.",
316
+ )
317
+ output_format_input = gr.Dropdown(
318
+ choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
319
+ value="Markdown",
320
+ label="Select Output Format",
321
+ )
322
+ submit_button = gr.Button("Process Content", variant="primary")
323
+ with gr.Column(scale=3):
324
+ status_output = gr.Textbox(label="Status", interactive=False)
325
+ preview_output = gr.Code(
326
+ label="Preview Content", language="markdown", interactive=False
327
+ )
328
+ file_download_output = gr.File(
329
+ label="Download Processed File", interactive=False
330
+ )
331
+
332
+ gr.Examples(
333
+ examples=[
334
+ ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
335
+ ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
336
+ [
337
+ "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
338
+ "Webpage",
339
+ 0,
340
+ "JSON",
341
+ ],
342
+ ],
343
+ inputs=[url_input, source_type_input, depth_input, output_format_input],
344
+ outputs=[status_output, preview_output, file_download_output],
345
+ fn=process_input_updated,
346
+ cache_examples=False,
347
+ )
348
+
349
  submit_button.click(
350
+ fn=process_input_updated,
351
  inputs=[url_input, source_type_input, depth_input, output_format_input],
352
+ outputs=[status_output, preview_output, file_download_output],
353
  )
354
 
355
  # -----------------------------
356
  # MCP-only Interface (Pydantic tool)
357
  # -----------------------------
358
+ # We expose a second interface whose *function signature* uses Pydantic models.
359
+ # MCP reads this signature to build a JSON Schema with rich field descriptions.
360
  mcp_iface = gr.Interface(
361
  fn=process_input_mcp,
362
+ # Components are placeholders; MCP ignores them and reads the Python types.
363
+ # Keep them simple so the tab is usable if someone clicks it.
364
  inputs=gr.JSON(label="ProcessArgs (JSON)"),
365
  outputs=gr.JSON(label="ProcessResult (JSON)"),
366
  title="MCP Tool: process_input_mcp",
367
  description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
368
  allow_flagging="never",
 
369
  )
370
 
371
+ # Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
372
+ app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
373
+
 
 
 
374
 
375
  if __name__ == "__main__":
376
+ # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
377
  app.queue().launch(share=True, mcp_server=True)