CultriX committed on
Commit
97b889a
·
verified ·
1 Parent(s): aa93fa6

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +397 -92
  2. requirements.txt +70 -8
app.py CHANGED
@@ -12,7 +12,6 @@ from typing import Optional, Tuple, Literal
12
  import gradio as gr
13
  import markdown_pdf
14
  from typing_extensions import Annotated, Doc
15
-
16
  from pydantic import BaseModel, Field, conint
17
 
18
  from rag_scraper.scraper import Scraper
@@ -26,6 +25,206 @@ from rag_scraper.utils import URLUtils
26
  os.environ["HF_HOME"] = "/tmp/hf_cache"
27
  os.makedirs(os.environ["HF_HOME"], exist_ok=True)
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  # -----------------------------
31
  # Helper utilities
@@ -49,7 +248,7 @@ def run_repomix(
49
  progress: gr.Progress = gr.Progress(track_tqdm=True),
50
  ) -> Tuple[str, Optional[str]]:
51
  """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
52
- progress(0, desc="Starting Repomix…")
53
  try:
54
  with tempfile.TemporaryDirectory() as td:
55
  out_path = os.path.join(td, "repomix-output.md")
@@ -71,19 +270,19 @@ def run_repomix(
71
  p = subprocess.run(
72
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
73
  )
74
- progress(0.8, desc="Repomix done.")
75
  if p.returncode != 0:
76
  err = (
77
  f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
78
  )
79
- return f"Error running Repomix:\n{err}", None
80
  if os.path.exists(out_path):
81
  with open(out_path, "r", encoding="utf-8") as f:
82
  return f.read(), out_path
83
- return "Error: Repomix did not produce an output file.", None
84
  except Exception as e:
85
- progress(1, desc="Error")
86
- return f"Error processing GitHub repository: {e}", None
87
 
88
 
89
  def scrape_and_convert_website(
@@ -92,7 +291,7 @@ def scrape_and_convert_website(
92
  progress: gr.Progress = gr.Progress(track_tqdm=True),
93
  ) -> Tuple[str, str]:
94
  """Recursively scrape a website and convert visited pages to Markdown."""
95
- progress(0, desc=f"Scraping {url}…")
96
  visited = set()
97
 
98
  def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
@@ -100,12 +299,12 @@ def scrape_and_convert_website(
100
  return ""
101
  visited.add(u)
102
  try:
103
- progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
104
  html = Scraper.fetch_html(u)
105
  except Exception as e:
106
- return f"Error fetching {u}: {e}\n"
107
  md = (
108
- f"## Extracted from: {u}\n\n"
109
  + Converter.html_to_markdown(
110
  html=html, base_url=u, parser_features="html.parser", ignore_links=True
111
  )
@@ -122,7 +321,7 @@ def scrape_and_convert_website(
122
  for j, nxt in enumerate(valid):
123
  md += rec(nxt, d - 1, len(valid), j)
124
  except Exception as e:
125
- md += f"Error extracting links from {u}: {e}\n"
126
  return md
127
 
128
  all_md = rec(url, depth)
@@ -192,24 +391,28 @@ def process_input_updated(
192
  UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
193
  then export as Markdown/JSON/CSV/Text/PDF.
194
  """
195
- progress(0, desc="Initializing…")
196
  out_path: Optional[str] = None
197
 
198
  if source_type == "GitHub Repository":
199
  if not check_repomix_installed():
200
- return "Repomix is not installed or not accessible.", "", None
 
 
 
 
201
  raw, _ = run_repomix(url_or_id, progress=progress)
202
- if raw.startswith("Error"):
203
  return raw, "", None
204
  elif source_type == "Webpage":
205
  raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
206
- if raw.startswith("Error"):
207
  return raw, "", None
208
  else:
209
- return "Invalid source type selected.", "", None
210
 
211
  try:
212
- progress(0.9, desc=f"Converting to {output_format_selection}…")
213
  out_path = save_output_to_file(raw, output_format_selection, url_or_id)
214
 
215
  preview = raw
@@ -229,15 +432,15 @@ def process_input_updated(
229
  from os.path import basename
230
 
231
  preview = (
232
- f"[PDF generated. Download to view: "
233
- f"{basename(out_path) if out_path else 'file.pdf'}]"
234
  )
235
 
236
- progress(1, desc="Done.")
237
- return f"Successfully processed: {url_or_id}", preview, out_path
238
 
239
  except Exception as e:
240
- return f"Error during conversion: {e}", "", None
241
 
242
 
243
  # -----------------------------
@@ -272,7 +475,7 @@ class ProcessResult(BaseModel):
272
  description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
273
  )
274
  file_path: Optional[str] = Field(
275
- None, description="Temp file path for the artifact, or null if not created."
276
  )
277
 
278
 
@@ -287,91 +490,193 @@ def process_input_mcp(args: ProcessArgs) -> ProcessResult:
287
 
288
 
289
  # -----------------------------
290
- # Gradio UI
291
  # -----------------------------
292
- with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
293
- gr.Markdown("# RAG-Ready Content Scraper")
294
- gr.Markdown(
295
- "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
296
- )
297
-
298
- with gr.Row():
299
- with gr.Column(scale=2):
300
- url_input = gr.Textbox(
301
- label="Enter URL or GitHub Repository ID",
302
- placeholder="https://example.com or owner/repo",
303
- )
304
- source_type_input = gr.Radio(
305
- choices=["Webpage", "GitHub Repository"],
306
- value="Webpage",
307
- label="Select Source Type",
308
- )
309
- depth_input = gr.Slider(
310
- minimum=0,
311
- maximum=3,
312
- step=1,
313
- value=0,
314
- label="Scraping Depth (for Webpages)",
315
- info="0 = only main page. Ignored for GitHub.",
316
- )
317
- output_format_input = gr.Dropdown(
318
- choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
319
- value="Markdown",
320
- label="Select Output Format",
321
- )
322
- submit_button = gr.Button("Process Content", variant="primary")
323
- with gr.Column(scale=3):
324
- status_output = gr.Textbox(label="Status", interactive=False)
325
- preview_output = gr.Code(
326
- label="Preview Content", language="markdown", interactive=False
327
- )
328
- file_download_output = gr.File(
329
- label="Download Processed File", interactive=False
330
- )
331
-
332
- gr.Examples(
333
- examples=[
334
- ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
335
- ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
336
- [
337
- "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
338
- "Webpage",
339
- 0,
340
- "JSON",
341
- ],
342
- ],
343
- inputs=[url_input, source_type_input, depth_input, output_format_input],
344
- outputs=[status_output, preview_output, file_download_output],
345
- fn=process_input_updated,
346
- cache_examples=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
  )
348
-
 
 
 
 
 
 
 
349
  submit_button.click(
350
- fn=process_input_updated,
351
  inputs=[url_input, source_type_input, depth_input, output_format_input],
352
- outputs=[status_output, preview_output, file_download_output],
353
  )
354
 
355
  # -----------------------------
356
  # MCP-only Interface (Pydantic tool)
357
  # -----------------------------
358
- # We expose a second interface whose *function signature* uses Pydantic models.
359
- # MCP reads this signature to build a JSON Schema with rich field descriptions.
360
  mcp_iface = gr.Interface(
361
  fn=process_input_mcp,
362
- # Components are placeholders; MCP ignores them and reads the Python types.
363
- # Keep them simple so the tab is usable if someone clicks it.
364
  inputs=gr.JSON(label="ProcessArgs (JSON)"),
365
  outputs=gr.JSON(label="ProcessResult (JSON)"),
366
  title="MCP Tool: process_input_mcp",
367
  description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
368
  allow_flagging="never",
 
369
  )
370
 
371
- # Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
372
- app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
373
-
 
 
 
374
 
375
  if __name__ == "__main__":
376
- # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
377
- app.queue().launch(share=True, mcp_server=True)
 
12
  import gradio as gr
13
  import markdown_pdf
14
  from typing_extensions import Annotated, Doc
 
15
  from pydantic import BaseModel, Field, conint
16
 
17
  from rag_scraper.scraper import Scraper
 
25
  os.environ["HF_HOME"] = "/tmp/hf_cache"
26
  os.makedirs(os.environ["HF_HOME"], exist_ok=True)
27
 
28
+ # -----------------------------
29
+ # Custom CSS for modern UI
30
+ # -----------------------------
31
+ custom_css = """
32
+ .gradio-container {
33
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
34
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
35
+ min-height: 100vh;
36
+ }
37
+
38
+ .main-container {
39
+ background: rgba(255, 255, 255, 0.95) !important;
40
+ border-radius: 20px !important;
41
+ box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3) !important;
42
+ margin: 20px auto !important;
43
+ max-width: 1400px !important;
44
+ padding: 30px !important;
45
+ backdrop-filter: blur(10px) !important;
46
+ }
47
+
48
+ .title-container {
49
+ text-align: center;
50
+ margin-bottom: 30px;
51
+ padding: 20px;
52
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
53
+ border-radius: 15px;
54
+ color: white;
55
+ box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4);
56
+ }
57
+
58
+ .title-container h1 {
59
+ font-size: 2.5rem !important;
60
+ font-weight: 700 !important;
61
+ margin: 0 !important;
62
+ text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
63
+ }
64
+
65
+ .title-container p {
66
+ font-size: 1.1rem !important;
67
+ margin: 10px 0 0 0 !important;
68
+ opacity: 0.95;
69
+ }
70
+
71
+ .input-panel {
72
+ background: white !important;
73
+ border-radius: 15px !important;
74
+ padding: 25px !important;
75
+ box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
76
+ border: 1px solid rgba(102, 126, 234, 0.1) !important;
77
+ }
78
+
79
+ .output-panel {
80
+ background: white !important;
81
+ border-radius: 15px !important;
82
+ padding: 25px !important;
83
+ box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
84
+ border: 1px solid rgba(102, 126, 234, 0.1) !important;
85
+ }
86
+
87
+ .gradio-button {
88
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
89
+ border: none !important;
90
+ border-radius: 10px !important;
91
+ color: white !important;
92
+ font-weight: 600 !important;
93
+ padding: 12px 30px !important;
94
+ transition: all 0.3s ease !important;
95
+ box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
96
+ }
97
+
98
+ .gradio-button:hover {
99
+ transform: translateY(-2px) !important;
100
+ box-shadow: 0 6px 20px rgba(102, 126, 234, 0.4) !important;
101
+ }
102
+
103
+ .gradio-textbox, .gradio-dropdown, .gradio-slider, .gradio-radio {
104
+ border-radius: 10px !important;
105
+ border: 2px solid #e5e7eb !important;
106
+ transition: all 0.3s ease !important;
107
+ }
108
+
109
+ .gradio-textbox:focus, .gradio-dropdown:focus, .gradio-slider:focus {
110
+ border-color: #667eea !important;
111
+ box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
112
+ }
113
+
114
+ .gradio-radio label {
115
+ padding: 8px 16px !important;
116
+ border-radius: 8px !important;
117
+ margin: 4px !important;
118
+ transition: all 0.3s ease !important;
119
+ }
120
+
121
+ .gradio-radio label:hover {
122
+ background: rgba(102, 126, 234, 0.1) !important;
123
+ }
124
+
125
+ .gradio-code {
126
+ border-radius: 10px !important;
127
+ font-family: 'Fira Code', monospace !important;
128
+ box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.06) !important;
129
+ }
130
+
131
+ .gradio-file {
132
+ border-radius: 10px !important;
133
+ border: 2px dashed #667eea !important;
134
+ background: rgba(102, 126, 234, 0.05) !important;
135
+ }
136
+
137
+ .progress-bar {
138
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
139
+ border-radius: 10px !important;
140
+ }
141
+
142
+ .examples-container {
143
+ background: rgba(102, 126, 234, 0.05) !important;
144
+ border-radius: 15px !important;
145
+ padding: 20px !important;
146
+ margin-top: 20px !important;
147
+ border: 1px solid rgba(102, 126, 234, 0.2) !important;
148
+ }
149
+
150
+ .status-box {
151
+ background: linear-gradient(135deg, #10b981 0%, #059669 100%) !important;
152
+ color: white !important;
153
+ border-radius: 10px !important;
154
+ padding: 15px !important;
155
+ font-weight: 500 !important;
156
+ }
157
+
158
+ .error-box {
159
+ background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important;
160
+ color: white !important;
161
+ border-radius: 10px !important;
162
+ padding: 15px !important;
163
+ font-weight: 500 !important;
164
+ }
165
+
166
+ .info-box {
167
+ background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
168
+ color: white !important;
169
+ border-radius: 10px !important;
170
+ padding: 15px !important;
171
+ font-weight: 500 !important;
172
+ }
173
+
174
+ .feature-card {
175
+ background: white !important;
176
+ border-radius: 12px !important;
177
+ padding: 20px !important;
178
+ margin: 10px 0 !important;
179
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08) !important;
180
+ border-left: 4px solid #667eea !important;
181
+ transition: transform 0.3s ease !important;
182
+ }
183
+
184
+ .feature-card:hover {
185
+ transform: translateX(5px) !important;
186
+ }
187
+
188
+ .tab-nav {
189
+ background: rgba(102, 126, 234, 0.1) !important;
190
+ border-radius: 10px !important;
191
+ padding: 5px !important;
192
+ }
193
+
194
+ .tab-nav button {
195
+ border-radius: 8px !important;
196
+ margin: 2px !important;
197
+ transition: all 0.3s ease !important;
198
+ }
199
+
200
+ .tab-nav button:hover {
201
+ background: rgba(102, 126, 234, 0.2) !important;
202
+ }
203
+
204
+ .footer {
205
+ text-align: center;
206
+ margin-top: 30px;
207
+ padding: 20px;
208
+ color: white;
209
+ font-size: 0.9rem;
210
+ }
211
+
212
+ .footer a {
213
+ color: white;
214
+ text-decoration: underline;
215
+ }
216
+
217
+ /* Animation for loading */
218
+ @keyframes pulse {
219
+ 0% { opacity: 1; }
220
+ 50% { opacity: 0.5; }
221
+ 100% { opacity: 1; }
222
+ }
223
+
224
+ .loading {
225
+ animation: pulse 1.5s ease-in-out infinite;
226
+ }
227
+ """
228
 
229
  # -----------------------------
230
  # Helper utilities
 
248
  progress: gr.Progress = gr.Progress(track_tqdm=True),
249
  ) -> Tuple[str, Optional[str]]:
250
  """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
251
+ progress(0, desc="πŸš€ Starting Repomix…")
252
  try:
253
  with tempfile.TemporaryDirectory() as td:
254
  out_path = os.path.join(td, "repomix-output.md")
 
270
  p = subprocess.run(
271
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
272
  )
273
+ progress(0.8, desc="βœ… Repomix completed.")
274
  if p.returncode != 0:
275
  err = (
276
  f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
277
  )
278
+ return f"❌ Error running Repomix:\n{err}", None
279
  if os.path.exists(out_path):
280
  with open(out_path, "r", encoding="utf-8") as f:
281
  return f.read(), out_path
282
+ return "❌ Error: Repomix did not produce an output file.", None
283
  except Exception as e:
284
+ progress(1, desc="❌ Error")
285
+ return f"❌ Error processing GitHub repository: {e}", None
286
 
287
 
288
  def scrape_and_convert_website(
 
291
  progress: gr.Progress = gr.Progress(track_tqdm=True),
292
  ) -> Tuple[str, str]:
293
  """Recursively scrape a website and convert visited pages to Markdown."""
294
+ progress(0, desc=f"πŸ” Scraping {url}…")
295
  visited = set()
296
 
297
  def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
 
299
  return ""
300
  visited.add(u)
301
  try:
302
+ progress(i / n if n > 0 else 0, desc=f"🌐 Scraping: {u}")
303
  html = Scraper.fetch_html(u)
304
  except Exception as e:
305
+ return f"❌ Error fetching {u}: {e}\n"
306
  md = (
307
+ f"## πŸ“„ Extracted from: {u}\n\n"
308
  + Converter.html_to_markdown(
309
  html=html, base_url=u, parser_features="html.parser", ignore_links=True
310
  )
 
321
  for j, nxt in enumerate(valid):
322
  md += rec(nxt, d - 1, len(valid), j)
323
  except Exception as e:
324
+ md += f"❌ Error extracting links from {u}: {e}\n"
325
  return md
326
 
327
  all_md = rec(url, depth)
 
391
  UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
392
  then export as Markdown/JSON/CSV/Text/PDF.
393
  """
394
+ progress(0, desc="πŸš€ Initializing…")
395
  out_path: Optional[str] = None
396
 
397
  if source_type == "GitHub Repository":
398
  if not check_repomix_installed():
399
+ return (
400
+ "❌ Repomix is not installed or not accessible.",
401
+ "",
402
+ None
403
+ )
404
  raw, _ = run_repomix(url_or_id, progress=progress)
405
+ if raw.startswith("❌ Error"):
406
  return raw, "", None
407
  elif source_type == "Webpage":
408
  raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
409
+ if raw.startswith("❌ Error"):
410
  return raw, "", None
411
  else:
412
+ return "❌ Invalid source type selected.", "", None
413
 
414
  try:
415
+ progress(0.9, desc=f"πŸ“„ Converting to {output_format_selection}…")
416
  out_path = save_output_to_file(raw, output_format_selection, url_or_id)
417
 
418
  preview = raw
 
432
  from os.path import basename
433
 
434
  preview = (
435
+ f"πŸ“„ PDF generated. Download to view: "
436
+ f"{basename(out_path) if out_path else 'file.pdf'}"
437
  )
438
 
439
+ progress(1, desc="βœ… Done!")
440
+ return f"βœ… Successfully processed: {url_or_id}", preview, out_path
441
 
442
  except Exception as e:
443
+ return f"❌ Error during conversion: {e}", "", None
444
 
445
 
446
  # -----------------------------
 
475
  description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
476
  )
477
  file_path: Optional[str] = Field(
478
+ None, description="Temp file path for the artifact, or null if not created.",
479
  )
480
 
481
 
 
490
 
491
 
492
  # -----------------------------
493
+ # Gradio UI with Modern Design
494
  # -----------------------------
495
+ with gr.Blocks(
496
+ title="RAG-Ready Content Scraper",
497
+ theme=gr.themes.Soft(),
498
+ css=custom_css
499
+ ) as ui_iface:
500
+
501
+ # Header
502
+ with gr.Column(elem_classes=["main-container"]):
503
+ with gr.Column(elem_classes=["title-container"]):
504
+ gr.HTML("""
505
+ <h1>πŸš€ RAG-Ready Content Scraper</h1>
506
+ <p>Transform web content and GitHub repositories into structured datasets for AI applications</p>
507
+ <p style="font-size: 0.9rem; opacity: 0.8;">
508
+ Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: white;">anycoder</a>
509
+ </p>
510
+ """)
511
+
512
+ # Feature cards
513
+ with gr.Row():
514
+ with gr.Column(scale=1):
515
+ with gr.Column(elem_classes=["feature-card"]):
516
+ gr.HTML("""
517
+ <h3>🌐 Web Scraping</h3>
518
+ <p>Extract clean content from websites with recursive depth control</p>
519
+ """)
520
+ with gr.Column(scale=1):
521
+ with gr.Column(elem_classes=["feature-card"]):
522
+ gr.HTML("""
523
+ <h3>πŸ“¦ GitHub Processing</h3>
524
+ <p>Process entire repositories using Repomix for AI-friendly output</p>
525
+ """)
526
+ with gr.Column(scale=1):
527
+ with gr.Column(elem_classes=["feature-card"]):
528
+ gr.HTML("""
529
+ <h3>πŸ“„ Multiple Formats</h3>
530
+ <p>Export as Markdown, JSON, CSV, Text, or PDF</p>
531
+ """)
532
+
533
+ # Main content area
534
+ with gr.Row():
535
+ # Input panel
536
+ with gr.Column(scale=1, elem_classes=["input-panel"]):
537
+ gr.HTML("<h2>βš™οΈ Configuration</h2>")
538
+
539
+ url_input = gr.Textbox(
540
+ label="πŸ”— URL or GitHub Repository",
541
+ placeholder="https://example.com or owner/repo",
542
+ lines=1,
543
+ max_lines=1
544
+ )
545
+
546
+ source_type_input = gr.Radio(
547
+ choices=["🌐 Webpage", "πŸ“¦ GitHub Repository"],
548
+ value="🌐 Webpage",
549
+ label="πŸ“‚ Source Type",
550
+ interactive=True
551
+ )
552
+
553
+ with gr.Group(visible=True) as webpage_options:
554
+ depth_input = gr.Slider(
555
+ minimum=0,
556
+ maximum=3,
557
+ step=1,
558
+ value=0,
559
+ label="πŸ” Scraping Depth",
560
+ info="0 = main page only | 1-3 = follow internal links"
561
+ )
562
+
563
+ output_format_input = gr.Dropdown(
564
+ choices=["πŸ“ Markdown", "πŸ“„ JSON", "πŸ“Š CSV", "πŸ“ƒ Text", "πŸ“‘ PDF"],
565
+ value="πŸ“ Markdown",
566
+ label="πŸ’Ύ Output Format"
567
+ )
568
+
569
+ submit_button = gr.Button(
570
+ "πŸš€ Process Content",
571
+ variant="primary",
572
+ size="lg"
573
+ )
574
+
575
+ # Examples section
576
+ with gr.Accordion("πŸ“š Quick Examples", open=False):
577
+ gr.HTML("""
578
+ <p style="margin-bottom: 15px;">Click any example to load the configuration:</p>
579
+ """)
580
+ example1 = gr.Button("πŸ“– Gradio Docs (Depth 1)")
581
+ example2 = gr.Button("πŸ“¦ Gradio Repository")
582
+ example3 = gr.Button("πŸ“„ Wikipedia RAG Article")
583
+
584
+ # Example handlers
585
+ example1.click(
586
+ fn=lambda: ("https://gradio.app/docs/js", "🌐 Webpage", 1, "πŸ“ Markdown"),
587
+ outputs=[url_input, source_type_input, depth_input, output_format_input]
588
+ )
589
+ example2.click(
590
+ fn=lambda: ("gradio-app/gradio", "πŸ“¦ GitHub Repository", 0, "πŸ“ƒ Text"),
591
+ outputs=[url_input, source_type_input, depth_input, output_format_input]
592
+ )
593
+ example3.click(
594
+ fn=lambda: ("https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "🌐 Webpage", 0, "πŸ“„ JSON"),
595
+ outputs=[url_input, source_type_input, depth_input, output_format_input]
596
+ )
597
+
598
+ # Output panel
599
+ with gr.Column(scale=2, elem_classes=["output-panel"]):
600
+ gr.HTML("<h2>πŸ“Š Results</h2>")
601
+
602
+ with gr.Tabs():
603
+ with gr.TabItem("πŸ“ˆ Status"):
604
+ status_output = gr.HTML(
605
+ value='<div class="info-box">Ready to process your content...</div>',
606
+ label="Status"
607
+ )
608
+
609
+ with gr.TabItem("πŸ‘οΈ Preview"):
610
+ preview_output = gr.Code(
611
+ label="Content Preview",
612
+ language="markdown",
613
+ interactive=False,
614
+ lines=15,
615
+ max_lines=30
616
+ )
617
+
618
+ with gr.TabItem("⬇️ Download"):
619
+ file_download_output = gr.File(
620
+ label="Download Processed File",
621
+ interactive=False
622
+ )
623
+
624
+ # Footer
625
+ gr.HTML("""
626
+ <div class="footer">
627
+ <p>Powered by Gradio β€’ Docker β€’ Repomix β€’ BeautifulSoup4</p>
628
+ <p style="font-size: 0.8rem; opacity: 0.7;">
629
+ MIT License β€’
630
+ <a href="https://huggingface.co/spaces/CultriX/RAG-Scraper" target="_blank">Source Code</a>
631
+ </p>
632
+ </div>
633
+ """)
634
+
635
+ # Toggle depth slider based on source type
636
+ def toggle_depth(source_type):
637
+ if source_type == "🌐 Webpage":
638
+ return gr.Group(visible=True)
639
+ else:
640
+ return gr.Group(visible=False)
641
+
642
+ source_type_input.change(
643
+ fn=toggle_depth,
644
+ inputs=[source_type_input],
645
+ outputs=[webpage_options]
646
  )
647
+
648
+ # Main processing function
649
+ def process_with_emoji_fix(url, source, depth, fmt):
650
+ # Remove emojis from inputs for processing
651
+ clean_source = source.replace("🌐 ", "").replace("πŸ“¦ ", "")
652
+ clean_fmt = fmt.replace("πŸ“ ", "").replace("πŸ“„ ", "").replace("πŸ“Š ", "").replace("πŸ“ƒ ", "").replace("πŸ“‘ ", "")
653
+ return process_input_updated(url, clean_source, depth, clean_fmt)
654
+
655
  submit_button.click(
656
+ fn=process_with_emoji_fix,
657
  inputs=[url_input, source_type_input, depth_input, output_format_input],
658
+ outputs=[status_output, preview_output, file_download_output]
659
  )
660
 
661
  # -----------------------------
662
  # MCP-only Interface (Pydantic tool)
663
  # -----------------------------
 
 
664
  mcp_iface = gr.Interface(
665
  fn=process_input_mcp,
 
 
666
  inputs=gr.JSON(label="ProcessArgs (JSON)"),
667
  outputs=gr.JSON(label="ProcessResult (JSON)"),
668
  title="MCP Tool: process_input_mcp",
669
  description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
670
  allow_flagging="never",
671
+ css=custom_css
672
  )
673
 
674
+ # Combine the user UI and the MCP tool as two tabs
675
+ app = gr.TabbedInterface(
676
+ [ui_iface, mcp_iface],
677
+ tab_names=["πŸš€ App", "πŸ”§ MCP"],
678
+ css=custom_css
679
+ )
680
 
681
  if __name__ == "__main__":
682
+ app.queue().launch(share=True, mcp_server=True)
 
requirements.txt CHANGED
@@ -1,9 +1,71 @@
1
- html2text
2
- gradio[mcp]
3
- requests>=2.31.0
4
- beautifulsoup4>=4.12.3
5
- lxml>=4.9.3
6
- markdown>=3.5.2
7
  markdown-pdf
8
- pydantic>=2
9
- typing_extensions
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ rag-scraper
2
+ pydantic
 
 
 
 
3
  markdown-pdf
4
+ typing-extensions
5
+ gradio>=6.0
6
+ requests
7
+ Pillow
8
+ markdown
9
+ beautifulsoup4
10
+ lxml
11
+ aiohttp
12
+ fake-useragent
13
+ urllib3
14
+ html5lib
15
+ chardet
16
+ tqdm
17
+ python-dateutil
18
+ pytz
19
+ click
20
+ nltk
21
+ spacy
22
+ scrapy
23
+ selenium
24
+ webdriver-manager
25
+ pandas
26
+ numpy
27
+ openpyxl
28
+ PyPDF2
29
+ python-docx
30
+ python-pptx
31
+ reportlab
32
+ pdfkit
33
+ weasyprint
34
+ cssutils
35
+ tinycss2
36
+ cchardet
37
+ idna
38
+ certifi
39
+ charset-normalizer
40
+ httpx
41
+ httpcore
42
+ anyio
43
+ sniffio
44
+ pysocks
45
+ win-inet-pton
46
+ deprecation
47
+ docstring-parser
48
+ rich
49
+ typer
50
+ pyyaml
51
+ toml
52
+ tomli
53
+ packaging
54
+ filelock
55
+ huggingface-hub
56
+ safetensors
57
+ regex
58
+ tokenizers
59
+ sentencepiece
60
+ accelerate
61
+ torch
62
+ torchvision
63
+ torchaudio
64
+ transformers
65
+ diffusers
66
+ datasets
67
+ evaluate
68
+ scipy
69
+ scikit-learn
70
+ joblib
71
+ threadpoolctl