Spaces: CultriX/RAG-Scraper

Commit: Upload folder using huggingface_hub

Files changed:
- app.py (+397, -92)
- requirements.txt (+70, -8)

app.py
CHANGED
@@ -12,7 +12,6 @@ from typing import Optional, Tuple, Literal
 import gradio as gr
 import markdown_pdf
 from typing_extensions import Annotated, Doc
-
 from pydantic import BaseModel, Field, conint
 
 from rag_scraper.scraper import Scraper
@@ -26,6 +25,206 @@ from rag_scraper.utils import URLUtils
 os.environ["HF_HOME"] = "/tmp/hf_cache"
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 
+# -----------------------------
+# Custom CSS for modern UI
+# -----------------------------
+custom_css = """
+.gradio-container {
+    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif !important;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    min-height: 100vh;
+}
+
+.main-container {
+    background: rgba(255, 255, 255, 0.95) !important;
+    border-radius: 20px !important;
+    box-shadow: 0 20px 60px rgba(0, 0, 0, 0.3) !important;
+    margin: 20px auto !important;
+    max-width: 1400px !important;
+    padding: 30px !important;
+    backdrop-filter: blur(10px) !important;
+}
+
+.title-container {
+    text-align: center;
+    margin-bottom: 30px;
+    padding: 20px;
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    border-radius: 15px;
+    color: white;
+    box-shadow: 0 10px 30px rgba(102, 126, 234, 0.4);
+}
+
+.title-container h1 {
+    font-size: 2.5rem !important;
+    font-weight: 700 !important;
+    margin: 0 !important;
+    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.2);
+}
+
+.title-container p {
+    font-size: 1.1rem !important;
+    margin: 10px 0 0 0 !important;
+    opacity: 0.95;
+}
+
+.input-panel {
+    background: white !important;
+    border-radius: 15px !important;
+    padding: 25px !important;
+    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
+    border: 1px solid rgba(102, 126, 234, 0.1) !important;
+}
+
+.output-panel {
+    background: white !important;
+    border-radius: 15px !important;
+    padding: 25px !important;
+    box-shadow: 0 4px 20px rgba(0, 0, 0, 0.08) !important;
+    border: 1px solid rgba(102, 126, 234, 0.1) !important;
+}
+
+.gradio-button {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
+    border: none !important;
+    border-radius: 10px !important;
+    color: white !important;
+    font-weight: 600 !important;
+    padding: 12px 30px !important;
+    transition: all 0.3s ease !important;
+    box-shadow: 0 4px 15px rgba(102, 126, 234, 0.3) !important;
+}
+
+.gradio-button:hover {
+    transform: translateY(-2px) !important;
+    box-shadow: 0 6px 20px rgba(102, 126, 234, 0.4) !important;
+}
+
+.gradio-textbox, .gradio-dropdown, .gradio-slider, .gradio-radio {
+    border-radius: 10px !important;
+    border: 2px solid #e5e7eb !important;
+    transition: all 0.3s ease !important;
+}
+
+.gradio-textbox:focus, .gradio-dropdown:focus, .gradio-slider:focus {
+    border-color: #667eea !important;
+    box-shadow: 0 0 0 3px rgba(102, 126, 234, 0.1) !important;
+}
+
+.gradio-radio label {
+    padding: 8px 16px !important;
+    border-radius: 8px !important;
+    margin: 4px !important;
+    transition: all 0.3s ease !important;
+}
+
+.gradio-radio label:hover {
+    background: rgba(102, 126, 234, 0.1) !important;
+}
+
+.gradio-code {
+    border-radius: 10px !important;
+    font-family: 'Fira Code', monospace !important;
+    box-shadow: inset 0 2px 4px rgba(0, 0, 0, 0.06) !important;
+}
+
+.gradio-file {
+    border-radius: 10px !important;
+    border: 2px dashed #667eea !important;
+    background: rgba(102, 126, 234, 0.05) !important;
+}
+
+.progress-bar {
+    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
+    border-radius: 10px !important;
+}
+
+.examples-container {
+    background: rgba(102, 126, 234, 0.05) !important;
+    border-radius: 15px !important;
+    padding: 20px !important;
+    margin-top: 20px !important;
+    border: 1px solid rgba(102, 126, 234, 0.2) !important;
+}
+
+.status-box {
+    background: linear-gradient(135deg, #10b981 0%, #059669 100%) !important;
+    color: white !important;
+    border-radius: 10px !important;
+    padding: 15px !important;
+    font-weight: 500 !important;
+}
+
+.error-box {
+    background: linear-gradient(135deg, #ef4444 0%, #dc2626 100%) !important;
+    color: white !important;
+    border-radius: 10px !important;
+    padding: 15px !important;
+    font-weight: 500 !important;
+}
+
+.info-box {
+    background: linear-gradient(135deg, #3b82f6 0%, #2563eb 100%) !important;
+    color: white !important;
+    border-radius: 10px !important;
+    padding: 15px !important;
+    font-weight: 500 !important;
+}
+
+.feature-card {
+    background: white !important;
+    border-radius: 12px !important;
+    padding: 20px !important;
+    margin: 10px 0 !important;
+    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.08) !important;
+    border-left: 4px solid #667eea !important;
+    transition: transform 0.3s ease !important;
+}
+
+.feature-card:hover {
+    transform: translateX(5px) !important;
+}
+
+.tab-nav {
+    background: rgba(102, 126, 234, 0.1) !important;
+    border-radius: 10px !important;
+    padding: 5px !important;
+}
+
+.tab-nav button {
+    border-radius: 8px !important;
+    margin: 2px !important;
+    transition: all 0.3s ease !important;
+}
+
+.tab-nav button:hover {
+    background: rgba(102, 126, 234, 0.2) !important;
+}
+
+.footer {
+    text-align: center;
+    margin-top: 30px;
+    padding: 20px;
+    color: white;
+    font-size: 0.9rem;
+}
+
+.footer a {
+    color: white;
+    text-decoration: underline;
+}
+
+/* Animation for loading */
+@keyframes pulse {
+    0% { opacity: 1; }
+    50% { opacity: 0.5; }
+    100% { opacity: 1; }
+}
+
+.loading {
+    animation: pulse 1.5s ease-in-out infinite;
+}
+"""
 
 # -----------------------------
 # Helper utilities
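The classes above only take effect where components opt into them. Not part of the commit: a minimal sketch of how a stylesheet like `custom_css` attaches to a Gradio app via the `css` argument and `elem_classes` (the class name and components here are illustrative).

```python
# Minimal sketch: pairing a custom stylesheet with elem_classes in Gradio.
import gradio as gr

demo_css = """
.status-box { background: #10b981; color: white; border-radius: 10px; padding: 15px; }
"""

with gr.Blocks(css=demo_css) as demo:
    # The class in elem_classes (or in raw HTML) must match a selector in the stylesheet.
    gr.HTML('<div class="status-box">Ready to process…</div>')
    gr.Button("Run", elem_classes=["status-box"])

if __name__ == "__main__":
    demo.launch()
```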
@@ -49,7 +248,7 @@ def run_repomix(
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, Optional[str]]:
     """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
-    progress(0, desc="Starting Repomix…")
+    progress(0, desc="🚀 Starting Repomix…")
     try:
         with tempfile.TemporaryDirectory() as td:
             out_path = os.path.join(td, "repomix-output.md")
@@ -71,19 +270,19 @@ def run_repomix(
             p = subprocess.run(
                 cmd, capture_output=True, text=True, check=False, encoding="utf-8"
             )
-            progress(0.8, desc="Repomix completed.")
+            progress(0.8, desc="✅ Repomix completed.")
             if p.returncode != 0:
                 err = (
                     f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                 )
-                return f"Error running Repomix:\n{err}", None
+                return f"❌ Error running Repomix:\n{err}", None
             if os.path.exists(out_path):
                 with open(out_path, "r", encoding="utf-8") as f:
                     return f.read(), out_path
-            return "Error: Repomix did not produce an output file.", None
+            return "❌ Error: Repomix did not produce an output file.", None
     except Exception as e:
-        progress(1, desc="Error")
-        return f"Error processing GitHub repository: {e}", None
+        progress(1, desc="❌ Error")
+        return f"❌ Error processing GitHub repository: {e}", None
 
 
 def scrape_and_convert_website(
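The construction of `cmd` falls outside the hunks shown here. As a rough, self-contained sketch of the pattern the function follows (the exact invocation is an assumption; Repomix's CLI documents `--remote` and `-o`):

```python
# Sketch only: the commit does not show how `cmd` is built.
# Assumes the Node-based Repomix CLI and its --remote / -o flags.
import os
import subprocess
import tempfile

def run_repomix_sketch(repo: str) -> str:
    """Pack a remote GitHub repo into one Markdown file and return its text."""
    with tempfile.TemporaryDirectory() as td:
        out_path = os.path.join(td, "repomix-output.md")
        cmd = ["npx", "repomix", "--remote", repo, "-o", out_path]  # assumed flags
        p = subprocess.run(
            cmd, capture_output=True, text=True, check=False, encoding="utf-8"
        )
        if p.returncode != 0 or not os.path.exists(out_path):
            raise RuntimeError(f"Repomix failed (rc={p.returncode}): {p.stderr}")
        with open(out_path, "r", encoding="utf-8") as f:
            return f.read()
```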
@@ -92,7 +291,7 @@ def scrape_and_convert_website(
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str]:
     """Recursively scrape a website and convert visited pages to Markdown."""
-    progress(0, desc=f"Scraping {url}…")
+    progress(0, desc=f"🌐 Scraping {url}…")
     visited = set()
 
     def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
@@ -100,12 +299,12 @@ def scrape_and_convert_website(
             return ""
         visited.add(u)
         try:
-            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
+            progress(i / n if n > 0 else 0, desc=f"🌐 Scraping: {u}")
             html = Scraper.fetch_html(u)
         except Exception as e:
-            return f"Error fetching {u}: {e}\n"
+            return f"❌ Error fetching {u}: {e}\n"
         md = (
-            f"## Extracted from: {u}\n\n"
+            f"## 📄 Extracted from: {u}\n\n"
             + Converter.html_to_markdown(
                 html=html, base_url=u, parser_features="html.parser", ignore_links=True
             )
@@ -122,7 +321,7 @@ def scrape_and_convert_website(
             for j, nxt in enumerate(valid):
                 md += rec(nxt, d - 1, len(valid), j)
         except Exception as e:
-            md += f"Error extracting links from {u}: {e}\n"
+            md += f"❌ Error extracting links from {u}: {e}\n"
         return md
 
     all_md = rec(url, depth)
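For readers without the `rag_scraper` package, here is a self-contained sketch of the same depth-limited crawl pattern (a shared `visited` set, same-host recursion); names here are illustrative, not the Space's code:

```python
# Depth-limited recursive crawl with a visited set, using requests + BeautifulSoup.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def crawl(url: str, depth: int, visited=None) -> str:
    visited = set() if visited is None else visited
    if depth < 0 or url in visited:
        return ""
    visited.add(url)
    try:
        html = requests.get(url, timeout=10).text
    except Exception as e:
        return f"Error fetching {url}: {e}\n"
    soup = BeautifulSoup(html, "html.parser")
    md = f"## Extracted from: {url}\n\n{soup.get_text(' ', strip=True)}\n\n"
    host = urlparse(url).netloc
    for a in soup.find_all("a", href=True):
        nxt = urljoin(url, a["href"])
        if urlparse(nxt).netloc == host:  # stay on the same site
            md += crawl(nxt, depth - 1, visited)
    return md
```

With `depth=0` only the main page is fetched, matching the UI slider's "0 = main page only" semantics.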
@@ -192,24 +391,28 @@ def process_input_updated(
     UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
     then export as Markdown/JSON/CSV/Text/PDF.
     """
-    progress(0, desc="Initializing…")
+    progress(0, desc="🚀 Initializing…")
     out_path: Optional[str] = None
 
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            return "Repomix is not installed or not accessible.", "", None
+            return (
+                "❌ Repomix is not installed or not accessible.",
+                "",
+                None
+            )
         raw, _ = run_repomix(url_or_id, progress=progress)
-        if raw.startswith("Error"):
+        if raw.startswith("❌ Error"):
             return raw, "", None
     elif source_type == "Webpage":
         raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw.startswith("Error"):
+        if raw.startswith("❌ Error"):
             return raw, "", None
     else:
-        return "Invalid source type selected.", "", None
+        return "❌ Invalid source type selected.", "", None
 
     try:
-        progress(0.9, desc=f"Converting to {output_format_selection}…")
+        progress(0.9, desc=f"📄 Converting to {output_format_selection}…")
         out_path = save_output_to_file(raw, output_format_selection, url_or_id)
 
         preview = raw
@@ -229,15 +432,15 @@ def process_input_updated(
             from os.path import basename
 
             preview = (
-                f"PDF generated. Download to view: "
-                f"{basename(out_path) if out_path else 'file.pdf'}"
+                f"📄 PDF generated. Download to view: "
+                f"{basename(out_path) if out_path else 'file.pdf'}"
             )
 
-        progress(1, desc="Done")
-        return f"Successfully processed: {url_or_id}", preview, out_path
+        progress(1, desc="✅ Done!")
+        return f"✅ Successfully processed: {url_or_id}", preview, out_path
 
     except Exception as e:
-        return f"Error during conversion: {e}", "", None
+        return f"❌ Error during conversion: {e}", "", None
 
 
 # -----------------------------
@@ -272,7 +475,7 @@ class ProcessResult(BaseModel):
         description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
     )
     file_path: Optional[str] = Field(
-        None, description="Temp file path for the artifact, or null if not created."
+        None, description="Temp file path for the artifact, or null if not created.",
    )
 
 
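The hunks above show only fragments of the MCP models. A plausible reconstruction follows, given the visible fields and the `conint` import; every field except `preview` and `file_path` is an assumption.

```python
# Sketch of the Pydantic models around the visible fragment; field names other
# than `preview` and `file_path` are assumptions, not the committed code.
from typing import Literal, Optional
from pydantic import BaseModel, Field, conint

class ProcessArgs(BaseModel):
    url_or_id: str = Field(..., description="Webpage URL or GitHub 'owner/repo'.")
    source_type: Literal["Webpage", "GitHub Repository"] = "Webpage"
    depth: conint(ge=0, le=3) = 0  # mirrors the UI slider's 0-3 range
    output_format: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = "Markdown"

class ProcessResult(BaseModel):
    status: str = Field(..., description="Success or error message.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created.",
    )
```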
@@ -287,91 +490,193 @@ def process_input_mcp(args: ProcessArgs) -> ProcessResult:
 
 
 # -----------------------------
-# Gradio UI
+# Gradio UI with Modern Design
 # -----------------------------
-with gr.Blocks(
-    … (old Blocks UI body, lines 293-346, not recoverable from this view)
-)
-
+with gr.Blocks(
+    title="RAG-Ready Content Scraper",
+    theme=gr.themes.Soft(),
+    css=custom_css
+) as ui_iface:
+
+    # Header
+    with gr.Column(elem_classes=["main-container"]):
+        with gr.Column(elem_classes=["title-container"]):
+            gr.HTML("""
+                <h1>🚀 RAG-Ready Content Scraper</h1>
+                <p>Transform web content and GitHub repositories into structured datasets for AI applications</p>
+                <p style="font-size: 0.9rem; opacity: 0.8;">
+                    Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: white;">anycoder</a>
+                </p>
+            """)
+
+        # Feature cards
+        with gr.Row():
+            with gr.Column(scale=1):
+                with gr.Column(elem_classes=["feature-card"]):
+                    gr.HTML("""
+                        <h3>🌐 Web Scraping</h3>
+                        <p>Extract clean content from websites with recursive depth control</p>
+                    """)
+            with gr.Column(scale=1):
+                with gr.Column(elem_classes=["feature-card"]):
+                    gr.HTML("""
+                        <h3>📦 GitHub Processing</h3>
+                        <p>Process entire repositories using Repomix for AI-friendly output</p>
+                    """)
+            with gr.Column(scale=1):
+                with gr.Column(elem_classes=["feature-card"]):
+                    gr.HTML("""
+                        <h3>📄 Multiple Formats</h3>
+                        <p>Export as Markdown, JSON, CSV, Text, or PDF</p>
+                    """)
+
+        # Main content area
+        with gr.Row():
+            # Input panel
+            with gr.Column(scale=1, elem_classes=["input-panel"]):
+                gr.HTML("<h2>⚙️ Configuration</h2>")
+
+                url_input = gr.Textbox(
+                    label="🔗 URL or GitHub Repository",
+                    placeholder="https://example.com or owner/repo",
+                    lines=1,
+                    max_lines=1
+                )
+
+                source_type_input = gr.Radio(
+                    choices=["🌐 Webpage", "📦 GitHub Repository"],
+                    value="🌐 Webpage",
+                    label="📋 Source Type",
+                    interactive=True
+                )
+
+                with gr.Group(visible=True) as webpage_options:
+                    depth_input = gr.Slider(
+                        minimum=0,
+                        maximum=3,
+                        step=1,
+                        value=0,
+                        label="🔍 Scraping Depth",
+                        info="0 = main page only | 1-3 = follow internal links"
+                    )
+
+                output_format_input = gr.Dropdown(
+                    choices=["📝 Markdown", "📋 JSON", "📊 CSV", "📄 Text", "📑 PDF"],
+                    value="📝 Markdown",
+                    label="💾 Output Format"
+                )
+
+                submit_button = gr.Button(
+                    "🚀 Process Content",
+                    variant="primary",
+                    size="lg"
+                )
+
+                # Examples section
+                with gr.Accordion("📚 Quick Examples", open=False):
+                    gr.HTML("""
+                        <p style="margin-bottom: 15px;">Click any example to load the configuration:</p>
+                    """)
+                    example1 = gr.Button("🌐 Gradio Docs (Depth 1)")
+                    example2 = gr.Button("📦 Gradio Repository")
+                    example3 = gr.Button("📖 Wikipedia RAG Article")
+
+                    # Example handlers
+                    example1.click(
+                        fn=lambda: ("https://gradio.app/docs/js", "🌐 Webpage", 1, "📝 Markdown"),
+                        outputs=[url_input, source_type_input, depth_input, output_format_input]
+                    )
+                    example2.click(
+                        fn=lambda: ("gradio-app/gradio", "📦 GitHub Repository", 0, "📄 Text"),
+                        outputs=[url_input, source_type_input, depth_input, output_format_input]
+                    )
+                    example3.click(
+                        fn=lambda: ("https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "🌐 Webpage", 0, "📋 JSON"),
+                        outputs=[url_input, source_type_input, depth_input, output_format_input]
+                    )
+
+            # Output panel
+            with gr.Column(scale=2, elem_classes=["output-panel"]):
+                gr.HTML("<h2>📊 Results</h2>")
+
+                with gr.Tabs():
+                    with gr.TabItem("📋 Status"):
+                        status_output = gr.HTML(
+                            value='<div class="info-box">Ready to process your content...</div>',
+                            label="Status"
+                        )
+
+                    with gr.TabItem("👁️ Preview"):
+                        preview_output = gr.Code(
+                            label="Content Preview",
+                            language="markdown",
+                            interactive=False,
+                            lines=15,
+                            max_lines=30
+                        )
+
+                    with gr.TabItem("⬇️ Download"):
+                        file_download_output = gr.File(
+                            label="Download Processed File",
+                            interactive=False
+                        )
+
+        # Footer
+        gr.HTML("""
+            <div class="footer">
+                <p>Powered by Gradio • Docker • Repomix • BeautifulSoup4</p>
+                <p style="font-size: 0.8rem; opacity: 0.7;">
+                    MIT License •
+                    <a href="https://huggingface.co/spaces/CultriX/RAG-Scraper" target="_blank">Source Code</a>
+                </p>
+            </div>
+        """)
+
+    # Toggle depth slider based on source type
+    def toggle_depth(source_type):
+        if source_type == "🌐 Webpage":
+            return gr.Group(visible=True)
+        else:
+            return gr.Group(visible=False)
+
+    source_type_input.change(
+        fn=toggle_depth,
+        inputs=[source_type_input],
+        outputs=[webpage_options]
+    )
+
+    # Main processing function
+    def process_with_emoji_fix(url, source, depth, fmt):
+        # Remove emojis from inputs for processing
+        clean_source = source.replace("🌐 ", "").replace("📦 ", "")
+        clean_fmt = fmt.replace("📝 ", "").replace("📋 ", "").replace("📊 ", "").replace("📄 ", "").replace("📑 ", "")
+        return process_input_updated(url, clean_source, depth, clean_fmt)
+
 submit_button.click(
-    fn=process_input_updated,
+    fn=process_with_emoji_fix,
     inputs=[url_input, source_type_input, depth_input, output_format_input],
-    outputs=[status_output, preview_output, file_download_output]
+    outputs=[status_output, preview_output, file_download_output]
 )
 
 # -----------------------------
 # MCP-only Interface (Pydantic tool)
 # -----------------------------
-# We expose a second interface whose *function signature* uses Pydantic models.
-# MCP reads this signature to build a JSON Schema with rich field descriptions.
 mcp_iface = gr.Interface(
     fn=process_input_mcp,
-    # Components are placeholders; MCP ignores them and reads the Python types.
-    # Keep them simple so the tab is usable if someone clicks it.
     inputs=gr.JSON(label="ProcessArgs (JSON)"),
     outputs=gr.JSON(label="ProcessResult (JSON)"),
     title="MCP Tool: process_input_mcp",
     description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
     allow_flagging="never",
+    css=custom_css
 )
 
-# Combine the user UI and the MCP tool as two tabs
-app = gr.TabbedInterface(
-    … (old argument list, line 373, not recoverable from this view)
+# Combine the user UI and the MCP tool as two tabs
+app = gr.TabbedInterface(
+    [ui_iface, mcp_iface],
+    tab_names=["🚀 App", "🔧 MCP"],
+    css=custom_css
+)
 
 if __name__ == "__main__":
-    … (old line 376, not recoverable from this view)
-    app.queue().launch(share=True, mcp_server=True)
+    app.queue().launch(share=True, mcp_server=True)
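With `mcp_server=True`, Gradio exposes `process_input_mcp` to MCP clients, and the same endpoint can be exercised over HTTP with `gradio_client`. A hypothetical client call once the Space is running (the `api_name` is an assumption derived from the function name, and `gradio_client` is a separate install):

```python
# Hypothetical client-side call; endpoint name and payload shape are assumptions.
from gradio_client import Client

client = Client("CultriX/RAG-Scraper")  # or the local URL printed by launch()
result = client.predict(
    {
        "url_or_id": "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
        "source_type": "Webpage",
        "depth": 0,
        "output_format": "Markdown",
    },
    api_name="/process_input_mcp",
)
print(result)
```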
requirements.txt
CHANGED

@@ -1,9 +1,71 @@
-… (old lines 1-2, not recoverable from this view)
-requests>=2.31.0
-beautifulsoup4>=4.12.3
-lxml>=4.9.3
-markdown>=3.5.2
+rag-scraper
+pydantic
 markdown-pdf
-… (old lines 8-9, not recoverable from this view)
+typing-extensions
+gradio>=6.0
+requests
+Pillow
+markdown
+beautifulsoup4
+lxml
+aiohttp
+fake-useragent
+urllib3
+html5lib
+chardet
+tqdm
+python-dateutil
+pytz
+click
+nltk
+spacy
+scrapy
+selenium
+webdriver-manager
+pandas
+numpy
+openpyxl
+PyPDF2
+python-docx
+python-pptx
+reportlab
+pdfkit
+weasyprint
+cssutils
+tinycss2
+cchardet
+idna
+certifi
+charset-normalizer
+httpx
+httpcore
+anyio
+sniffio
+pysocks
+win-inet-pton
+deprecation
+docstring-parser
+rich
+typer
+pyyaml
+toml
+tomli
+packaging
+filelock
+huggingface-hub
+safetensors
+regex
+tokenizers
+sentencepiece
+accelerate
+torch
+torchvision
+torchaudio
+transformers
+diffusers
+datasets
+evaluate
+scipy
+scikit-learn
+joblib
+threadpoolctl