Update app.py
app.py
CHANGED
@@ -1,51 +1,8 @@
 # app.py
-"""
-RAG-Ready Content Scraper — Gradio + MCP (SSE)
-
-Exposes an MCP SSE endpoint on Hugging Face Spaces at:
-    /gradio_api/mcp/sse
-
-Example MCP configs:
-
-1) Direct SSE (Cursor, Windsurf, Cline, etc.)
-{
-  "mcpServers": {
-    "gradio": {
-      "url": "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse"
-    }
-  }
-}
-
-2) Experimental stdio via Node:
-{
-  "mcpServers": {
-    "gradio": {
-      "command": "npx",
-      "args": [
-        "mcp-remote",
-        "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse",
-        "--transport",
-        "sse-only"
-      ]
-    }
-  }
-}
-"""
-
 from __future__ import annotations
 
-import os
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.makedirs(os.environ["HF_HOME"], exist_ok=True)
-
-import csv
-import json
-import re
-import subprocess
-import tempfile
+import os, csv, json, re, subprocess, tempfile
+import markdown_pdf  # referenced by the PDF branch of save_output_to_file, but never imported in this commit
 from typing import Optional, Tuple, Literal
-
-# NEW: use Annotated+Doc so MCP can show per-parameter descriptions
 from typing_extensions import Annotated, Doc
 
 import gradio as gr
@@ -56,229 +13,121 @@ from rag_scraper.converter import Converter
 from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.utils import URLUtils
 
-# -----------------------------
-# Helper utilities
-# -----------------------------
-
-def is_github_repo(url_or_id: str) -> bool:
-    """Return True if the string looks like a GitHub repository reference."""
-    if "github.com" in url_or_id:
-        return True
-    return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
-
+# Cache dir for HF Spaces
+os.environ["HF_HOME"] = "/tmp/hf_cache"
+os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 
+# ---------- helpers ----------
 def check_repomix_installed() -> bool:
-    """Check whether the Repomix CLI is available."""
+    """Return True if `repomix` is available on PATH."""
     try:
-        result = subprocess.run(
-            ["repomix", "--version"],
-            capture_output=True,
-            text=True,
-            check=False,
-        )
-        return result.returncode == 0
+        r = subprocess.run(["repomix", "--version"], capture_output=True, text=True, check=False)
+        return r.returncode == 0
     except Exception:
         return False
 
-
-def run_repomix(
-    repo_url_or_id: str,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, Optional[str]]:
-    """Run Repomix on a GitHub repository and return combined Markdown."""
-    progress(0, desc="Starting Repomix processing...")
+def run_repomix(repo_url_or_id: str, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, Optional[str]]:
+    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
+    progress(0, desc="Starting Repomix…")
     try:
-        with tempfile.TemporaryDirectory() as temp_dir:
-            output_file_path = os.path.join(temp_dir, "repomix-output.md")
-            repo_url = (
-                f"https://github.com/{repo_url_or_id}"
-                if is_github_repo(repo_url_or_id) and not repo_url_or_id.startswith("http")
-                else repo_url_or_id
-            )
-            cmd = [
-                "repomix",
-                "--remote", repo_url,
-                "--output", output_file_path,
-                "--style", "markdown",
-                "--compress",
-            ]
-            process = subprocess.run(
-                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
-            )
-            progress(0.8, desc="Repomix command executed.")
-
-            if process.returncode != 0:
-                error_details = (
-                    f"Return Code: {process.returncode}\n"
-                    f"Stderr: {process.stderr}\n"
-                    f"Stdout: {process.stdout}"
-                )
-                return f"Error running Repomix:\n{error_details}", None
-
-            if os.path.exists(output_file_path):
-                with open(output_file_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-                progress(1, desc="Repomix output processed.")
-                return content, output_file_path
-
-            error_details = (
-                f"Return Code: {process.returncode}\n"
-                f"Stderr: {process.stderr}\n"
-                f"Stdout: {process.stdout}"
-            )
-            return (
-                f"Error: Repomix did not generate an output file at '{output_file_path}'.\n"
-                f"Repomix Output:\n{error_details}",
-                None,
-            )
-
+        with tempfile.TemporaryDirectory() as td:
+            out_path = os.path.join(td, "repomix-output.md")
+            repo_url = f"https://github.com/{repo_url_or_id}" if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http")) else repo_url_or_id
+            cmd = ["repomix", "--remote", repo_url, "--output", out_path, "--style", "markdown", "--compress"]
+            p = subprocess.run(cmd, capture_output=True, text=True, check=False, encoding="utf-8")
+            progress(0.8, desc="Repomix done.")
+            if p.returncode != 0:
+                err = f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
+                return f"Error running Repomix:\n{err}", None
+            if os.path.exists(out_path):
+                with open(out_path, "r", encoding="utf-8") as f:
+                    return f.read(), out_path
+            return "Error: Repomix did not produce an output file.", None
     except Exception as e:
-        progress(1, desc="Error during Repomix processing.")
+        progress(1, desc="Error")
         return f"Error processing GitHub repository: {e}", None
 
-def scrape_and_convert_website(
-    url: str,
-    depth: int,
-    progress: gr.Progress = gr.Progress(track_tqdm=True),
-) -> Tuple[str, str]:
-    """Recursively scrape a website and convert pages to Markdown."""
-    progress(0, desc=f"Starting web scrape for {url}...")
-    visited_urls = set()
+def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.Progress(track_tqdm=True)) -> Tuple[str, str]:
+    """Recursively scrape a website and convert visited pages to Markdown."""
+    progress(0, desc=f"Scraping {url}…")
+    visited = set()
 
-    def recursive_scrape(
-        current_url: str,
-        current_depth: int,
-        total_links_estimate: int = 1,
-        link_index: int = 0,
-    ) -> str:
-        if current_url in visited_urls or current_depth < 0:
+    def rec(u: str, d: int, n: int = 1, i: int = 0) -> str:
+        if u in visited or d < 0:
             return ""
-
-        visited_urls.add(current_url)
-
+        visited.add(u)
         try:
-            progress_val = (
-                link_index / total_links_estimate if total_links_estimate > 0 else 0
-            )
-            progress(
-                progress_val,
-                desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})",
-            )
-            html_content = Scraper.fetch_html(current_url)
+            progress(i / n if n > 0 else 0, desc=f"Scraping: {u}")
+            html = Scraper.fetch_html(u)
         except Exception as e:
-            return f"Error fetching {current_url}: {e}\n"
-
-        markdown_content = f"## Extracted from: {current_url}\n\n"
-        markdown_content += Converter.html_to_markdown(
-            html=html_content,
-            base_url=current_url,
-            parser_features="html.parser",
-            ignore_links=True,
-        )
-        page_content = markdown_content + "\n\n"
-
-        if current_depth > 0:
+            return f"Error fetching {u}: {e}\n"
+        md = f"## Extracted from: {u}\n\n" + Converter.html_to_markdown(html=html, base_url=u, parser_features="html.parser", ignore_links=True) + "\n\n"
+        if d > 0:
             try:
-                links = LinkExtractor.scrape_url(
-                    current_url, link_type=LinkType.INTERNAL
-                )
-                valid_links = [
-                    link for link in links if URLUtils.is_internal(link, current_url) and link not in visited_urls
-                ]
-                num_links = len(valid_links)
-                for i, link_url in enumerate(valid_links):
-                    page_content += recursive_scrape(
-                        link_url, current_depth - 1, num_links, i
-                    )
+                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
+                valid = [l for l in links if URLUtils.is_internal(l, u) and l not in visited]
+                for j, nxt in enumerate(valid):
+                    md += rec(nxt, d - 1, len(valid), j)
             except Exception as e:
-                page_content += f"Error extracting links from {current_url}: {e}\n"
-        return page_content
-
-    full_markdown = recursive_scrape(url, depth)
-    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp_file:
-        tmp_file.write(full_markdown)
-    return full_markdown, tmp_file.name
-
-
-def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
-    """Wrap Markdown content in a small JSON schema."""
-    return json.dumps({"source": source_url_or_id, "content": markdown_content}, indent=2)
-
-
-def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
-    """Write a two-column CSV and return its path."""
-    output = tempfile.NamedTemporaryFile(mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8")
-    writer = csv.writer(output)
-    writer.writerow(["source", "content"])
-    writer.writerow([source_url_or_id, markdown_content])
-    output.close()
-    return output.name
-
-
-def save_output_to_file(
-    content: str,
-    output_format: str,
-    source_url_or_id: str,
-) -> str:
-    """Save processed content in the selected format and return a file path."""
-    processed_content = content  # default for Markdown/Text
-
-    if output_format == "JSON":
+                md += f"Error extracting links from {u}: {e}\n"
+        return md
+
+    all_md = rec(url, depth)
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".md", encoding="utf-8") as tmp:
+        tmp.write(all_md)
+    return all_md, tmp.name
+
+def convert_to_json(markdown_content: str, source: str) -> str:
+    """Wrap Markdown in a tiny JSON schema."""
+    return json.dumps({"source": source, "content": markdown_content}, indent=2)
+
+def convert_to_csv(markdown_content: str, source: str) -> str:
+    """Write a simple 2-column CSV and return its path."""
+    f = tempfile.NamedTemporaryFile(mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8")
+    w = csv.writer(f)
+    w.writerow(["source", "content"])
+    w.writerow([source, markdown_content])
+    f.close()
+    return f.name
+
+def save_output_to_file(content: str, fmt: str, source: str) -> str:
+    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
+    if fmt == "JSON":
+        data = convert_to_json(content, source)
         suffix = ".json"
-        processed_content = convert_to_json(content, source_url_or_id)
-    elif output_format == "CSV":
-        return convert_to_csv(content, source_url_or_id)
-    elif output_format == "Text":
-        suffix = ".txt"
-    elif output_format == "PDF":
+    elif fmt == "CSV":
+        return convert_to_csv(content, source)
+    elif fmt == "Text":
+        data, suffix = content, ".txt"
+    elif fmt == "PDF":
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
-                pdf_output_path = tmp_pdf.name
-            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, pdf_output_path)
-            return pdf_output_path
+                path = tmp_pdf.name
+            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
+            return path
        except Exception as e:
-            # Fallback: persist as Markdown with .pdf.md suffix.
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
-            suffix = ".pdf.md"
+            data, suffix = content, ".pdf.md"
    else:
-        suffix = ".md"
+        data, suffix = content, ".md"
 
-    with tempfile.NamedTemporaryFile(
-        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
-    ) as tmp_file:
-        tmp_file.write(processed_content)
-        return tmp_file.name
-
-
-# ----------------------------------------------------------
-# Main tool function (exposed to MCP via SSE)
-# ----------------------------------------------------------
-
+    with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=suffix, encoding="utf-8") as tmp:
+        tmp.write(data)
+        return tmp.name
+
+# ---------- MCP-exposed tool ----------
 def process_input_updated(
     url_or_id: Annotated[
         str,
-        Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or a full GitHub URL."),
+        Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or the full GitHub URL."),
     ],
     source_type: Annotated[
         Literal["Webpage", "GitHub Repository"],
-        Doc('…'),
+        Doc('Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
     ],
     depth: Annotated[
         int,
-        Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub repositories."),
+        Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub."),
     ],
     output_format_selection: Annotated[
         Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
@@ -287,131 +136,77 @@ def process_input_updated(
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
-    Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
+    Scrape a webpage (with configurable depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
 
     Returns:
-        (status_message, preview_content, output_file_path)
+        (status, preview, file_path)
     """
-    progress(0, desc="Initializing...")
-    raw_content = ""
-    error_message = ""
-    output_file_path: Optional[str] = None
+    progress(0, desc="Initializing…")
+    raw, err = "", ""
+    out_path: Optional[str] = None
 
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            error_message = "Repomix is not installed or not accessible."
-            return error_message, "", None
-        raw_content, _ = run_repomix(url_or_id, progress=progress)
-        if raw_content.startswith("Error"):
-            error_message = raw_content
-            raw_content = ""
+            return "Repomix is not installed or not accessible.", "", None
+        raw, _ = run_repomix(url_or_id, progress=progress)
+        if raw.startswith("Error"):
+            return raw, "", None
     elif source_type == "Webpage":
-        raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw_content.startswith("Error"):
-            error_message = raw_content
-            raw_content = ""
+        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
+        if raw.startswith("Error"):
+            return raw, "", None
     else:
-        error_message = "Invalid source type selected."
-        return error_message, "", None
-
-    if error_message:
-        return error_message, "", None
+        return "Invalid source type selected.", "", None
 
     try:
-        progress(0.9, desc=f"Converting to {output_format_selection}...")
-        output_file_path = save_output_to_file(
-            raw_content, output_format_selection, url_or_id
-        )
+        progress(0.9, desc=f"Converting to {output_format_selection}…")
+        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
 
-        preview_content = raw_content
+        preview = raw
         if output_format_selection == "JSON":
-            preview_content = convert_to_json(raw_content, url_or_id)
-        elif output_format_selection == "CSV" and output_file_path:
-            # Show a small preview of the CSV
+            preview = convert_to_json(raw, url_or_id)
+        elif output_format_selection == "CSV":
             try:
-                with open(output_file_path, "r", encoding="utf-8") as f:
-                    first_lines = [next(f) for _ in range(5)]
-                preview_content = "".join(first_lines)
+                with open(out_path, "r", encoding="utf-8") as f:
+                    first_lines = [next(f) for _ in range(5)]
+                preview = "".join(first_lines) or "[CSV content is empty or very short]"
             except StopIteration:
-                with open(output_file_path, "r", encoding="utf-8") as f:
-                    preview_content = f.read()
-            except Exception as e:
-                preview_content = f"[Error reading CSV for preview: {e}]"
-        elif output_format_selection == "CSV" and not output_file_path:
-            preview_content = "[CSV file path not available for preview]"
+                with open(out_path, "r", encoding="utf-8") as f:
+                    preview = f.read() or "[CSV content is empty]"
+            except Exception as e:
+                preview = f"[Error reading CSV for preview: {e}]"
         elif output_format_selection == "PDF":
-            preview_content = (
-                f"[PDF generated. Download to view: "
-                f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
-            )
+            from os.path import basename
+            preview = f"[PDF generated. Download to view: {basename(out_path) if out_path else 'file.pdf'}]"
 
-        progress(1, desc="Processing complete.")
-        return f"Successfully processed: {url_or_id}", preview_content, output_file_path
+        progress(1, desc="Done.")
+        return f"Successfully processed: {url_or_id}", preview, out_path
 
     except Exception as e:
-        return f"Error during conversion: {e}", raw_content, None
-
-
-# -----------------------------
-# Gradio UI
-# -----------------------------
+        return f"Error during conversion: {e}", raw, None
 
+# ---------- UI ----------
 with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:
     gr.Markdown("# RAG-Ready Content Scraper")
-    gr.Markdown(
-        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
-    )
+    gr.Markdown("Scrape webpage content or GitHub repositories to generate RAG-ready datasets.")
 
     with gr.Row():
         with gr.Column(scale=2):
-            url_input = gr.Textbox(
-                label="Enter URL or GitHub Repository ID",
-                placeholder="https://example.com or owner/repo",
-            )
-            source_type_input = gr.Radio(
-                choices=["Webpage", "GitHub Repository"],
-                value="Webpage",
-                label="Select Source Type",
-            )
-            depth_input = gr.Slider(
-                minimum=0,
-                maximum=3,
-                step=1,
-                value=0,
-                label="Scraping Depth (for Webpages)",
-                info="0: Only main page. Ignored for GitHub repos.",
-            )
-            output_format_input = gr.Dropdown(
-                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
-                value="Markdown",
-                label="Select Output Format",
-            )
+            url_input = gr.Textbox(label="Enter URL or GitHub Repository ID", placeholder="https://example.com or owner/repo")
+            source_type_input = gr.Radio(choices=["Webpage", "GitHub Repository"], value="Webpage", label="Select Source Type")
+            depth_input = gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Scraping Depth (for Webpages)", info="0 = only main page. Ignored for GitHub.")
+            output_format_input = gr.Dropdown(choices=["Markdown", "JSON", "CSV", "Text", "PDF"], value="Markdown", label="Select Output Format")
             submit_button = gr.Button("Process Content", variant="primary")
-
         with gr.Column(scale=3):
             status_output = gr.Textbox(label="Status", interactive=False)
-            preview_output = gr.Code(
-                label="Preview Content", language="markdown", interactive=False
-            )
-            file_download_output = gr.File(
-                label="Download Processed File", interactive=False
-            )
+            preview_output = gr.Code(label="Preview Content", language="markdown", interactive=False)
+            file_download_output = gr.File(label="Download Processed File", interactive=False)
 
     gr.Examples(
         examples=[
             ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
             ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
-            [
-                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
-                "Webpage",
-                0,
-                "JSON",
-            ],
+            ["https://en.wikipedia.org/wiki/Retrieval-augmented_generation", "Webpage", 0, "JSON"],
         ],
         inputs=[url_input, source_type_input, depth_input, output_format_input],
         outputs=[status_output, preview_output, file_download_output],
@@ -419,30 +214,6 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
         cache_examples=False,
     )
 
-    with gr.Accordion("How it Works & More Info", open=False):
-        gr.Markdown(
-            """
-            **Webpage Scraping**
-            1. Enter a full URL (e.g., `https://example.com`).
-            2. Select "Webpage" as the source type.
-            3. Set the desired scraping depth.
-            4. Choose your output format.
-
-            **GitHub Repository Processing**
-            1. Enter a GitHub repository URL or ID (e.g., `username/repo`).
-            2. Select "GitHub Repository". (Depth is ignored.)
-            3. Choose your output format. Uses **Repomix**.
-
-            **Output Formats**
-            Markdown, JSON, CSV, Text, PDF.
-
-            **Notes**
-            - PDF generation requires the `markdown-pdf` library.
-            - Designed for Docker/Hugging Face Spaces.
-            - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
-            """
-        )
-
     submit_button.click(
         fn=process_input_updated,
         inputs=[url_input, source_type_input, depth_input, output_format_input],
@@ -450,5 +221,5 @@ Markdown, JSON, CSV, Text, PDF.
     )
 
 if __name__ == "__main__":
-    #
-    iface.queue().launch(share=True)
+    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
+    iface.queue().launch(share=True, mcp_server=True)
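The commit removes the MCP usage notes from the module docstring, but the endpoint itself is unchanged: the Space serves MCP over SSE at /gradio_api/mcp/sse, now switched on explicitly by mcp_server=True in the launch call above. For reference, the direct-SSE client config from the removed docstring:

{
  "mcpServers": {
    "gradio": {
      "url": "https://cultrix-rag-scraper.hf.space/gradio_api/mcp/sse"
    }
  }
}

Clients that only speak stdio can bridge through Node with npx mcp-remote <url> --transport sse-only, as the removed docstring also showed.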
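Besides MCP, process_input_updated stays callable as an ordinary Gradio API. A minimal sketch with gradio_client follows; the Space id and api_name are assumptions inferred from the hostname and function name in the code, not something this commit pins down:

from gradio_client import Client

client = Client("CultriX/rag-scraper")  # assumed Space id (hostname: cultrix-rag-scraper.hf.space)
status, preview, file_path = client.predict(
    "https://example.com",  # url_or_id
    "Webpage",              # source_type
    1,                      # depth
    "Markdown",             # output_format_selection
    api_name="/process_input_updated",  # Gradio defaults api_name to the function name
)
print(status)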
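The Annotated[..., Doc(...)] pattern on the tool's parameters is what lets the MCP server surface per-parameter descriptions to clients; the comment deleted in the first hunk said as much ("use Annotated+Doc so MCP can show per-parameter descriptions"). A self-contained illustration of the pattern, using a hypothetical function that is not part of the app:

from typing_extensions import Annotated, Doc

def greet(
    name: Annotated[str, Doc("Name to include in the greeting.")],
    excited: Annotated[bool, Doc("End with an exclamation mark.")] = False,
) -> str:
    # Introspection tools can read each parameter's Doc() text via the
    # annotation's __metadata__ (typing.get_type_hints with include_extras=True).
    return f"Hello, {name}" + ("!" if excited else ".")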