Update app.py
app.py
CHANGED
@@ -1,43 +1,81 @@
[Old side of the diff: removed lines only, partially rendered in the original page. Lines cut off by the extraction are marked with "…". Context and added lines appear in the full updated file below.]

-import os
-from typing_extensions import Annotated, Doc
-#
-        r = subprocess.run(…
-            repo_url = …
-                err = …

@@ -47,7 +85,12 @@ def run_repomix(repo_url_or_id: str, progress: gr.Progress = gr.Progress(track_t…

@@ -61,11 +104,21 @@ def scrape_and_convert_website(url: str, depth: int, progress: gr.Progress = gr.…
-        md = …
-                valid = […

@@ -73,23 +126,30 @@
-    with tempfile.NamedTemporaryFile(…
-    f = tempfile.NamedTemporaryFile(…

@@ -111,133 +171,174 @@ def save_output_to_file(content: str, fmt: str, source: str) -> str:
-    with tempfile.NamedTemporaryFile(…
-# ---------- MCP-exposed tool ----------
-from typing import Optional, Tuple, Literal
-from typing_extensions import Annotated, Doc
-import os
-import gradio as gr
-    url_or_id: …
-    ],
-    source_type: Annotated[
-        Literal["Webpage", "GitHub Repository"],
-        Doc('Choose source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
-    ],
-    depth: Annotated[
-        int,
-        Doc("Crawl depth for webpages (0–3). 0 = main page only. Ignored for GitHub repositories."),
-    ],
-    output_format_selection: Annotated[
-        Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
-        Doc("Output format for the processed content."),
-    ],
-    Parameters
-    ----------
-    url_or_id : str
-        For webpages: full URL (e.g. ``https://example.com``).
-        For GitHub: either ``owner/repo`` or a full GitHub URL
-        (e.g. ``https://github.com/owner/repo``).
-    source_type : {"Webpage", "GitHub Repository"}
-        Choose the content source. Use **Webpage** to crawl HTML; use
-        **GitHub Repository** to run Repomix.
-    depth : int
-        Crawl depth for webpages in the range 0–3 where 0 = only the main page.
-        Ignored when ``source_type`` is ``"GitHub Repository"``.
-    output_format_selection : {"Markdown", "JSON", "CSV", "Text", "PDF"}
-        Desired output format for the processed content.
-    progress : gr.Progress, optional
-        (UI only) Gradio progress tracker. MCP callers can omit this.
-
-    Returns
-    -------
-    (status, preview, file_path) : tuple[str, str, Optional[str]]
-        - **status**: Human-readable status line.
-        - **preview**: Text preview (full Markdown/JSON/Text, or a short note for CSV/PDF).
-        - **file_path**: Path to the generated artifact, or ``None`` on error.
-    output_file_path: Optional[str] = None
-            return "Repomix is not installed or not accessible.…
-        if …
-            return …
-        if …
-            return …
-        elif output_format_selection == "CSV"…
-            with open(…
-            with open(…
-        except Exception as …
-        elif output_format_selection == "CSV" and not output_file_path:
-            preview_content = "[CSV file path not available for preview]"
-            f"{…
-        progress(1, desc="…
-        return f"Successfully processed: {url_or_id}", …
-        return f"Error during …
-# …
-    gr.Markdown(…
-            url_input = gr.Textbox(…
-            preview_output = gr.Code(…
-            […

@@ -251,6 +352,26 @@ with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme")
-…
[New side of the diff: the full updated app.py. Unchanged spans that the diff elides are marked with "# … elided …" comments.]

# app.py
from __future__ import annotations

import os
import csv
import json
import re
import subprocess
import tempfile
from typing import Optional, Tuple, Literal

import gradio as gr
import markdown_pdf
from typing_extensions import Annotated, Doc

from pydantic import BaseModel, Field, conint

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

# -----------------------------
# Environment (HF cache dir)
# -----------------------------
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Return True if `repomix` is available on PATH."""
    try:
        r = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
        return r.returncode == 0
    except Exception:
        return False


def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string)."""
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as td:
            out_path = os.path.join(td, "repomix-output.md")
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote",
                repo_url,
                "--output",
                out_path,
                "--style",
                "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = (
                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                )
                return f"Error running Repomix:\n{err}", None
            if os.path.exists(out_path):
                with open(out_path, "r", encoding="utf-8") as f:
                    # … (unchanged context elided in the diff) …
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None
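
For a quick sanity check outside the UI, `run_repomix` can also be called directly. A minimal sketch (assumes the `repomix` CLI is installed; the repo id is just the example used elsewhere in this file):

    # Returns (markdown_or_error_text, maybe_path); failures come back as a string
    # starting with "Error", which callers test via raw.startswith("Error").
    md, _ = run_repomix("gradio-app/gradio")
    print(md[:300])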


def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Recursively scrape a website and convert visited pages to Markdown."""
    progress(0, desc=f"Scraping {url}…")
    visited = set()
    # … (unchanged context elided in the diff: the recursive helper `rec` used
    # below is defined here) …
            html = Scraper.fetch_html(u)
        except Exception as e:
            return f"Error fetching {u}: {e}\n"
        md = (
            f"## Extracted from: {u}\n\n"
            + Converter.html_to_markdown(
                html=html, base_url=u, parser_features="html.parser", ignore_links=True
            )
            + "\n\n"
        )
        if d > 0:
            try:
                links = LinkExtractor.scrape_url(u, link_type=LinkType.INTERNAL)
                valid = [
                    l
                    for l in links
                    if URLUtils.is_internal(l, u) and l not in visited
                ]
                for j, nxt in enumerate(valid):
                    md += rec(nxt, d - 1, len(valid), j)
            except Exception as e:
                # … (unchanged context elided in the diff) …
        return md

    all_md = rec(url, depth)
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(all_md)
    return all_md, tmp.name

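As a usage sketch (hypothetical URL, network access required): depth 0 converts only the landing page, while depth 1 also follows each internal link once, appending every visited page under its own "## Extracted from: <url>" heading:

    all_md, md_path = scrape_and_convert_website("https://example.com", depth=1)
    print(md_path)  # temp .md file holding the combined Markdown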

def convert_to_json(markdown_content: str, source: str) -> str:
    """Wrap Markdown in a tiny JSON schema."""
    return json.dumps({"source": source, "content": markdown_content}, indent=2)


def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a simple 2-column CSV and return its path."""
    f = tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    )
    w = csv.writer(f)
    w.writerow(["source", "content"])
    w.writerow([source, markdown_content])
    f.close()
    return f.name

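For illustration, `convert_to_json` produces exactly this two-field object:

    print(convert_to_json("# Title", "https://example.com"))
    # {
    #   "source": "https://example.com",
    #   "content": "# Title"
    # }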

def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path."""
    if fmt == "JSON":
        # … (unchanged context elided in the diff: the JSON/CSV/Text/PDF branches) …
    else:
        data, suffix = content, ".md"

    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as tmp:
        tmp.write(data)
    return tmp.name

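A usage sketch of the Markdown fallback branch shown above (the other format branches sit in the elided lines):

    path = save_output_to_file("# Title\n\nBody.", "Markdown", "https://example.com")
    print(path)  # a temp file path ending in .md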

# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None

    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)

        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            from os.path import basename

            preview = (
                f"[PDF generated. Download to view: "
                f"{basename(out_path) if out_path else 'file.pdf'}]"
            )

        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path

    except Exception as e:
        return f"Error during conversion: {e}", "", None

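Because `progress` has a default, the same function works outside Gradio; a sketch with a placeholder URL:

    status, preview, path = process_input_updated(
        "https://example.com", "Webpage", 0, "Markdown"
    )
    print(status)  # "Successfully processed: https://example.com" or an error line
    print(path)    # temp file path, or None on error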

# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
        ),
    )
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )


class ProcessResult(BaseModel):
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )

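Since the tool contract lives in these models, the generated JSON Schema (including the field descriptions above) can be inspected directly; a sketch assuming Pydantic v2 (on v1 it would be `ProcessArgs.schema()`):

    print(json.dumps(ProcessArgs.model_json_schema(), indent=2))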
def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """
    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
    """
    status, preview, path = process_input_updated(
        args.url_or_id, args.source_type, int(args.depth), args.output_format_selection
    )
    return ProcessResult(status=status, preview=preview, file_path=path)

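A direct call sketch (hypothetical arguments; an out-of-range `depth` or a bad literal raises `pydantic.ValidationError` before the tool body runs):

    args = ProcessArgs(
        url_or_id="gradio-app/gradio",
        source_type="GitHub Repository",
        depth=0,
        output_format_selection="Text",
    )
    result = process_input_mcp(args)
    print(result.status, result.file_path)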

# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com or owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )

    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    # … (unchanged context elided in the diff) …
        outputs=[status_output, preview_output, file_download_output],
    )

# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types.
    # Keep them simple so the tab is usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    allow_flagging="never",
)

# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])


if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    app.queue().launch(share=True, mcp_server=True)