Spaces:

CultriX
/

RAG-Scraper

Paused

App Files Files Community

CultriX committed on Sep 6, 2025

Commit

315dac2

verified ·

1 Parent(s): 23a446a

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -114

app.py CHANGED Viewed

@@ -2,7 +2,7 @@
 """
 RAG-Ready Content Scraper — Gradio + MCP (SSE)
-This app runs on Hugging Face Spaces and exposes an MCP SSE endpoint at:
     /gradio_api/mcp/sse
 Example MCP configs:
@@ -43,7 +43,10 @@ import json
 import re
 import subprocess
 import tempfile
-from typing import Optional, Tuple
 import gradio as gr
 import markdown_pdf
@@ -59,28 +62,14 @@ from rag_scraper.utils import URLUtils
 # -----------------------------
 def is_github_repo(url_or_id: str) -> bool:
-    """
-    Determine whether the string looks like a GitHub repository reference.
-    :param url_or_id: Full GitHub URL containing ``github.com`` or an
-        ``owner/repo`` identifier.
-    :type url_or_id: str
-    :return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
-        otherwise ``False``.
-    :rtype: bool
-    """
     if "github.com" in url_or_id:
         return True
     return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
 def check_repomix_installed() -> bool:
-    """
-    Check if the ``repomix`` CLI is available on PATH.
-    :return: ``True`` if ``repomix --version`` succeeds, else ``False``.
-    :rtype: bool
-    """
     try:
         result = subprocess.run(
             ["repomix", "--version"],
@@ -97,19 +86,7 @@ def run_repomix(
     repo_url_or_id: str,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, Optional[str]]:
-    """
-    Run Repomix on a GitHub repository and return combined Markdown.
-    :param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
-        or in the form ``owner/repo``.
-    :type repo_url_or_id: str
-    :param progress: Gradio progress tracker (UI only).
-    :type progress: gr.Progress
-    :return: A tuple ``(content, output_path)`` where ``content`` is the
-        combined Markdown or an error string starting with ``"Error"``, and
-        ``output_path`` is the temp file path (or ``None``).
-    :rtype: Tuple[str, Optional[str]]
-    """
     progress(0, desc="Starting Repomix processing...")
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -169,18 +146,7 @@ def scrape_and_convert_website(
     depth: int,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str]:
-    """
-    Recursively scrape a website and convert pages to Markdown.
-    :param url: Starting URL to scrape.
-    :type url: str
-    :param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
-    :type depth: int
-    :param progress: Gradio progress tracker (UI only).
-    :type progress: gr.Progress
-    :return: A tuple ``(combined_markdown, tmp_md_path)``.
-    :rtype: Tuple[str, str]
-    """
     progress(0, desc=f"Starting web scrape for {url}...")
     visited_urls = set()
@@ -244,31 +210,13 @@ def scrape_and_convert_website(
 def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
-    """
-    Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
-    :param markdown_content: The Markdown body to embed.
-    :type markdown_content: str
-    :param source_url_or_id: Original input string identifying the source.
-    :type source_url_or_id: str
-    :return: Pretty-printed JSON string.
-    :rtype: str
-    """
     data = {"source": source_url_or_id, "content": markdown_content}
     return json.dumps(data, indent=2)
 def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
-    """
-    Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
-    :param markdown_content: The Markdown body to store.
-    :type markdown_content: str
-    :param source_url_or_id: Original input string identifying the source.
-    :type source_url_or_id: str
-    :return: Path to the created CSV file.
-    :rtype: str
-    """
     output = tempfile.NamedTemporaryFile(
         mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
     )
@@ -284,22 +232,7 @@ def save_output_to_file(
     output_format: str,
     source_url_or_id: str,
 ) -> str:
-    """
-    Save processed content in the selected format and return a file path.
-    :param content: The raw Markdown to save or convert.
-    :type content: str
-    :param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
-    :type output_format: str
-    :param source_url_or_id: Original input string identifying the source.
-    :type source_url_or_id: str
-    :return: Path to a temporary file holding the artifact.
-    :rtype: str
-    .. note::
-       PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
-       CSV uses a 2-column schema: ``['source','content']``.
-    """
     processed_content = content  # default for Markdown/Text
     if output_format == "JSON":
@@ -335,44 +268,29 @@ def save_output_to_file(
 # ----------------------------------------------------------
 def process_input_updated(
-    url_or_id: str,
-    source_type: str,
-    depth: int,
-    output_format_selection: str,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
-    Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
-    This function is exposed to MCP clients via the Spaces SSE endpoint
-    ``/gradio_api/mcp/sse``.
-    :param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
-        For GitHub, either ``owner/repo`` or a full GitHub URL
-        (e.g., ``https://github.com/owner/repo``).
-    :type url_or_id: str
-    :param source_type: Select the content source. One of
-        {``"Webpage"``, ``"GitHub Repository"``}.
-    :type source_type: str
-    :param depth: Crawl depth for webpages. Integer in the range 0–3 where
-        0 = only the main page. **Ignored** when ``source_type`` is
-        ``"GitHub Repository"``.
-    :type depth: int
-    :param output_format_selection: Desired output format. One of
-        {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
-    :type output_format_selection: str
-    :param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
-    :type progress: gr.Progress
-    :returns: A 3-tuple:
-        - **status** (*str*): Human-readable status line.
-        - **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
-        - **file_path** (*Optional[str]*): Path to the generated artifact for download,
-          or ``None`` on error.
-    :rtype: Tuple[str, str, Optional[str]]
-    :raises Exception: (caught internally) Unexpected processing errors are surfaced
-        as a user-facing status with details in the preview.
     """
     progress(0, desc="Initializing...")
     raw_content = ""

 """
 RAG-Ready Content Scraper — Gradio + MCP (SSE)
+Exposes an MCP SSE endpoint on Hugging Face Spaces at:
     /gradio_api/mcp/sse
 Example MCP configs:
 import re
 import subprocess
 import tempfile
+from typing import Optional, Tuple, Literal
+# NEW: use Annotated+Doc so MCP can show per-parameter descriptions
+from typing_extensions import Annotated, Doc
 import gradio as gr
 import markdown_pdf
 # -----------------------------
 def is_github_repo(url_or_id: str) -> bool:
+    """Return True if the string looks like a GitHub repository reference."""
     if "github.com" in url_or_id:
         return True
     return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
 def check_repomix_installed() -> bool:
+    """Check if the `repomix` CLI is available on PATH."""
     try:
         result = subprocess.run(
             ["repomix", "--version"],
     repo_url_or_id: str,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, Optional[str]]:
+    """Run Repomix on a GitHub repository and return combined Markdown."""
     progress(0, desc="Starting Repomix processing...")
     try:
         with tempfile.TemporaryDirectory() as temp_dir:
     depth: int,
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str]:
+    """Recursively scrape a website and convert pages to Markdown."""
     progress(0, desc=f"Starting web scrape for {url}...")
     visited_urls = set()
 def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
+    """Wrap Markdown text in a JSON object with `source` and `content` keys."""
     data = {"source": source_url_or_id, "content": markdown_content}
     return json.dumps(data, indent=2)
 def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
+    """Persist Markdown as a simple CSV with two columns: `source`, `content`."""
     output = tempfile.NamedTemporaryFile(
         mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
     )
     output_format: str,
     source_url_or_id: str,
 ) -> str:
+    """Save processed content in the selected format and return a file path."""
     processed_content = content  # default for Markdown/Text
     if output_format == "JSON":
 # ----------------------------------------------------------
 def process_input_updated(
+    url_or_id: Annotated[
+        str,
+        Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
+    ],
+    source_type: Annotated[
+        Literal["Webpage", "GitHub Repository"],
+        Doc('Select the content source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
+    ],
+    depth: Annotated[
+        int,
+        Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub repositories."),
+    ],
+    output_format_selection: Annotated[
+        Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
+        Doc("Desired output format for the processed content."),
+    ],
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
+    Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
+    Returns:
+        Tuple[str, str, Optional[str]]: (status, preview, file_path)
     """
     progress(0, desc="Initializing...")
     raw_content = ""