CultriX committed on
Commit
23a446a
·
verified ·
1 Parent(s): 2893e36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -127
app.py CHANGED
@@ -2,13 +2,12 @@
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
- This Gradio app runs on Hugging Face Spaces and automatically exposes an MCP SSE
6
- endpoint at `/gradio_api/mcp/sse` (when the environment supports it).
7
 
8
- MCP SSE client config examples:
9
 
10
  1) Direct SSE (Cursor, Windsurf, Cline, etc.)
11
-
12
  {
13
  "mcpServers": {
14
  "gradio": {
@@ -17,8 +16,7 @@ MCP SSE client config examples:
17
  }
18
  }
19
 
20
- 2) Experimental stdio via Node (for clients that only support stdio):
21
-
22
  {
23
  "mcpServers": {
24
  "gradio": {
@@ -57,18 +55,19 @@ from rag_scraper.utils import URLUtils
57
 
58
 
59
  # -----------------------------
60
- # Utility / helper functions
61
  # -----------------------------
62
 
63
  def is_github_repo(url_or_id: str) -> bool:
64
- """Return True if the string looks like a GitHub repository reference.
65
-
66
- Args:
67
- url_or_id: Either a full GitHub URL (containing 'github.com') or an
68
- "owner/repo" identifier (alphanumeric/._-).
69
-
70
- Returns:
71
- bool: True if value matches a GitHub repo URL or "owner/repo" pattern.
 
72
  """
73
  if "github.com" in url_or_id:
74
  return True
@@ -76,10 +75,11 @@ def is_github_repo(url_or_id: str) -> bool:
76
 
77
 
78
  def check_repomix_installed() -> bool:
79
- """Check whether `repomix` is available on PATH.
 
80
 
81
- Returns:
82
- bool: True if `repomix --version` executes successfully, else False.
83
  """
84
  try:
85
  result = subprocess.run(
@@ -97,22 +97,18 @@ def run_repomix(
97
  repo_url_or_id: str,
98
  progress: gr.Progress = gr.Progress(track_tqdm=True),
99
  ) -> Tuple[str, Optional[str]]:
100
- """Run Repomix on a GitHub repository and return its combined Markdown corpus.
101
-
102
- Args:
103
- repo_url_or_id: GitHub repo as full URL or "owner/repo".
104
- progress: Gradio progress object (auto-provided in UI; ignored by MCP).
105
-
106
- Returns:
107
- (content, output_path):
108
- content (str): Combined Markdown content, or an error message that
109
- starts with "Error".
110
- output_path (Optional[str]): Path to the temp file created by Repomix,
111
- or None if not applicable.
112
-
113
- Notes:
114
- - Requires `repomix` installed in the environment.
115
- - If `repo_url_or_id` is "owner/repo" it is expanded to a full GitHub URL.
116
  """
117
  progress(0, desc="Starting Repomix processing...")
118
  try:
@@ -128,15 +124,11 @@ def run_repomix(
128
  progress(0.2, desc=f"Running Repomix on {repo_url}...")
129
  cmd = [
130
  "repomix",
131
- "--remote",
132
- repo_url,
133
- "--output",
134
- output_file_path,
135
- "--style",
136
- "markdown",
137
  "--compress",
138
  ]
139
-
140
  process = subprocess.run(
141
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
142
  )
@@ -177,22 +169,17 @@ def scrape_and_convert_website(
177
  depth: int,
178
  progress: gr.Progress = gr.Progress(track_tqdm=True),
179
  ) -> Tuple[str, str]:
180
- """Recursively scrape a website and convert each visited page to Markdown.
181
-
182
- Args:
183
- url: Starting URL to scrape.
184
- depth: Crawl depth (0 = only the main page, 1..3 will follow internal links).
185
- progress: Gradio progress object (auto-provided in UI; ignored by MCP).
186
-
187
- Returns:
188
- (combined_markdown, temp_markdown_filepath):
189
- combined_markdown (str): All pages concatenated with headings.
190
- temp_markdown_filepath (str): Path to a temp file containing the same
191
- combined Markdown content.
192
-
193
- Notes:
194
- - Only internal links are visited.
195
- - Link extraction uses `LinkExtractor` with `LinkType.INTERNAL`.
196
  """
197
  progress(0, desc=f"Starting web scrape for {url}...")
198
  visited_urls = set()
@@ -212,7 +199,10 @@ def scrape_and_convert_website(
212
  progress_val = (
213
  link_index / total_links_estimate if total_links_estimate > 0 else 0
214
  )
215
- progress(progress_val, desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})")
 
 
 
216
  html_content = Scraper.fetch_html(current_url)
217
  except Exception as e:
218
  return f"Error fetching {current_url}: {str(e)}\n"
@@ -254,28 +244,30 @@ def scrape_and_convert_website(
254
 
255
 
256
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
257
- """Wrap Markdown text in a simple JSON object with 'source' and 'content' keys.
258
-
259
- Args:
260
- markdown_content: The Markdown body to embed.
261
- source_url_or_id: The original input string identifying the source.
262
-
263
- Returns:
264
- str: Pretty-printed JSON string.
 
265
  """
266
  data = {"source": source_url_or_id, "content": markdown_content}
267
  return json.dumps(data, indent=2)
268
 
269
 
270
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
271
- """Write a simple CSV file with columns ['source','content'].
272
-
273
- Args:
274
- markdown_content: The Markdown body to store in CSV.
275
- source_url_or_id: The original input string identifying the source.
276
-
277
- Returns:
278
- str: Path to the created CSV file.
 
279
  """
280
  output = tempfile.NamedTemporaryFile(
281
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
@@ -292,19 +284,21 @@ def save_output_to_file(
292
  output_format: str,
293
  source_url_or_id: str,
294
  ) -> str:
295
- """Persist processed content in the selected output format and return a filepath.
296
-
297
- Args:
298
- content: The raw Markdown to save or convert.
299
- output_format: One of {"Markdown","JSON","CSV","Text","PDF"}.
300
- source_url_or_id: The original input string identifying the source.
301
-
302
- Returns:
303
- str: Path to a temporary file holding the artifact (may be a PDF, CSV, etc.).
304
-
305
- Notes:
306
- - PDF uses `markdown_pdf` and writes directly to a temporary `.pdf` file.
307
- - CSV uses a 2-column schema: ['source','content'].
 
 
308
  """
309
  processed_content = content # default for Markdown/Text
310
 
@@ -316,7 +310,6 @@ def save_output_to_file(
316
  elif output_format == "Text":
317
  suffix = ".txt"
318
  elif output_format == "PDF":
319
- # Write PDF directly and return the path.
320
  try:
321
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
322
  pdf_output_path = tmp_pdf.name
@@ -327,7 +320,6 @@ def save_output_to_file(
327
  # Fallback: persist as Markdown with .pdf.md suffix.
328
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
329
  suffix = ".pdf.md"
330
- # For the fallback, continue below and write Markdown.
331
  else:
332
  suffix = ".md"
333
 
@@ -339,7 +331,7 @@ def save_output_to_file(
339
 
340
 
341
  # ----------------------------------------------------------
342
- # Main tool function (this is what MCP exposes via SSE)
343
  # ----------------------------------------------------------
344
 
345
  def process_input_updated(
@@ -349,39 +341,38 @@ def process_input_updated(
349
  output_format_selection: str,
350
  progress: gr.Progress = gr.Progress(track_tqdm=True),
351
  ) -> Tuple[str, str, Optional[str]]:
352
- """Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
353
-
354
- This function is exposed to MCP clients at the Spaces SSE endpoint.
355
-
356
- Args:
357
- url_or_id:
358
- For webpages: a full URL (e.g. "https://example.com").
359
- For GitHub: either "owner/repo" or a full GitHub URL.
360
- source_type:
361
- One of {"Webpage", "GitHub Repository"}.
362
- Selects whether to crawl HTML pages or run Repomix.
363
- depth:
364
- Crawl depth for webpages (0–3). Ignored when source_type="GitHub Repository".
365
- output_format_selection:
366
- One of {"Markdown","JSON","CSV","Text","PDF"} specifying the output format.
367
- progress:
368
- Gradio progress tracker (in UI). MCP callers can omit it.
369
-
370
- Returns:
371
- (status, preview, file_path):
372
- status (str): Human-readable status message.
373
- preview (str): Short preview or full text (Markdown/JSON/Text). For CSV/PDF,
374
- a helpful note is returned with the generated filename.
375
- file_path (Optional[str]): Path to the generated file (for download),
376
- or None if an error occurred.
377
-
378
- Behavior:
379
- - For "GitHub Repository", requires `repomix` to be installed on PATH.
380
- - For "Webpage", recursively scrapes internal links up to `depth`.
381
- - Converts to the requested format and saves a temp file for download.
382
-
383
- Errors:
384
- - Any internal exception is caught and returned as a user-facing status + preview.
385
  """
386
  progress(0, desc="Initializing...")
387
  raw_content = ""
@@ -422,7 +413,7 @@ def process_input_updated(
422
  if output_format_selection == "JSON":
423
  preview_content = convert_to_json(raw_content, url_or_id)
424
  elif output_format_selection == "CSV" and output_file_path:
425
- # Read only a few lines for preview
426
  try:
427
  with open(output_file_path, "r", encoding="utf-8") as f_csv:
428
  csv_preview_lines = [next(f_csv) for _ in range(5)]
@@ -435,7 +426,7 @@ def process_input_updated(
435
  elif output_format_selection == "CSV" and not output_file_path:
436
  preview_content = "[CSV file path not available for preview]"
437
  elif output_format_selection == "PDF":
438
- # PDF cannot be previewed as text; provide a helpful note.
439
  preview_content = (
440
  f"[PDF generated. Download to view: "
441
  f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
@@ -529,7 +520,7 @@ Markdown, JSON, CSV, Text, PDF.
529
 
530
  **Notes**
531
  - PDF generation requires the `markdown-pdf` library.
532
- - This app is designed for Docker/Hugging Face Spaces.
533
  - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
534
  """
535
  )
@@ -541,5 +532,5 @@ Markdown, JSON, CSV, Text, PDF.
541
  )
542
 
543
  if __name__ == "__main__":
544
- # Spaces typically set up their own server. queue() is safe for concurrency.
545
  iface.queue().launch(share=True)
 
2
  """
3
  RAG-Ready Content Scraper — Gradio + MCP (SSE)
4
 
5
+ This app runs on Hugging Face Spaces and exposes an MCP SSE endpoint at:
6
+ /gradio_api/mcp/sse
7
 
8
+ Example MCP configs:
9
 
10
  1) Direct SSE (Cursor, Windsurf, Cline, etc.)
 
11
  {
12
  "mcpServers": {
13
  "gradio": {
 
16
  }
17
  }
18
 
19
+ 2) Experimental stdio via Node:
 
20
  {
21
  "mcpServers": {
22
  "gradio": {
 
55
 
56
 
57
  # -----------------------------
58
+ # Helper utilities
59
  # -----------------------------
60
 
61
  def is_github_repo(url_or_id: str) -> bool:
62
+ """
63
+ Determine whether the string looks like a GitHub repository reference.
64
+
65
+ :param url_or_id: Full GitHub URL containing ``github.com`` or an
66
+ ``owner/repo`` identifier.
67
+ :type url_or_id: str
68
+ :return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
69
+ otherwise ``False``.
70
+ :rtype: bool
71
  """
72
  if "github.com" in url_or_id:
73
  return True
 
75
 
76
 
77
  def check_repomix_installed() -> bool:
78
+ """
79
+ Check if the ``repomix`` CLI is available on PATH.
80
 
81
+ :return: ``True`` if ``repomix --version`` succeeds, else ``False``.
82
+ :rtype: bool
83
  """
84
  try:
85
  result = subprocess.run(
 
97
  repo_url_or_id: str,
98
  progress: gr.Progress = gr.Progress(track_tqdm=True),
99
  ) -> Tuple[str, Optional[str]]:
100
+ """
101
+ Run Repomix on a GitHub repository and return combined Markdown.
102
+
103
+ :param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
104
+ or in the form ``owner/repo``.
105
+ :type repo_url_or_id: str
106
+ :param progress: Gradio progress tracker (UI only).
107
+ :type progress: gr.Progress
108
+ :return: A tuple ``(content, output_path)`` where ``content`` is the
109
+ combined Markdown or an error string starting with ``"Error"``, and
110
+ ``output_path`` is the temp file path (or ``None``).
111
+ :rtype: Tuple[str, Optional[str]]
 
 
 
 
112
  """
113
  progress(0, desc="Starting Repomix processing...")
114
  try:
 
124
  progress(0.2, desc=f"Running Repomix on {repo_url}...")
125
  cmd = [
126
  "repomix",
127
+ "--remote", repo_url,
128
+ "--output", output_file_path,
129
+ "--style", "markdown",
 
 
 
130
  "--compress",
131
  ]
 
132
  process = subprocess.run(
133
  cmd, capture_output=True, text=True, check=False, encoding="utf-8"
134
  )
 
169
  depth: int,
170
  progress: gr.Progress = gr.Progress(track_tqdm=True),
171
  ) -> Tuple[str, str]:
172
+ """
173
+ Recursively scrape a website and convert pages to Markdown.
174
+
175
+ :param url: Starting URL to scrape.
176
+ :type url: str
177
+ :param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
178
+ :type depth: int
179
+ :param progress: Gradio progress tracker (UI only).
180
+ :type progress: gr.Progress
181
+ :return: A tuple ``(combined_markdown, tmp_md_path)``.
182
+ :rtype: Tuple[str, str]
 
 
 
 
 
183
  """
184
  progress(0, desc=f"Starting web scrape for {url}...")
185
  visited_urls = set()
 
199
  progress_val = (
200
  link_index / total_links_estimate if total_links_estimate > 0 else 0
201
  )
202
+ progress(
203
+ progress_val,
204
+ desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})",
205
+ )
206
  html_content = Scraper.fetch_html(current_url)
207
  except Exception as e:
208
  return f"Error fetching {current_url}: {str(e)}\n"
 
244
 
245
 
246
  def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
247
+ """
248
+ Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
249
+
250
+ :param markdown_content: The Markdown body to embed.
251
+ :type markdown_content: str
252
+ :param source_url_or_id: Original input string identifying the source.
253
+ :type source_url_or_id: str
254
+ :return: Pretty-printed JSON string.
255
+ :rtype: str
256
  """
257
  data = {"source": source_url_or_id, "content": markdown_content}
258
  return json.dumps(data, indent=2)
259
 
260
 
261
  def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
262
+ """
263
+ Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
264
+
265
+ :param markdown_content: The Markdown body to store.
266
+ :type markdown_content: str
267
+ :param source_url_or_id: Original input string identifying the source.
268
+ :type source_url_or_id: str
269
+ :return: Path to the created CSV file.
270
+ :rtype: str
271
  """
272
  output = tempfile.NamedTemporaryFile(
273
  mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
 
284
  output_format: str,
285
  source_url_or_id: str,
286
  ) -> str:
287
+ """
288
+ Save processed content in the selected format and return a file path.
289
+
290
+ :param content: The raw Markdown to save or convert.
291
+ :type content: str
292
+ :param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
293
+ :type output_format: str
294
+ :param source_url_or_id: Original input string identifying the source.
295
+ :type source_url_or_id: str
296
+ :return: Path to a temporary file holding the artifact.
297
+ :rtype: str
298
+
299
+ .. note::
300
+ PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
301
+ CSV uses a 2-column schema: ``['source','content']``.
302
  """
303
  processed_content = content # default for Markdown/Text
304
 
 
310
  elif output_format == "Text":
311
  suffix = ".txt"
312
  elif output_format == "PDF":
 
313
  try:
314
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
315
  pdf_output_path = tmp_pdf.name
 
320
  # Fallback: persist as Markdown with .pdf.md suffix.
321
  print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
322
  suffix = ".pdf.md"
 
323
  else:
324
  suffix = ".md"
325
 
 
331
 
332
 
333
  # ----------------------------------------------------------
334
+ # Main tool function (exposed to MCP via SSE)
335
  # ----------------------------------------------------------
336
 
337
  def process_input_updated(
 
341
  output_format_selection: str,
342
  progress: gr.Progress = gr.Progress(track_tqdm=True),
343
  ) -> Tuple[str, str, Optional[str]]:
344
+ """
345
+ Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
346
+
347
+ This function is exposed to MCP clients via the Spaces SSE endpoint
348
+ ``/gradio_api/mcp/sse``.
349
+
350
+ :param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
351
+ For GitHub, either ``owner/repo`` or a full GitHub URL
352
+ (e.g., ``https://github.com/owner/repo``).
353
+ :type url_or_id: str
354
+ :param source_type: Select the content source. One of
355
+ {``"Webpage"``, ``"GitHub Repository"``}.
356
+ :type source_type: str
357
+ :param depth: Crawl depth for webpages. Integer in the range 0–3 where
358
+ 0 = only the main page. **Ignored** when ``source_type`` is
359
+ ``"GitHub Repository"``.
360
+ :type depth: int
361
+ :param output_format_selection: Desired output format. One of
362
+ {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
363
+ :type output_format_selection: str
364
+ :param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
365
+ :type progress: gr.Progress
366
+
367
+ :returns: A 3-tuple:
368
+ - **status** (*str*): Human-readable status line.
369
+ - **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
370
+ - **file_path** (*Optional[str]*): Path to the generated artifact for download,
371
+ or ``None`` on error.
372
+ :rtype: Tuple[str, str, Optional[str]]
373
+
374
+ :raises Exception: (caught internally) Unexpected processing errors are surfaced
375
+ as a user-facing status with details in the preview.
 
376
  """
377
  progress(0, desc="Initializing...")
378
  raw_content = ""
 
413
  if output_format_selection == "JSON":
414
  preview_content = convert_to_json(raw_content, url_or_id)
415
  elif output_format_selection == "CSV" and output_file_path:
416
+ # Show a small preview of the CSV
417
  try:
418
  with open(output_file_path, "r", encoding="utf-8") as f_csv:
419
  csv_preview_lines = [next(f_csv) for _ in range(5)]
 
426
  elif output_format_selection == "CSV" and not output_file_path:
427
  preview_content = "[CSV file path not available for preview]"
428
  elif output_format_selection == "PDF":
429
+ # Can't render PDF in text preview
430
  preview_content = (
431
  f"[PDF generated. Download to view: "
432
  f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
 
520
 
521
  **Notes**
522
  - PDF generation requires the `markdown-pdf` library.
523
+ - Designed for Docker/Hugging Face Spaces.
524
  - MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
525
  """
526
  )
 
532
  )
533
 
534
  if __name__ == "__main__":
535
+ # Enable queuing for concurrency; Spaces generally manage hosting.
536
  iface.queue().launch(share=True)