Spaces:

CultriX
/

RAG-Scraper

Sleeping

App Files Community

CultriX committed on Sep 6, 2025

Commit

20dc7c9

verified ·

1 Parent(s): 88150f9

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -32

app.py CHANGED Viewed

@@ -116,74 +116,105 @@ def save_output_to_file(content: str, fmt: str, source: str) -> str:
         return tmp.name
 # ---------- MCP-exposed tool ----------
 def process_input_updated(
     url_or_id: Annotated[
         str,
-        Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or the full GitHub URL."),
     ],
     source_type: Annotated[
         Literal["Webpage", "GitHub Repository"],
-        Doc('Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
     ],
     depth: Annotated[
         int,
-        Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub."),
     ],
     output_format_selection: Annotated[
         Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
-        Doc("Desired output format for the processed content."),
     ],
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
-    Scrape a webpage (with configurable depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
-    Returns:
-        (status, preview, file_path)
     """
     # Pre-commit side of the diff: lines marked "-" were removed by this commit.
     progress(0, desc="Initializing…")
     # NOTE(review): `err` is assigned here but never read anywhere in this function.
-    raw, err = "", ""
-    out_path: Optional[str] = None
     # Fetch the raw content from the selected source. A result whose text starts
     # with "Error" is returned unchanged as the status, with no preview or file.
     if source_type == "GitHub Repository":
         if not check_repomix_installed():
-            return "Repomix is not installed or not accessible.", "", None
         # The second element of the helper's return value is discarded.
-        raw, _ = run_repomix(url_or_id, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
     elif source_type == "Webpage":
-        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
-        if raw.startswith("Error"):
-            return raw, "", None
     else:
         return "Invalid source type selected.", "", None
     # Save the content in the requested format, then build the preview string.
     try:
         progress(0.9, desc=f"Converting to {output_format_selection}…")
-        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
         # Default preview is the raw content; JSON, CSV and PDF override it below.
-        preview = raw
         if output_format_selection == "JSON":
-            preview = convert_to_json(raw, url_or_id)
-        elif output_format_selection == "CSV":
             # Preview the first five lines of the saved file. next() raises
             # StopIteration when the file has fewer than five lines, in which
             # case the whole file is read instead.
             try:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    first_lines = [next(f) for _ in range(5)]
-                preview = "".join(first_lines) or "[CSV content is empty or very short]"
             except StopIteration:
-                with open(out_path, "r", encoding="utf-8") as f:
-                    preview = f.read() or "[CSV content is empty]"
-            except Exception as e:
-                preview = f"[Error reading CSV for preview: {e}]"
         elif output_format_selection == "PDF":
             # The PDF itself is not previewed; only a note with the file's base name is shown.
-            from os.path import basename
-            preview = f"[PDF generated. Download to view: {basename(out_path) if out_path else 'file.pdf'}]"
-        progress(1, desc="Done.")
-        return f"Successfully processed: {url_or_id}", preview, out_path
     # Any failure while saving or previewing returns the raw content as the
     # preview and no file path.
     except Exception as e:
-        return f"Error during conversion: {e}", raw, None
 # ---------- UI ----------
 with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface:

         return tmp.name
 # ---------- MCP-exposed tool ----------
+from typing import Optional, Tuple, Literal
+from typing_extensions import Annotated, Doc
+import os
+import gradio as gr
 def process_input_updated(
     url_or_id: Annotated[
         str,
         Doc("For webpages: full URL (e.g. https://example.com). For GitHub: owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
     ],
     source_type: Annotated[
         Literal["Webpage", "GitHub Repository"],
         Doc('Choose source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
     ],
     depth: Annotated[
         int,
         Doc("Crawl depth for webpages (0–3). 0 = main page only. Ignored for GitHub repositories."),
     ],
     output_format_selection: Annotated[
         Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
         Doc("Output format for the processed content."),
     ],
     progress: gr.Progress = gr.Progress(track_tqdm=True),
 ) -> Tuple[str, str, Optional[str]]:
     """
     Scrape a webpage or dump a GitHub repo (Repomix), then export it to a file.

     Parameters
     ----------
     url_or_id : str
         For webpages: full URL (e.g. ``https://example.com``).
         For GitHub: either ``owner/repo`` or a full GitHub URL
         (e.g. ``https://github.com/owner/repo``).
         Surrounding whitespace is stripped; an empty value is rejected.
     source_type : {"Webpage", "GitHub Repository"}
         Choose the content source. Use **Webpage** to crawl HTML; use
         **GitHub Repository** to run Repomix.
     depth : int
         Crawl depth for webpages in the range 0–3 where 0 = only the main page.
         Ignored when ``source_type`` is ``"GitHub Repository"``.
     output_format_selection : {"Markdown", "JSON", "CSV", "Text", "PDF"}
         Desired output format for the processed content.
     progress : gr.Progress, optional
         (UI only) Gradio progress tracker. MCP callers can omit this.

     Returns
     -------
     (status, preview, file_path) : tuple[str, str, Optional[str]]
         - **status**: Human-readable status line.
         - **preview**: Full content for Markdown/Text, the JSON conversion for
           JSON, the first five lines of the file for CSV, or a short note for
           PDF. When saving/conversion fails, the raw content is returned here.
         - **file_path**: Path to the generated artifact, or ``None`` on error.
     """
     progress(0, desc="Initializing…")

     # Reject missing input up front instead of handing it to the scraper/Repomix.
     if not url_or_id or not url_or_id.strip():
         return "Error: no URL or repository identifier was provided.", "", None
     url_or_id = url_or_id.strip()

     if source_type == "GitHub Repository":
         if not check_repomix_installed():
             return "Repomix is not installed or not accessible. Please install it.", "", None
         raw_content, _ = run_repomix(url_or_id, progress=progress)
     elif source_type == "Webpage":
         raw_content, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
     else:
         return "Invalid source type selected.", "", None

     # Both helpers report failure through a message that starts with "Error".
     if raw_content.startswith("Error"):
         return raw_content, "", None

     output_file_path: Optional[str] = None
     try:
         progress(0.9, desc=f"Converting to {output_format_selection}…")
         output_file_path = save_output_to_file(raw_content, output_format_selection, url_or_id)

         if output_format_selection == "JSON":
             preview_content = convert_to_json(raw_content, url_or_id)
         elif output_format_selection == "CSV":
             if not output_file_path:
                 preview_content = "[CSV file path not available for preview]"
             else:
                 try:
                     with open(output_file_path, "r", encoding="utf-8") as f_csv:
                         # zip() stops after five lines or at end of file, whichever
                         # comes first, so short files need no special handling.
                         head_lines = [line for _, line in zip(range(5), f_csv)]
                     preview_content = "".join(head_lines) or "[CSV content is empty]"
                 except (OSError, UnicodeError) as e_csv_preview:
                     preview_content = f"[Error reading CSV for preview: {e_csv_preview}]"
         elif output_format_selection == "PDF":
             pdf_name = os.path.basename(output_file_path) if output_file_path else "file.pdf"
             preview_content = f"[PDF generated. Download to view: {pdf_name}]"
         else:
             # Markdown and Text are previewed as-is.
             preview_content = raw_content

         progress(1, desc="Processing complete.")
         return f"Successfully processed: {url_or_id}", preview_content, output_file_path
     except Exception as e:
         # The caller receives no path on failure, so a file that was already
         # written would be orphaned; remove it on a best-effort basis.
         if output_file_path and os.path.exists(output_file_path):
             try:
                 os.remove(output_file_path)
             except OSError:
                 pass
         return f"Error during file conversion/saving: {e}", raw_content, None
 # ---------- UI ----------
 with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as iface: