File size: 13,476 Bytes
2893e36
2471025
2893e36
5458065
 
 
 
 
 
315dac2
2893e36
 
 
5458065
36555b1
5458065
2893e36
1151f26
 
d70a98e
 
2471025
5458065
 
 
7af8fe7
 
1151f26
5458065
 
 
 
2893e36
7af8fe7
1303e35
5458065
 
 
 
 
 
7af8fe7
1303e35
 
 
5458065
 
 
 
 
7af8fe7
36555b1
1303e35
7af8fe7
 
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
7af8fe7
5458065
 
 
36555b1
7af8fe7
 
 
36555b1
1303e35
36555b1
 
2893e36
5458065
 
 
 
 
 
7af8fe7
36555b1
7af8fe7
2d6afaa
7af8fe7
 
2d6afaa
7af8fe7
2d6afaa
36555b1
7af8fe7
2d6afaa
36555b1
5458065
36555b1
5458065
 
 
 
 
7af8fe7
2d6afaa
7af8fe7
5458065
 
 
 
 
7af8fe7
 
2d6afaa
36555b1
7af8fe7
 
 
5458065
 
 
7af8fe7
 
 
5458065
7af8fe7
 
 
 
5458065
7af8fe7
 
5458065
 
 
7af8fe7
 
 
 
 
 
5458065
7af8fe7
 
 
 
2471025
7af8fe7
 
 
 
 
2471025
2893e36
7af8fe7
 
 
2471025
 
7af8fe7
2893e36
7af8fe7
2d6afaa
5458065
 
 
7af8fe7
 
2893e36
20dc7c9
5458065
 
 
2893e36
5458065
 
 
 
2893e36
 
23a446a
5458065
 
2893e36
36555b1
5458065
2d6afaa
 
 
36555b1
5458065
36555b1
5458065
2d6afaa
5458065
36555b1
5458065
2d6afaa
36555b1
2d6afaa
1151f26
36555b1
5458065
2893e36
5458065
2d6afaa
5458065
 
9f97f87
5458065
 
 
2471025
5458065
 
 
 
2471025
5458065
 
 
36555b1
 
20dc7c9
9f97f87
36555b1
 
2893e36
2d6afaa
36555b1
20dc7c9
2893e36
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
5458065
36555b1
 
 
 
2d6afaa
36555b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d6afaa
36555b1
2471025
36555b1
2d6afaa
 
5458065
 
 
36555b1
 
5458065
 
36555b1
 
5458065
 
 
 
 
 
 
36555b1
 
 
5458065
1151f26
36555b1
97b889a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# app.py
from __future__ import annotations

import os
import csv
import json
import re
import subprocess
import tempfile
from typing import Optional, Tuple, Literal

import gradio as gr
import markdown_pdf
from typing_extensions import Annotated, Doc

from pydantic import BaseModel, Field, conint

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

# -----------------------------
# Environment (HF cache dir)
# -----------------------------
# Redirect the Hugging Face cache to /tmp before any HF-aware library reads
# HF_HOME. NOTE(review): presumably needed because the Spaces app directory
# is read-only at runtime — confirm for the target deployment.
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Check whether the `repomix` CLI can be invoked.

    Runs `repomix --version` and treats a zero exit status as installed.
    Any failure to launch the process at all (missing binary, permission
    error, etc.) is reported as False rather than raised.
    """
    try:
        result = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception:
        # Could not even start the process: treat as "not installed".
        return False
    return result.returncode == 0


def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string).

    Args:
        repo_url_or_id: Either an "owner/repo" shorthand or a full GitHub URL.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (markdown_or_error, file_path). On success, `file_path` points at a
        persistent temp file containing the Markdown; on failure it is None
        and the first element starts with "Error".
    """
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as workdir:
            scratch_path = os.path.join(workdir, "repomix-output.md")
            # Accept "owner/repo" shorthand by expanding it to a full URL.
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote",
                repo_url,
                "--output",
                scratch_path,
                "--style",
                "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = (
                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                )
                return f"Error running Repomix:\n{err}", None
            if not os.path.exists(scratch_path):
                return "Error: Repomix did not produce an output file.", None
            with open(scratch_path, "r", encoding="utf-8") as f:
                content = f.read()
            # BUGFIX: the original returned a path inside the TemporaryDirectory,
            # which is deleted as soon as this `with` block exits, leaving the
            # caller with a dangling path. Persist the content to a temp file
            # that survives the cleanup instead.
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".md", encoding="utf-8"
            ) as persistent:
                persistent.write(content)
                return content, persistent.name
    except Exception as e:
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None


def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Recursively scrape a website and convert visited pages to Markdown.

    Args:
        url: Starting page URL.
        depth: How many link levels to follow (0 = only the start page).
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (combined_markdown, temp_file_path). Per-page fetch/link errors are
        embedded in the Markdown rather than raised.
    """
    progress(0, desc=f"Scraping {url}…")
    seen = set()

    def crawl(page_url: str, remaining: int, total: int = 1, index: int = 0) -> str:
        # Skip pages already visited or beyond the depth budget.
        if page_url in seen or remaining < 0:
            return ""
        seen.add(page_url)
        try:
            progress(index / total if total > 0 else 0, desc=f"Scraping: {page_url}")
            html = Scraper.fetch_html(page_url)
        except Exception as e:
            return f"Error fetching {page_url}: {e}\n"
        body = Converter.html_to_markdown(
            html=html,
            base_url=page_url,
            parser_features="html.parser",
            ignore_links=True,
        )
        page_md = f"## Extracted from: {page_url}\n\n{body}\n\n"
        if remaining > 0:
            try:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                candidates = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, page_url) and link not in seen
                ]
                for pos, child in enumerate(candidates):
                    page_md += crawl(child, remaining - 1, len(candidates), pos)
            except Exception as e:
                page_md += f"Error extracting links from {page_url}: {e}\n"
        return page_md

    all_md = crawl(url, depth)
    # Persist the combined Markdown so the UI can offer it as a download.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(all_md)
        return all_md, tmp.name


def convert_to_json(markdown_content: str, source: str) -> str:
    """Serialize the Markdown and its source into a pretty-printed JSON string."""
    payload = {"source": source, "content": markdown_content}
    return json.dumps(payload, indent=2)


def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a simple 2-column CSV (header plus one data row) and return its path.

    The file is created with delete=False so it survives for download; the
    caller is responsible for eventual cleanup.

    Args:
        markdown_content: Content for the "content" column.
        source: Value for the "source" column.

    Returns:
        Path to the temporary CSV file.
    """
    # Use a context manager so the handle is closed even if a write fails
    # (the original closed it manually, leaking the handle on error).
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    ) as f:
        writer = csv.writer(f)
        writer.writerow(["source", "content"])
        writer.writerow([source, markdown_content])
        return f.name


def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path.

    Unknown formats fall back to Markdown. A failed PDF render falls back to
    saving the raw Markdown with a ".pdf.md" suffix.
    """
    # CSV writes its own file; short-circuit before the generic text path.
    if fmt == "CSV":
        return convert_to_csv(content, source)

    if fmt == "JSON":
        data, suffix = convert_to_json(content, source), ".json"
    elif fmt == "Text":
        data, suffix = content, ".txt"
    elif fmt == "PDF":
        try:
            # Reserve a stable path first, then render into it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                path = tmp_pdf.name
            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
            return path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            data, suffix = content, ".pdf.md"
    else:
        # Default (including "Markdown"): plain Markdown.
        data, suffix = content, ".md"

    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as out_file:
        out_file.write(data)
        return out_file.name


# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.

    Args:
        url_or_id: Full URL for webpages; owner/repo or full URL for GitHub.
        source_type: "Webpage" or "GitHub Repository".
        depth: Crawl depth for webpages; ignored for GitHub repos.
        output_format_selection: Target export format.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (status message, preview text, output file path or None on failure).
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None

    # Acquire the raw Markdown. The helpers signal failure by returning a
    # string starting with "Error", which is propagated as the status.
    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)

        # Build a format-appropriate preview; default is the raw Markdown.
        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                # Show only the first five lines of the generated CSV.
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                # Fewer than five lines: re-read and show the whole file.
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            from os.path import basename

            # PDFs are binary; point the user at the download instead.
            preview = (
                f"[PDF generated. Download to view: "
                f"{basename(out_path) if out_path else 'file.pdf'}]"
            )

        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path

    except Exception as e:
        return f"Error during conversion: {e}", "", None


# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    """Input schema for the MCP tool `process_input_mcp`.

    Field descriptions are surfaced to MCP clients via the generated
    JSON Schema.
    """

    # Webpage URL or GitHub repo identifier.
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
        ),
    )
    # Selects the scraping backend (HTML crawler vs. Repomix).
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    # Crawl depth, constrained to 0–3 to keep scrape times bounded.
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    # Export format for the processed content.
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )


class ProcessResult(BaseModel):
    """Output schema for the MCP tool `process_input_mcp`."""

    # Mirrors the first element of process_input_updated's return tuple.
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    # Path of the generated artifact; None when processing failed.
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )


def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """
    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
    """
    # Delegate to the UI function, then repackage its tuple as a model.
    status, preview, path = process_input_updated(
        url_or_id=args.url_or_id,
        source_type=args.source_type,
        depth=int(args.depth),
        output_format_selection=args.output_format_selection,
    )
    return ProcessResult(status=status, preview=preview, file_path=path)


# -----------------------------
# Gradio UI
# -----------------------------
# Main user-facing tab: input controls on the left, status/preview/download
# on the right. Both the submit button and the examples route through
# process_input_updated.
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com  or  owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            # Depth only affects the "Webpage" source type.
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )

    # Clickable examples; not cached since results depend on live sites/repos.
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False,
    )

    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )

# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types.
    # Keep them simple so the tab is usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm the pinned Gradio version accepts it.
    allow_flagging="never",
)

# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])


if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    app.queue().launch(share=True, mcp_server=True)