File size: 13,476 Bytes
2893e36
2471025
2893e36
5458065
 
 
 
 
 
315dac2
2893e36
 
 
5458065
36555b1
5458065
2893e36
1151f26
 
d70a98e
 
2471025
5458065
 
 
7af8fe7
 
1151f26
5458065
 
 
 
2893e36
7af8fe7
1303e35
5458065
 
 
 
 
 
7af8fe7
1303e35
 
 
5458065
 
 
 
 
7af8fe7
36555b1
1303e35
7af8fe7
 
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
7af8fe7
5458065
 
 
36555b1
7af8fe7
 
 
36555b1
1303e35
36555b1
 
2893e36
5458065
 
 
 
 
 
7af8fe7
36555b1
7af8fe7
2d6afaa
7af8fe7
 
2d6afaa
7af8fe7
2d6afaa
36555b1
7af8fe7
2d6afaa
36555b1
5458065
36555b1
5458065
 
 
 
 
7af8fe7
2d6afaa
7af8fe7
5458065
 
 
 
 
7af8fe7
 
2d6afaa
36555b1
7af8fe7
 
 
5458065
 
 
7af8fe7
 
 
5458065
7af8fe7
 
 
 
5458065
7af8fe7
 
5458065
 
 
7af8fe7
 
 
 
 
 
5458065
7af8fe7
 
 
 
2471025
7af8fe7
 
 
 
 
2471025
2893e36
7af8fe7
 
 
2471025
 
7af8fe7
2893e36
7af8fe7
2d6afaa
5458065
 
 
7af8fe7
 
2893e36
20dc7c9
5458065
 
 
2893e36
5458065
 
 
 
2893e36
 
23a446a
5458065
 
2893e36
36555b1
5458065
2d6afaa
 
 
36555b1
5458065
36555b1
5458065
2d6afaa
5458065
36555b1
5458065
2d6afaa
36555b1
2d6afaa
1151f26
36555b1
5458065
2893e36
5458065
2d6afaa
5458065
 
9f97f87
5458065
 
 
2471025
5458065
 
 
 
2471025
5458065
 
 
36555b1
 
20dc7c9
9f97f87
36555b1
 
2893e36
2d6afaa
36555b1
20dc7c9
2893e36
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
5458065
 
 
 
 
 
 
 
 
 
 
 
 
 
36555b1
5458065
36555b1
 
 
 
2d6afaa
36555b1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d6afaa
36555b1
2471025
36555b1
2d6afaa
 
5458065
 
 
36555b1
 
5458065
 
36555b1
 
5458065
 
 
 
 
 
 
36555b1
 
 
5458065
1151f26
36555b1
97b889a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
# app.py
from __future__ import annotations

import os
import csv
import json
import re
import subprocess
import tempfile
from typing import Optional, Tuple, Literal

import gradio as gr
import markdown_pdf
from typing_extensions import Annotated, Doc

from pydantic import BaseModel, Field, conint

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

# -----------------------------
# Environment (HF cache dir)
# -----------------------------
# Redirect the Hugging Face cache to /tmp before any HF-aware library reads
# HF_HOME. NOTE(review): presumably needed because the Spaces app directory
# is read-only at runtime — confirm for the target deployment.
os.environ["HF_HOME"] = "/tmp/hf_cache"
os.makedirs(os.environ["HF_HOME"], exist_ok=True)


# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Check whether the `repomix` CLI can be invoked.

    Runs `repomix --version` and treats a zero exit status as installed.
    Any failure to launch the process at all (missing binary, permission
    error, etc.) is reported as False rather than raised.
    """
    try:
        result = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception:
        # Could not even start the process: treat as "not installed".
        return False
    return result.returncode == 0


def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return combined Markdown (or an Error string).

    Args:
        repo_url_or_id: Either an "owner/repo" shorthand or a full GitHub URL.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (markdown_or_error, file_path). On success, `file_path` points at a
        persistent temp file containing the Markdown; on failure it is None
        and the first element starts with "Error".
    """
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as workdir:
            scratch_path = os.path.join(workdir, "repomix-output.md")
            # Accept "owner/repo" shorthand by expanding it to a full URL.
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote",
                repo_url,
                "--output",
                scratch_path,
                "--style",
                "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = (
                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                )
                return f"Error running Repomix:\n{err}", None
            if not os.path.exists(scratch_path):
                return "Error: Repomix did not produce an output file.", None
            with open(scratch_path, "r", encoding="utf-8") as f:
                content = f.read()
            # BUGFIX: the original returned a path inside the TemporaryDirectory,
            # which is deleted as soon as this `with` block exits, leaving the
            # caller with a dangling path. Persist the content to a temp file
            # that survives the cleanup instead.
            with tempfile.NamedTemporaryFile(
                mode="w", delete=False, suffix=".md", encoding="utf-8"
            ) as persistent:
                persistent.write(content)
                return content, persistent.name
    except Exception as e:
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None


def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Recursively scrape a website and convert visited pages to Markdown.

    Args:
        url: Starting page URL.
        depth: How many link levels to follow (0 = only the start page).
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (combined_markdown, temp_file_path). Per-page fetch/link errors are
        embedded in the Markdown rather than raised.
    """
    progress(0, desc=f"Scraping {url}…")
    seen = set()

    def crawl(page_url: str, remaining: int, total: int = 1, index: int = 0) -> str:
        # Skip pages already visited or beyond the depth budget.
        if page_url in seen or remaining < 0:
            return ""
        seen.add(page_url)
        try:
            progress(index / total if total > 0 else 0, desc=f"Scraping: {page_url}")
            html = Scraper.fetch_html(page_url)
        except Exception as e:
            return f"Error fetching {page_url}: {e}\n"
        body = Converter.html_to_markdown(
            html=html,
            base_url=page_url,
            parser_features="html.parser",
            ignore_links=True,
        )
        page_md = f"## Extracted from: {page_url}\n\n{body}\n\n"
        if remaining > 0:
            try:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                candidates = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, page_url) and link not in seen
                ]
                for pos, child in enumerate(candidates):
                    page_md += crawl(child, remaining - 1, len(candidates), pos)
            except Exception as e:
                page_md += f"Error extracting links from {page_url}: {e}\n"
        return page_md

    all_md = crawl(url, depth)
    # Persist the combined Markdown so the UI can offer it as a download.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as tmp:
        tmp.write(all_md)
        return all_md, tmp.name


def convert_to_json(markdown_content: str, source: str) -> str:
    """Serialize the Markdown and its source into a pretty-printed JSON string."""
    payload = {"source": source, "content": markdown_content}
    return json.dumps(payload, indent=2)


def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a simple 2-column CSV (header plus one data row) and return its path.

    The file is created with delete=False so it survives for download; the
    caller is responsible for eventual cleanup.

    Args:
        markdown_content: Content for the "content" column.
        source: Value for the "source" column.

    Returns:
        Path to the temporary CSV file.
    """
    # Use a context manager so the handle is closed even if a write fails
    # (the original closed it manually, leaking the handle on error).
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    ) as f:
        writer = csv.writer(f)
        writer.writerow(["source", "content"])
        writer.writerow([source, markdown_content])
        return f.name


def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist content in the selected format (Markdown/JSON/CSV/Text/PDF) and return file path.

    Unknown formats fall back to Markdown. A failed PDF render falls back to
    saving the raw Markdown with a ".pdf.md" suffix.
    """
    # CSV writes its own file; short-circuit before the generic text path.
    if fmt == "CSV":
        return convert_to_csv(content, source)

    if fmt == "JSON":
        data, suffix = convert_to_json(content, source), ".json"
    elif fmt == "Text":
        data, suffix = content, ".txt"
    elif fmt == "PDF":
        try:
            # Reserve a stable path first, then render into it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                path = tmp_pdf.name
            markdown_pdf.MarkdownPdf(toc_level=2).convert_from_string(content, path)
            return path
        except Exception as e:
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            data, suffix = content, ".pdf.md"
    else:
        # Default (including "Markdown"): plain Markdown.
        data, suffix = content, ".md"

    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as out_file:
        out_file.write(data)
        return out_file.name


# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.

    Args:
        url_or_id: Full URL for webpages; owner/repo or full URL for GitHub.
        source_type: "Webpage" or "GitHub Repository".
        depth: Crawl depth for webpages; ignored for GitHub repos.
        output_format_selection: Target export format.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        (status message, preview text, output file path or None on failure).
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None

    # Acquire the raw Markdown. The helpers signal failure by returning a
    # string starting with "Error", which is propagated as the status.
    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None

    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)

        # Build a format-appropriate preview; default is the raw Markdown.
        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                # Show only the first five lines of the generated CSV.
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                # Fewer than five lines: re-read and show the whole file.
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            from os.path import basename

            # PDFs are binary; point the user at the download instead.
            preview = (
                f"[PDF generated. Download to view: "
                f"{basename(out_path) if out_path else 'file.pdf'}]"
            )

        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path

    except Exception as e:
        return f"Error during conversion: {e}", "", None


# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    """Input schema for the MCP tool `process_input_mcp`.

    Field descriptions are surfaced to MCP clients via the generated
    JSON Schema.
    """

    # Webpage URL or GitHub repo identifier.
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
        ),
    )
    # Selects the scraping backend (HTML crawler vs. Repomix).
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    # Crawl depth, constrained to 0–3 to keep scrape times bounded.
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    # Export format for the processed content.
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )


class ProcessResult(BaseModel):
    """Output schema for the MCP tool `process_input_mcp`."""

    # Mirrors the first element of process_input_updated's return tuple.
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    # Path of the generated artifact; None when processing failed.
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )


def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """
    MCP-friendly tool that accepts/returns Pydantic models (schema carries field descriptions).
    """
    # Delegate to the UI function, then repackage its tuple as a model.
    status, preview, path = process_input_updated(
        url_or_id=args.url_or_id,
        source_type=args.source_type,
        depth=int(args.depth),
        output_format_selection=args.output_format_selection,
    )
    return ProcessResult(status=status, preview=preview, file_path=path)


# -----------------------------
# Gradio UI
# -----------------------------
# Main user-facing tab: input controls on the left, status/preview/download
# on the right. Both the submit button and the examples route through
# process_input_updated.
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )

    with gr.Row():
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com  or  owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            # Depth only affects the "Webpage" source type.
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )

    # Clickable examples; not cached since results depend on live sites/repos.
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False,
    )

    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )

# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types.
    # Keep them simple so the tab is usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm the pinned Gradio version accepts it.
    allow_flagging="never",
)

# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])


if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    app.queue().launch(share=True, mcp_server=True)