Spaces:
Sleeping
Sleeping
File size: 13,476 Bytes
2893e36 2471025 2893e36 5458065 315dac2 2893e36 5458065 36555b1 5458065 2893e36 1151f26 d70a98e 2471025 5458065 7af8fe7 1151f26 5458065 2893e36 7af8fe7 1303e35 5458065 7af8fe7 1303e35 5458065 7af8fe7 36555b1 1303e35 7af8fe7 5458065 36555b1 7af8fe7 5458065 36555b1 7af8fe7 36555b1 1303e35 36555b1 2893e36 5458065 7af8fe7 36555b1 7af8fe7 2d6afaa 7af8fe7 2d6afaa 7af8fe7 2d6afaa 36555b1 7af8fe7 2d6afaa 36555b1 5458065 36555b1 5458065 7af8fe7 2d6afaa 7af8fe7 5458065 7af8fe7 2d6afaa 36555b1 7af8fe7 5458065 7af8fe7 5458065 7af8fe7 5458065 7af8fe7 5458065 7af8fe7 5458065 7af8fe7 2471025 7af8fe7 2471025 2893e36 7af8fe7 2471025 7af8fe7 2893e36 7af8fe7 2d6afaa 5458065 7af8fe7 2893e36 20dc7c9 5458065 2893e36 5458065 2893e36 23a446a 5458065 2893e36 36555b1 5458065 2d6afaa 36555b1 5458065 36555b1 5458065 2d6afaa 5458065 36555b1 5458065 2d6afaa 36555b1 2d6afaa 1151f26 36555b1 5458065 2893e36 5458065 2d6afaa 5458065 9f97f87 5458065 2471025 5458065 2471025 5458065 36555b1 20dc7c9 9f97f87 36555b1 2893e36 2d6afaa 36555b1 20dc7c9 2893e36 5458065 36555b1 5458065 36555b1 5458065 36555b1 2d6afaa 36555b1 2d6afaa 36555b1 2471025 36555b1 2d6afaa 5458065 36555b1 5458065 36555b1 5458065 36555b1 5458065 1151f26 36555b1 97b889a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 |
# app.py
from __future__ import annotations
import os
import csv
import json
import re
import subprocess
import tempfile
from typing import Optional, Tuple, Literal
import gradio as gr
import markdown_pdf
from typing_extensions import Annotated, Doc
from pydantic import BaseModel, Field, conint
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
# -----------------------------
# Environment (HF cache dir)
# -----------------------------
# Point the Hugging Face cache at /tmp (the writable location in Space
# containers) and make sure the directory exists before anything uses it.
_HF_CACHE_DIR = "/tmp/hf_cache"
os.environ["HF_HOME"] = _HF_CACHE_DIR
os.makedirs(_HF_CACHE_DIR, exist_ok=True)
# -----------------------------
# Helper utilities
# -----------------------------
def check_repomix_installed() -> bool:
    """Return True if the `repomix` CLI is available on PATH and runs.

    Invokes `repomix --version` rather than merely probing PATH so a broken
    install (present but not executable) is also detected.
    """
    try:
        result = subprocess.run(
            ["repomix", "--version"],
            capture_output=True,
            text=True,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # FileNotFoundError (missing binary) and other launch failures mean
        # "not installed"; a bare `except Exception` would also hide
        # programming errors, so keep the catch narrow.
        return False
    return result.returncode == 0
def run_repomix(
    repo_url_or_id: str,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, Optional[str]]:
    """Run Repomix on a GitHub repo and return (markdown, file_path).

    Args:
        repo_url_or_id: Either "owner/repo" shorthand or a full GitHub URL.
        progress: Gradio progress tracker updated during the run.

    Returns:
        (markdown_text, path) on success; on failure the first element is a
        string starting with "Error" and the path is None.

    Note: the Repomix output is copied to a persistent temp file before the
    TemporaryDirectory is cleaned up — previously the function returned a
    path inside the temp dir, which was already deleted by the time callers
    saw it (a dangling path).
    """
    progress(0, desc="Starting Repomix…")
    try:
        with tempfile.TemporaryDirectory() as td:
            out_path = os.path.join(td, "repomix-output.md")
            # Expand "owner/repo" shorthand into a full GitHub URL.
            repo_url = (
                f"https://github.com/{repo_url_or_id}"
                if ("/" in repo_url_or_id and not repo_url_or_id.startswith("http"))
                else repo_url_or_id
            )
            cmd = [
                "repomix",
                "--remote",
                repo_url,
                "--output",
                out_path,
                "--style",
                "markdown",
                "--compress",
            ]
            p = subprocess.run(
                cmd, capture_output=True, text=True, check=False, encoding="utf-8"
            )
            progress(0.8, desc="Repomix done.")
            if p.returncode != 0:
                err = (
                    f"Return Code: {p.returncode}\nStderr: {p.stderr}\nStdout: {p.stdout}"
                )
                return f"Error running Repomix:\n{err}", None
            if os.path.exists(out_path):
                with open(out_path, "r", encoding="utf-8") as f:
                    content = f.read()
                # Persist the markdown outside the TemporaryDirectory so the
                # returned path outlives this function.
                with tempfile.NamedTemporaryFile(
                    mode="w", delete=False, suffix=".md", encoding="utf-8"
                ) as persisted:
                    persisted.write(content)
                return content, persisted.name
            return "Error: Repomix did not produce an output file.", None
    except Exception as e:
        # UI boundary: surface any unexpected failure as an "Error" status
        # string rather than crashing the Gradio handler.
        progress(1, desc="Error")
        return f"Error processing GitHub repository: {e}", None
def scrape_and_convert_website(
    url: str,
    depth: int,
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str]:
    """Crawl `url` (following internal links up to `depth` levels) and return
    the combined Markdown plus the path of a temp file holding the same text."""
    progress(0, desc=f"Scraping {url}…")
    seen = set()

    def crawl(page: str, remaining: int, total: int = 1, index: int = 0) -> str:
        # Skip already-visited pages and stop once the depth budget is spent.
        if page in seen or remaining < 0:
            return ""
        seen.add(page)
        try:
            progress(index / total if total > 0 else 0, desc=f"Scraping: {page}")
            html = Scraper.fetch_html(page)
        except Exception as e:
            return f"Error fetching {page}: {e}\n"
        body = Converter.html_to_markdown(
            html=html, base_url=page, parser_features="html.parser", ignore_links=True
        )
        result = f"## Extracted from: {page}\n\n" + body + "\n\n"
        if remaining > 0:
            try:
                links = LinkExtractor.scrape_url(page, link_type=LinkType.INTERNAL)
                pending = [
                    link
                    for link in links
                    if URLUtils.is_internal(link, page) and link not in seen
                ]
                for pos, child in enumerate(pending):
                    result += crawl(child, remaining - 1, len(pending), pos)
            except Exception as e:
                result += f"Error extracting links from {page}: {e}\n"
        return result

    combined = crawl(url, depth)
    # Persist the full dump so the UI can offer it as a download.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".md", encoding="utf-8"
    ) as handle:
        handle.write(combined)
    return combined, handle.name
def convert_to_json(markdown_content: str, source: str) -> str:
    """Serialize the Markdown and its source identifier as pretty-printed JSON."""
    payload = {"source": source, "content": markdown_content}
    return json.dumps(payload, indent=2)
def convert_to_csv(markdown_content: str, source: str) -> str:
    """Write a 2-column CSV (source, content) to a temp file and return its path.

    The file is created with delete=False so the path remains valid for the
    Gradio download component after this function returns. Using a context
    manager (instead of a manual close()) guarantees the handle is flushed
    and closed even if the CSV writer raises.
    """
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
    ) as f:
        writer = csv.writer(f)
        writer.writerow(["source", "content"])
        writer.writerow([source, markdown_content])
        return f.name
def save_output_to_file(content: str, fmt: str, source: str) -> str:
    """Persist `content` in the selected format and return the file path.

    Args:
        content: Markdown text to export.
        fmt: One of "Markdown" (default), "JSON", "CSV", "Text", "PDF".
        source: Origin identifier embedded into JSON/CSV exports.

    Returns:
        Path of the written temp file.
    """
    if fmt == "JSON":
        data = convert_to_json(content, source)
        suffix = ".json"
    elif fmt == "CSV":
        # convert_to_csv writes its own temp file and returns the path.
        return convert_to_csv(content, source)
    elif fmt == "Text":
        data, suffix = content, ".txt"
    elif fmt == "PDF":
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
                path = tmp_pdf.name
            # markdown_pdf's public API is add_section(Section(...)) + save();
            # the previously-called convert_from_string() does not exist, so
            # the PDF branch always raised and fell through to the fallback.
            pdf = markdown_pdf.MarkdownPdf(toc_level=2)
            pdf.add_section(markdown_pdf.Section(content))
            pdf.save(path)
            return path
        except Exception as e:
            # Best-effort fallback: keep the Markdown source with a marker
            # suffix instead of failing the whole request.
            print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
            data, suffix = content, ".pdf.md"
    else:
        data, suffix = content, ".md"
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=suffix, encoding="utf-8"
    ) as tmp:
        tmp.write(data)
        return tmp.name
# -----------------------------
# Core UI-bound function
# -----------------------------
def process_input_updated(
    url_or_id: str,
    source_type: Literal["Webpage", "GitHub Repository"],
    depth: int,
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
    progress: gr.Progress = gr.Progress(track_tqdm=True),
) -> Tuple[str, str, Optional[str]]:
    """
    UI function: scrape a webpage (with depth) or dump a GitHub repo (Repomix),
    then export as Markdown/JSON/CSV/Text/PDF.

    Returns a (status, preview, file_path) triple matching the three Gradio
    outputs; file_path is None whenever processing failed.
    """
    progress(0, desc="Initializing…")
    out_path: Optional[str] = None
    if source_type == "GitHub Repository":
        if not check_repomix_installed():
            return "Repomix is not installed or not accessible.", "", None
        raw, _ = run_repomix(url_or_id, progress=progress)
        # The helpers signal failure by returning a string starting with
        # "Error" rather than raising; propagate it as the status line.
        if raw.startswith("Error"):
            return raw, "", None
    elif source_type == "Webpage":
        raw, _ = scrape_and_convert_website(url_or_id, depth, progress=progress)
        if raw.startswith("Error"):
            return raw, "", None
    else:
        return "Invalid source type selected.", "", None
    try:
        progress(0.9, desc=f"Converting to {output_format_selection}…")
        out_path = save_output_to_file(raw, output_format_selection, url_or_id)
        # Default preview is the raw Markdown; overridden per format below.
        preview = raw
        if output_format_selection == "JSON":
            preview = convert_to_json(raw, url_or_id)
        elif output_format_selection == "CSV":
            try:
                # Show only the first 5 CSV lines; next() raises
                # StopIteration when the file is shorter than that.
                with open(out_path, "r", encoding="utf-8") as f:
                    first_lines = [next(f) for _ in range(5)]
                preview = "".join(first_lines) or "[CSV content is empty or very short]"
            except StopIteration:
                # Short file: re-open and show the whole thing instead.
                with open(out_path, "r", encoding="utf-8") as f:
                    preview = f.read() or "[CSV content is empty]"
            except Exception as e:
                preview = f"[Error reading CSV for preview: {e}]"
        elif output_format_selection == "PDF":
            from os.path import basename
            # PDF is binary; show a download hint instead of raw bytes.
            preview = (
                f"[PDF generated. Download to view: "
                f"{basename(out_path) if out_path else 'file.pdf'}]"
            )
        progress(1, desc="Done.")
        return f"Successfully processed: {url_or_id}", preview, out_path
    except Exception as e:
        # UI boundary: report conversion failures as a status string.
        return f"Error during conversion: {e}", "", None
# -----------------------------
# Pydantic models for MCP tool
# -----------------------------
class ProcessArgs(BaseModel):
    """Input schema for the MCP tool.

    Field descriptions are carried into the generated JSON Schema, which is
    what MCP clients read to present parameter documentation.
    """
    # Webpage URL or GitHub "owner/repo" shorthand / full URL.
    url_or_id: str = Field(
        ...,
        description=(
            "For webpages, a full URL (e.g., https://example.com). "
            "For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."
        ),
    )
    # Selects which processing path process_input_updated takes.
    source_type: Literal["Webpage", "GitHub Repository"] = Field(
        ...,
        description='Choose the source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.',
    )
    # Constrained int: validated to the 0–3 range the UI slider allows.
    depth: conint(ge=0, le=3) = Field(
        ...,
        description="Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub.",
    )
    output_format_selection: Literal["Markdown", "JSON", "CSV", "Text", "PDF"] = Field(
        ...,
        description="Desired output format for the processed content.",
    )
class ProcessResult(BaseModel):
    """Output schema for the MCP tool; mirrors process_input_updated's triple."""
    status: str = Field(..., description="Human-readable status line.")
    preview: str = Field(
        ...,
        description="Preview text (Markdown/JSON/Text), or a short note for CSV/PDF.",
    )
    # None whenever processing failed and no artifact was produced.
    file_path: Optional[str] = Field(
        None, description="Temp file path for the artifact, or null if not created."
    )
def process_input_mcp(args: ProcessArgs) -> ProcessResult:
    """Pydantic-typed wrapper around process_input_updated for MCP clients.

    The typed signature lets MCP derive a JSON Schema with per-field
    descriptions from the ProcessArgs / ProcessResult models.
    """
    status_line, preview_text, artifact = process_input_updated(
        args.url_or_id,
        args.source_type,
        int(args.depth),
        args.output_format_selection,
    )
    return ProcessResult(status=status_line, preview=preview_text, file_path=artifact)
# -----------------------------
# Gradio UI
# -----------------------------
# Layout note: component construction order inside Blocks defines the UI
# layout, so statements here must not be reordered.
with gr.Blocks(title="RAG-Ready Content Scraper", theme="CultriX/gradio-theme") as ui_iface:
    gr.Markdown("# RAG-Ready Content Scraper")
    gr.Markdown(
        "Scrape webpage content or GitHub repositories to generate RAG-ready datasets."
    )
    with gr.Row():
        # Left column: all inputs plus the submit button.
        with gr.Column(scale=2):
            url_input = gr.Textbox(
                label="Enter URL or GitHub Repository ID",
                placeholder="https://example.com or owner/repo",
            )
            source_type_input = gr.Radio(
                choices=["Webpage", "GitHub Repository"],
                value="Webpage",
                label="Select Source Type",
            )
            # Only consulted on the "Webpage" path; GitHub ignores it.
            depth_input = gr.Slider(
                minimum=0,
                maximum=3,
                step=1,
                value=0,
                label="Scraping Depth (for Webpages)",
                info="0 = only main page. Ignored for GitHub.",
            )
            output_format_input = gr.Dropdown(
                choices=["Markdown", "JSON", "CSV", "Text", "PDF"],
                value="Markdown",
                label="Select Output Format",
            )
            submit_button = gr.Button("Process Content", variant="primary")
        # Right column: the three outputs matching process_input_updated's
        # (status, preview, file_path) return triple.
        with gr.Column(scale=3):
            status_output = gr.Textbox(label="Status", interactive=False)
            preview_output = gr.Code(
                label="Preview Content", language="markdown", interactive=False
            )
            file_download_output = gr.File(
                label="Download Processed File", interactive=False
            )
    # Clickable examples; cache disabled since runs hit the network.
    gr.Examples(
        examples=[
            ["https://gradio.app/docs/js", "Webpage", 1, "Markdown"],
            ["gradio-app/gradio", "GitHub Repository", 0, "Text"],
            [
                "https://en.wikipedia.org/wiki/Retrieval-augmented_generation",
                "Webpage",
                0,
                "JSON",
            ],
        ],
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
        fn=process_input_updated,
        cache_examples=False,
    )
    submit_button.click(
        fn=process_input_updated,
        inputs=[url_input, source_type_input, depth_input, output_format_input],
        outputs=[status_output, preview_output, file_download_output],
    )
# -----------------------------
# MCP-only Interface (Pydantic tool)
# -----------------------------
# We expose a second interface whose *function signature* uses Pydantic models.
# MCP reads this signature to build a JSON Schema with rich field descriptions.
mcp_iface = gr.Interface(
    fn=process_input_mcp,
    # Components are placeholders; MCP ignores them and reads the Python types
    # (ProcessArgs / ProcessResult) from the function signature instead.
    # Keep them simple so the tab is still usable if someone clicks it.
    inputs=gr.JSON(label="ProcessArgs (JSON)"),
    outputs=gr.JSON(label="ProcessResult (JSON)"),
    title="MCP Tool: process_input_mcp",
    description="Pydantic-typed MCP tool exposing rich parameter descriptions.",
    # NOTE(review): allow_flagging is deprecated in newer Gradio releases in
    # favor of flagging_mode — confirm against the pinned Gradio version.
    allow_flagging="never",
)
# Combine the user UI and the MCP tool as two tabs (the second can be ignored by users).
app = gr.TabbedInterface([ui_iface, mcp_iface], tab_names=["App", "MCP"])
if __name__ == "__main__":
    # IMPORTANT: enable MCP on launch so Spaces exposes /gradio_api/mcp/sse
    server = app.queue()
    server.launch(share=True, mcp_server=True)