Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
"""
|
| 3 |
RAG-Ready Content Scraper — Gradio + MCP (SSE)
|
| 4 |
|
| 5 |
-
|
| 6 |
/gradio_api/mcp/sse
|
| 7 |
|
| 8 |
Example MCP configs:
|
|
@@ -43,7 +43,10 @@ import json
|
|
| 43 |
import re
|
| 44 |
import subprocess
|
| 45 |
import tempfile
|
| 46 |
-
from typing import Optional, Tuple
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
import gradio as gr
|
| 49 |
import markdown_pdf
|
|
@@ -59,28 +62,14 @@ from rag_scraper.utils import URLUtils
|
|
| 59 |
# -----------------------------
|
| 60 |
|
| 61 |
def is_github_repo(url_or_id: str) -> bool:
|
| 62 |
-
"""
|
| 63 |
-
Determine whether the string looks like a GitHub repository reference.
|
| 64 |
-
|
| 65 |
-
:param url_or_id: Full GitHub URL containing ``github.com`` or an
|
| 66 |
-
``owner/repo`` identifier.
|
| 67 |
-
:type url_or_id: str
|
| 68 |
-
:return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
|
| 69 |
-
otherwise ``False``.
|
| 70 |
-
:rtype: bool
|
| 71 |
-
"""
|
| 72 |
if "github.com" in url_or_id:
|
| 73 |
return True
|
| 74 |
return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
|
| 75 |
|
| 76 |
|
| 77 |
def check_repomix_installed() -> bool:
|
| 78 |
-
"""
|
| 79 |
-
Check if the ``repomix`` CLI is available on PATH.
|
| 80 |
-
|
| 81 |
-
:return: ``True`` if ``repomix --version`` succeeds, else ``False``.
|
| 82 |
-
:rtype: bool
|
| 83 |
-
"""
|
| 84 |
try:
|
| 85 |
result = subprocess.run(
|
| 86 |
["repomix", "--version"],
|
|
@@ -97,19 +86,7 @@ def run_repomix(
|
|
| 97 |
repo_url_or_id: str,
|
| 98 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 99 |
) -> Tuple[str, Optional[str]]:
|
| 100 |
-
"""
|
| 101 |
-
Run Repomix on a GitHub repository and return combined Markdown.
|
| 102 |
-
|
| 103 |
-
:param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
|
| 104 |
-
or in the form ``owner/repo``.
|
| 105 |
-
:type repo_url_or_id: str
|
| 106 |
-
:param progress: Gradio progress tracker (UI only).
|
| 107 |
-
:type progress: gr.Progress
|
| 108 |
-
:return: A tuple ``(content, output_path)`` where ``content`` is the
|
| 109 |
-
combined Markdown or an error string starting with ``"Error"``, and
|
| 110 |
-
``output_path`` is the temp file path (or ``None``).
|
| 111 |
-
:rtype: Tuple[str, Optional[str]]
|
| 112 |
-
"""
|
| 113 |
progress(0, desc="Starting Repomix processing...")
|
| 114 |
try:
|
| 115 |
with tempfile.TemporaryDirectory() as temp_dir:
|
|
@@ -169,18 +146,7 @@ def scrape_and_convert_website(
|
|
| 169 |
depth: int,
|
| 170 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 171 |
) -> Tuple[str, str]:
|
| 172 |
-
"""
|
| 173 |
-
Recursively scrape a website and convert pages to Markdown.
|
| 174 |
-
|
| 175 |
-
:param url: Starting URL to scrape.
|
| 176 |
-
:type url: str
|
| 177 |
-
:param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
|
| 178 |
-
:type depth: int
|
| 179 |
-
:param progress: Gradio progress tracker (UI only).
|
| 180 |
-
:type progress: gr.Progress
|
| 181 |
-
:return: A tuple ``(combined_markdown, tmp_md_path)``.
|
| 182 |
-
:rtype: Tuple[str, str]
|
| 183 |
-
"""
|
| 184 |
progress(0, desc=f"Starting web scrape for {url}...")
|
| 185 |
visited_urls = set()
|
| 186 |
|
|
@@ -244,31 +210,13 @@ def scrape_and_convert_website(
|
|
| 244 |
|
| 245 |
|
| 246 |
def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
|
| 247 |
-
"""
|
| 248 |
-
Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
|
| 249 |
-
|
| 250 |
-
:param markdown_content: The Markdown body to embed.
|
| 251 |
-
:type markdown_content: str
|
| 252 |
-
:param source_url_or_id: Original input string identifying the source.
|
| 253 |
-
:type source_url_or_id: str
|
| 254 |
-
:return: Pretty-printed JSON string.
|
| 255 |
-
:rtype: str
|
| 256 |
-
"""
|
| 257 |
data = {"source": source_url_or_id, "content": markdown_content}
|
| 258 |
return json.dumps(data, indent=2)
|
| 259 |
|
| 260 |
|
| 261 |
def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
|
| 262 |
-
"""
|
| 263 |
-
Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
|
| 264 |
-
|
| 265 |
-
:param markdown_content: The Markdown body to store.
|
| 266 |
-
:type markdown_content: str
|
| 267 |
-
:param source_url_or_id: Original input string identifying the source.
|
| 268 |
-
:type source_url_or_id: str
|
| 269 |
-
:return: Path to the created CSV file.
|
| 270 |
-
:rtype: str
|
| 271 |
-
"""
|
| 272 |
output = tempfile.NamedTemporaryFile(
|
| 273 |
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
|
| 274 |
)
|
|
@@ -284,22 +232,7 @@ def save_output_to_file(
|
|
| 284 |
output_format: str,
|
| 285 |
source_url_or_id: str,
|
| 286 |
) -> str:
|
| 287 |
-
"""
|
| 288 |
-
Save processed content in the selected format and return a file path.
|
| 289 |
-
|
| 290 |
-
:param content: The raw Markdown to save or convert.
|
| 291 |
-
:type content: str
|
| 292 |
-
:param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
|
| 293 |
-
:type output_format: str
|
| 294 |
-
:param source_url_or_id: Original input string identifying the source.
|
| 295 |
-
:type source_url_or_id: str
|
| 296 |
-
:return: Path to a temporary file holding the artifact.
|
| 297 |
-
:rtype: str
|
| 298 |
-
|
| 299 |
-
.. note::
|
| 300 |
-
PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
|
| 301 |
-
CSV uses a 2-column schema: ``['source','content']``.
|
| 302 |
-
"""
|
| 303 |
processed_content = content # default for Markdown/Text
|
| 304 |
|
| 305 |
if output_format == "JSON":
|
|
@@ -335,44 +268,29 @@ def save_output_to_file(
|
|
| 335 |
# ----------------------------------------------------------
|
| 336 |
|
| 337 |
def process_input_updated(
|
| 338 |
-
url_or_id:
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 343 |
) -> Tuple[str, str, Optional[str]]:
|
| 344 |
"""
|
| 345 |
-
Scrape or
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
:param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
|
| 351 |
-
For GitHub, either ``owner/repo`` or a full GitHub URL
|
| 352 |
-
(e.g., ``https://github.com/owner/repo``).
|
| 353 |
-
:type url_or_id: str
|
| 354 |
-
:param source_type: Select the content source. One of
|
| 355 |
-
{``"Webpage"``, ``"GitHub Repository"``}.
|
| 356 |
-
:type source_type: str
|
| 357 |
-
:param depth: Crawl depth for webpages. Integer in the range 0–3 where
|
| 358 |
-
0 = only the main page. **Ignored** when ``source_type`` is
|
| 359 |
-
``"GitHub Repository"``.
|
| 360 |
-
:type depth: int
|
| 361 |
-
:param output_format_selection: Desired output format. One of
|
| 362 |
-
{``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
|
| 363 |
-
:type output_format_selection: str
|
| 364 |
-
:param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
|
| 365 |
-
:type progress: gr.Progress
|
| 366 |
-
|
| 367 |
-
:returns: A 3-tuple:
|
| 368 |
-
- **status** (*str*): Human-readable status line.
|
| 369 |
-
- **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
|
| 370 |
-
- **file_path** (*Optional[str]*): Path to the generated artifact for download,
|
| 371 |
-
or ``None`` on error.
|
| 372 |
-
:rtype: Tuple[str, str, Optional[str]]
|
| 373 |
-
|
| 374 |
-
:raises Exception: (caught internally) Unexpected processing errors are surfaced
|
| 375 |
-
as a user-facing status with details in the preview.
|
| 376 |
"""
|
| 377 |
progress(0, desc="Initializing...")
|
| 378 |
raw_content = ""
|
|
|
|
| 2 |
"""
|
| 3 |
RAG-Ready Content Scraper — Gradio + MCP (SSE)
|
| 4 |
|
| 5 |
+
Exposes an MCP SSE endpoint on Hugging Face Spaces at:
|
| 6 |
/gradio_api/mcp/sse
|
| 7 |
|
| 8 |
Example MCP configs:
|
|
|
|
| 43 |
import re
|
| 44 |
import subprocess
|
| 45 |
import tempfile
|
| 46 |
+
from typing import Optional, Tuple, Literal
|
| 47 |
+
|
| 48 |
+
# NEW: use Annotated+Doc so MCP can show per-parameter descriptions
|
| 49 |
+
from typing_extensions import Annotated, Doc
|
| 50 |
|
| 51 |
import gradio as gr
|
| 52 |
import markdown_pdf
|
|
|
|
| 62 |
# -----------------------------
|
| 63 |
|
| 64 |
def is_github_repo(url_or_id: str) -> bool:
|
| 65 |
+
"""Return True if the string looks like a GitHub repository reference."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
if "github.com" in url_or_id:
|
| 67 |
return True
|
| 68 |
return bool(re.match(r"^[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+$", url_or_id))
|
| 69 |
|
| 70 |
|
| 71 |
def check_repomix_installed() -> bool:
|
| 72 |
+
"""Check if the `repomix` CLI is available on PATH."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
try:
|
| 74 |
result = subprocess.run(
|
| 75 |
["repomix", "--version"],
|
|
|
|
| 86 |
repo_url_or_id: str,
|
| 87 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 88 |
) -> Tuple[str, Optional[str]]:
|
| 89 |
+
"""Run Repomix on a GitHub repository and return combined Markdown."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
progress(0, desc="Starting Repomix processing...")
|
| 91 |
try:
|
| 92 |
with tempfile.TemporaryDirectory() as temp_dir:
|
|
|
|
| 146 |
depth: int,
|
| 147 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 148 |
) -> Tuple[str, str]:
|
| 149 |
+
"""Recursively scrape a website and convert pages to Markdown."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
progress(0, desc=f"Starting web scrape for {url}...")
|
| 151 |
visited_urls = set()
|
| 152 |
|
|
|
|
| 210 |
|
| 211 |
|
| 212 |
def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
|
| 213 |
+
"""Wrap Markdown text in a JSON object with `source` and `content` keys."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
data = {"source": source_url_or_id, "content": markdown_content}
|
| 215 |
return json.dumps(data, indent=2)
|
| 216 |
|
| 217 |
|
| 218 |
def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
|
| 219 |
+
"""Persist Markdown as a simple CSV with two columns: `source`, `content`."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
output = tempfile.NamedTemporaryFile(
|
| 221 |
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
|
| 222 |
)
|
|
|
|
| 232 |
output_format: str,
|
| 233 |
source_url_or_id: str,
|
| 234 |
) -> str:
|
| 235 |
+
"""Save processed content in the selected format and return a file path."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
processed_content = content # default for Markdown/Text
|
| 237 |
|
| 238 |
if output_format == "JSON":
|
|
|
|
| 268 |
# ----------------------------------------------------------
|
| 269 |
|
| 270 |
def process_input_updated(
|
| 271 |
+
url_or_id: Annotated[
|
| 272 |
+
str,
|
| 273 |
+
Doc("For webpages, a full URL (e.g., https://example.com). For GitHub, either owner/repo or a full GitHub URL (https://github.com/owner/repo)."),
|
| 274 |
+
],
|
| 275 |
+
source_type: Annotated[
|
| 276 |
+
Literal["Webpage", "GitHub Repository"],
|
| 277 |
+
Doc('Select the content source: "Webpage" to crawl HTML, or "GitHub Repository" to run Repomix.'),
|
| 278 |
+
],
|
| 279 |
+
depth: Annotated[
|
| 280 |
+
int,
|
| 281 |
+
Doc("Crawl depth for webpages (0–3). 0 = only the main page. Ignored for GitHub repositories."),
|
| 282 |
+
],
|
| 283 |
+
output_format_selection: Annotated[
|
| 284 |
+
Literal["Markdown", "JSON", "CSV", "Text", "PDF"],
|
| 285 |
+
Doc("Desired output format for the processed content."),
|
| 286 |
+
],
|
| 287 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 288 |
) -> Tuple[str, str, Optional[str]]:
|
| 289 |
"""
|
| 290 |
+
Scrape a webpage (with depth) or dump a GitHub repo (Repomix), then export as Markdown/JSON/CSV/Text/PDF.
|
| 291 |
+
|
| 292 |
+
Returns:
|
| 293 |
+
Tuple[str, str, Optional[str]]: (status, preview, file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
"""
|
| 295 |
progress(0, desc="Initializing...")
|
| 296 |
raw_content = ""
|