Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,13 +2,12 @@
|
|
| 2 |
"""
|
| 3 |
RAG-Ready Content Scraper — Gradio + MCP (SSE)
|
| 4 |
|
| 5 |
-
This
|
| 6 |
-
|
| 7 |
|
| 8 |
-
MCP
|
| 9 |
|
| 10 |
1) Direct SSE (Cursor, Windsurf, Cline, etc.)
|
| 11 |
-
|
| 12 |
{
|
| 13 |
"mcpServers": {
|
| 14 |
"gradio": {
|
|
@@ -17,8 +16,7 @@ MCP SSE client config examples:
|
|
| 17 |
}
|
| 18 |
}
|
| 19 |
|
| 20 |
-
2) Experimental stdio via Node
|
| 21 |
-
|
| 22 |
{
|
| 23 |
"mcpServers": {
|
| 24 |
"gradio": {
|
|
@@ -57,18 +55,19 @@ from rag_scraper.utils import URLUtils
|
|
| 57 |
|
| 58 |
|
| 59 |
# -----------------------------
|
| 60 |
-
#
|
| 61 |
# -----------------------------
|
| 62 |
|
| 63 |
def is_github_repo(url_or_id: str) -> bool:
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
| 72 |
"""
|
| 73 |
if "github.com" in url_or_id:
|
| 74 |
return True
|
|
@@ -76,10 +75,11 @@ def is_github_repo(url_or_id: str) -> bool:
|
|
| 76 |
|
| 77 |
|
| 78 |
def check_repomix_installed() -> bool:
|
| 79 |
-
"""
|
|
|
|
| 80 |
|
| 81 |
-
|
| 82 |
-
|
| 83 |
"""
|
| 84 |
try:
|
| 85 |
result = subprocess.run(
|
|
@@ -97,22 +97,18 @@ def run_repomix(
|
|
| 97 |
repo_url_or_id: str,
|
| 98 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 99 |
) -> Tuple[str, Optional[str]]:
|
| 100 |
-
"""
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
Notes:
|
| 114 |
-
- Requires `repomix` installed in the environment.
|
| 115 |
-
- If `repo_url_or_id` is "owner/repo" it is expanded to a full GitHub URL.
|
| 116 |
"""
|
| 117 |
progress(0, desc="Starting Repomix processing...")
|
| 118 |
try:
|
|
@@ -128,15 +124,11 @@ def run_repomix(
|
|
| 128 |
progress(0.2, desc=f"Running Repomix on {repo_url}...")
|
| 129 |
cmd = [
|
| 130 |
"repomix",
|
| 131 |
-
"--remote",
|
| 132 |
-
|
| 133 |
-
"--
|
| 134 |
-
output_file_path,
|
| 135 |
-
"--style",
|
| 136 |
-
"markdown",
|
| 137 |
"--compress",
|
| 138 |
]
|
| 139 |
-
|
| 140 |
process = subprocess.run(
|
| 141 |
cmd, capture_output=True, text=True, check=False, encoding="utf-8"
|
| 142 |
)
|
|
@@ -177,22 +169,17 @@ def scrape_and_convert_website(
|
|
| 177 |
depth: int,
|
| 178 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 179 |
) -> Tuple[str, str]:
|
| 180 |
-
"""
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
combined Markdown content.
|
| 192 |
-
|
| 193 |
-
Notes:
|
| 194 |
-
- Only internal links are visited.
|
| 195 |
-
- Link extraction uses `LinkExtractor` with `LinkType.INTERNAL`.
|
| 196 |
"""
|
| 197 |
progress(0, desc=f"Starting web scrape for {url}...")
|
| 198 |
visited_urls = set()
|
|
@@ -212,7 +199,10 @@ def scrape_and_convert_website(
|
|
| 212 |
progress_val = (
|
| 213 |
link_index / total_links_estimate if total_links_estimate > 0 else 0
|
| 214 |
)
|
| 215 |
-
progress(
|
|
|
|
|
|
|
|
|
|
| 216 |
html_content = Scraper.fetch_html(current_url)
|
| 217 |
except Exception as e:
|
| 218 |
return f"Error fetching {current_url}: {str(e)}\n"
|
|
@@ -254,28 +244,30 @@ def scrape_and_convert_website(
|
|
| 254 |
|
| 255 |
|
| 256 |
def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
|
| 257 |
-
"""
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
| 265 |
"""
|
| 266 |
data = {"source": source_url_or_id, "content": markdown_content}
|
| 267 |
return json.dumps(data, indent=2)
|
| 268 |
|
| 269 |
|
| 270 |
def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
|
| 271 |
-
"""
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
|
|
|
| 279 |
"""
|
| 280 |
output = tempfile.NamedTemporaryFile(
|
| 281 |
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
|
|
@@ -292,19 +284,21 @@ def save_output_to_file(
|
|
| 292 |
output_format: str,
|
| 293 |
source_url_or_id: str,
|
| 294 |
) -> str:
|
| 295 |
-
"""
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
|
|
|
|
|
|
| 308 |
"""
|
| 309 |
processed_content = content # default for Markdown/Text
|
| 310 |
|
|
@@ -316,7 +310,6 @@ def save_output_to_file(
|
|
| 316 |
elif output_format == "Text":
|
| 317 |
suffix = ".txt"
|
| 318 |
elif output_format == "PDF":
|
| 319 |
-
# Write PDF directly and return the path.
|
| 320 |
try:
|
| 321 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
|
| 322 |
pdf_output_path = tmp_pdf.name
|
|
@@ -327,7 +320,6 @@ def save_output_to_file(
|
|
| 327 |
# Fallback: persist as Markdown with .pdf.md suffix.
|
| 328 |
print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
|
| 329 |
suffix = ".pdf.md"
|
| 330 |
-
# For the fallback, continue below and write Markdown.
|
| 331 |
else:
|
| 332 |
suffix = ".md"
|
| 333 |
|
|
@@ -339,7 +331,7 @@ def save_output_to_file(
|
|
| 339 |
|
| 340 |
|
| 341 |
# ----------------------------------------------------------
|
| 342 |
-
# Main tool function (
|
| 343 |
# ----------------------------------------------------------
|
| 344 |
|
| 345 |
def process_input_updated(
|
|
@@ -349,39 +341,38 @@ def process_input_updated(
|
|
| 349 |
output_format_selection: str,
|
| 350 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 351 |
) -> Tuple[str, str, Optional[str]]:
|
| 352 |
-
"""
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
- Any internal exception is caught and returned as a user-facing status + preview.
|
| 385 |
"""
|
| 386 |
progress(0, desc="Initializing...")
|
| 387 |
raw_content = ""
|
|
@@ -422,7 +413,7 @@ def process_input_updated(
|
|
| 422 |
if output_format_selection == "JSON":
|
| 423 |
preview_content = convert_to_json(raw_content, url_or_id)
|
| 424 |
elif output_format_selection == "CSV" and output_file_path:
|
| 425 |
-
#
|
| 426 |
try:
|
| 427 |
with open(output_file_path, "r", encoding="utf-8") as f_csv:
|
| 428 |
csv_preview_lines = [next(f_csv) for _ in range(5)]
|
|
@@ -435,7 +426,7 @@ def process_input_updated(
|
|
| 435 |
elif output_format_selection == "CSV" and not output_file_path:
|
| 436 |
preview_content = "[CSV file path not available for preview]"
|
| 437 |
elif output_format_selection == "PDF":
|
| 438 |
-
#
|
| 439 |
preview_content = (
|
| 440 |
f"[PDF generated. Download to view: "
|
| 441 |
f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
|
|
@@ -529,7 +520,7 @@ Markdown, JSON, CSV, Text, PDF.
|
|
| 529 |
|
| 530 |
**Notes**
|
| 531 |
- PDF generation requires the `markdown-pdf` library.
|
| 532 |
-
-
|
| 533 |
- MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
|
| 534 |
"""
|
| 535 |
)
|
|
@@ -541,5 +532,5 @@ Markdown, JSON, CSV, Text, PDF.
|
|
| 541 |
)
|
| 542 |
|
| 543 |
if __name__ == "__main__":
|
| 544 |
-
#
|
| 545 |
iface.queue().launch(share=True)
|
|
|
|
| 2 |
"""
|
| 3 |
RAG-Ready Content Scraper — Gradio + MCP (SSE)
|
| 4 |
|
| 5 |
+
This app runs on Hugging Face Spaces and exposes an MCP SSE endpoint at:
|
| 6 |
+
/gradio_api/mcp/sse
|
| 7 |
|
| 8 |
+
Example MCP configs:
|
| 9 |
|
| 10 |
1) Direct SSE (Cursor, Windsurf, Cline, etc.)
|
|
|
|
| 11 |
{
|
| 12 |
"mcpServers": {
|
| 13 |
"gradio": {
|
|
|
|
| 16 |
}
|
| 17 |
}
|
| 18 |
|
| 19 |
+
2) Experimental stdio via Node:
|
|
|
|
| 20 |
{
|
| 21 |
"mcpServers": {
|
| 22 |
"gradio": {
|
|
|
|
| 55 |
|
| 56 |
|
| 57 |
# -----------------------------
|
| 58 |
+
# Helper utilities
|
| 59 |
# -----------------------------
|
| 60 |
|
| 61 |
def is_github_repo(url_or_id: str) -> bool:
|
| 62 |
+
"""
|
| 63 |
+
Determine whether the string looks like a GitHub repository reference.
|
| 64 |
+
|
| 65 |
+
:param url_or_id: Full GitHub URL containing ``github.com`` or an
|
| 66 |
+
``owner/repo`` identifier.
|
| 67 |
+
:type url_or_id: str
|
| 68 |
+
:return: ``True`` if it matches a GitHub URL or ``owner/repo`` pattern,
|
| 69 |
+
otherwise ``False``.
|
| 70 |
+
:rtype: bool
|
| 71 |
"""
|
| 72 |
if "github.com" in url_or_id:
|
| 73 |
return True
|
|
|
|
| 75 |
|
| 76 |
|
| 77 |
def check_repomix_installed() -> bool:
|
| 78 |
+
"""
|
| 79 |
+
Check if the ``repomix`` CLI is available on PATH.
|
| 80 |
|
| 81 |
+
:return: ``True`` if ``repomix --version`` succeeds, else ``False``.
|
| 82 |
+
:rtype: bool
|
| 83 |
"""
|
| 84 |
try:
|
| 85 |
result = subprocess.run(
|
|
|
|
| 97 |
repo_url_or_id: str,
|
| 98 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 99 |
) -> Tuple[str, Optional[str]]:
|
| 100 |
+
"""
|
| 101 |
+
Run Repomix on a GitHub repository and return combined Markdown.
|
| 102 |
+
|
| 103 |
+
:param repo_url_or_id: GitHub repo as full URL (``https://github.com/...``)
|
| 104 |
+
or in the form ``owner/repo``.
|
| 105 |
+
:type repo_url_or_id: str
|
| 106 |
+
:param progress: Gradio progress tracker (UI only).
|
| 107 |
+
:type progress: gr.Progress
|
| 108 |
+
:return: A tuple ``(content, output_path)`` where ``content`` is the
|
| 109 |
+
combined Markdown or an error string starting with ``"Error"``, and
|
| 110 |
+
``output_path`` is the temp file path (or ``None``).
|
| 111 |
+
:rtype: Tuple[str, Optional[str]]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""
|
| 113 |
progress(0, desc="Starting Repomix processing...")
|
| 114 |
try:
|
|
|
|
| 124 |
progress(0.2, desc=f"Running Repomix on {repo_url}...")
|
| 125 |
cmd = [
|
| 126 |
"repomix",
|
| 127 |
+
"--remote", repo_url,
|
| 128 |
+
"--output", output_file_path,
|
| 129 |
+
"--style", "markdown",
|
|
|
|
|
|
|
|
|
|
| 130 |
"--compress",
|
| 131 |
]
|
|
|
|
| 132 |
process = subprocess.run(
|
| 133 |
cmd, capture_output=True, text=True, check=False, encoding="utf-8"
|
| 134 |
)
|
|
|
|
| 169 |
depth: int,
|
| 170 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 171 |
) -> Tuple[str, str]:
|
| 172 |
+
"""
|
| 173 |
+
Recursively scrape a website and convert pages to Markdown.
|
| 174 |
+
|
| 175 |
+
:param url: Starting URL to scrape.
|
| 176 |
+
:type url: str
|
| 177 |
+
:param depth: Crawl depth where 0 = only the main page (1..3 follow internal links).
|
| 178 |
+
:type depth: int
|
| 179 |
+
:param progress: Gradio progress tracker (UI only).
|
| 180 |
+
:type progress: gr.Progress
|
| 181 |
+
:return: A tuple ``(combined_markdown, tmp_md_path)``.
|
| 182 |
+
:rtype: Tuple[str, str]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
"""
|
| 184 |
progress(0, desc=f"Starting web scrape for {url}...")
|
| 185 |
visited_urls = set()
|
|
|
|
| 199 |
progress_val = (
|
| 200 |
link_index / total_links_estimate if total_links_estimate > 0 else 0
|
| 201 |
)
|
| 202 |
+
progress(
|
| 203 |
+
progress_val,
|
| 204 |
+
desc=f"Scraping: {current_url} (Depth used: {depth - current_depth})",
|
| 205 |
+
)
|
| 206 |
html_content = Scraper.fetch_html(current_url)
|
| 207 |
except Exception as e:
|
| 208 |
return f"Error fetching {current_url}: {str(e)}\n"
|
|
|
|
| 244 |
|
| 245 |
|
| 246 |
def convert_to_json(markdown_content: str, source_url_or_id: str) -> str:
|
| 247 |
+
"""
|
| 248 |
+
Wrap Markdown text in a JSON object with ``source`` and ``content`` keys.
|
| 249 |
+
|
| 250 |
+
:param markdown_content: The Markdown body to embed.
|
| 251 |
+
:type markdown_content: str
|
| 252 |
+
:param source_url_or_id: Original input string identifying the source.
|
| 253 |
+
:type source_url_or_id: str
|
| 254 |
+
:return: Pretty-printed JSON string.
|
| 255 |
+
:rtype: str
|
| 256 |
"""
|
| 257 |
data = {"source": source_url_or_id, "content": markdown_content}
|
| 258 |
return json.dumps(data, indent=2)
|
| 259 |
|
| 260 |
|
| 261 |
def convert_to_csv(markdown_content: str, source_url_or_id: str) -> str:
|
| 262 |
+
"""
|
| 263 |
+
Persist Markdown as a simple CSV with two columns: ``source``, ``content``.
|
| 264 |
+
|
| 265 |
+
:param markdown_content: The Markdown body to store.
|
| 266 |
+
:type markdown_content: str
|
| 267 |
+
:param source_url_or_id: Original input string identifying the source.
|
| 268 |
+
:type source_url_or_id: str
|
| 269 |
+
:return: Path to the created CSV file.
|
| 270 |
+
:rtype: str
|
| 271 |
"""
|
| 272 |
output = tempfile.NamedTemporaryFile(
|
| 273 |
mode="w+", delete=False, newline="", suffix=".csv", encoding="utf-8"
|
|
|
|
| 284 |
output_format: str,
|
| 285 |
source_url_or_id: str,
|
| 286 |
) -> str:
|
| 287 |
+
"""
|
| 288 |
+
Save processed content in the selected format and return a file path.
|
| 289 |
+
|
| 290 |
+
:param content: The raw Markdown to save or convert.
|
| 291 |
+
:type content: str
|
| 292 |
+
:param output_format: One of {``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
|
| 293 |
+
:type output_format: str
|
| 294 |
+
:param source_url_or_id: Original input string identifying the source.
|
| 295 |
+
:type source_url_or_id: str
|
| 296 |
+
:return: Path to a temporary file holding the artifact.
|
| 297 |
+
:rtype: str
|
| 298 |
+
|
| 299 |
+
.. note::
|
| 300 |
+
PDF uses ``markdown_pdf`` and writes directly to a temporary ``.pdf`` file.
|
| 301 |
+
CSV uses a 2-column schema: ``['source','content']``.
|
| 302 |
"""
|
| 303 |
processed_content = content # default for Markdown/Text
|
| 304 |
|
|
|
|
| 310 |
elif output_format == "Text":
|
| 311 |
suffix = ".txt"
|
| 312 |
elif output_format == "PDF":
|
|
|
|
| 313 |
try:
|
| 314 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_pdf:
|
| 315 |
pdf_output_path = tmp_pdf.name
|
|
|
|
| 320 |
# Fallback: persist as Markdown with .pdf.md suffix.
|
| 321 |
print(f"PDF conversion failed: {e}. Saving as Markdown instead.")
|
| 322 |
suffix = ".pdf.md"
|
|
|
|
| 323 |
else:
|
| 324 |
suffix = ".md"
|
| 325 |
|
|
|
|
| 331 |
|
| 332 |
|
| 333 |
# ----------------------------------------------------------
|
| 334 |
+
# Main tool function (exposed to MCP via SSE)
|
| 335 |
# ----------------------------------------------------------
|
| 336 |
|
| 337 |
def process_input_updated(
|
|
|
|
| 341 |
output_format_selection: str,
|
| 342 |
progress: gr.Progress = gr.Progress(track_tqdm=True),
|
| 343 |
) -> Tuple[str, str, Optional[str]]:
|
| 344 |
+
"""
|
| 345 |
+
Scrape or repo-dump content and export it as Markdown/JSON/CSV/Text/PDF.
|
| 346 |
+
|
| 347 |
+
This function is exposed to MCP clients via the Spaces SSE endpoint
|
| 348 |
+
``/gradio_api/mcp/sse``.
|
| 349 |
+
|
| 350 |
+
:param url_or_id: For webpages, a full URL (e.g., ``https://example.com``).
|
| 351 |
+
For GitHub, either ``owner/repo`` or a full GitHub URL
|
| 352 |
+
(e.g., ``https://github.com/owner/repo``).
|
| 353 |
+
:type url_or_id: str
|
| 354 |
+
:param source_type: Select the content source. One of
|
| 355 |
+
{``"Webpage"``, ``"GitHub Repository"``}.
|
| 356 |
+
:type source_type: str
|
| 357 |
+
:param depth: Crawl depth for webpages. Integer in the range 0–3 where
|
| 358 |
+
0 = only the main page. **Ignored** when ``source_type`` is
|
| 359 |
+
``"GitHub Repository"``.
|
| 360 |
+
:type depth: int
|
| 361 |
+
:param output_format_selection: Desired output format. One of
|
| 362 |
+
{``"Markdown"``, ``"JSON"``, ``"CSV"``, ``"Text"``, ``"PDF"``}.
|
| 363 |
+
:type output_format_selection: str
|
| 364 |
+
:param progress: (UI only) Gradio progress tracker. MCP callers can omit this.
|
| 365 |
+
:type progress: gr.Progress
|
| 366 |
+
|
| 367 |
+
:returns: A 3-tuple:
|
| 368 |
+
- **status** (*str*): Human-readable status line.
|
| 369 |
+
- **preview** (*str*): Text preview (full Markdown/JSON/Text, or a note for CSV/PDF).
|
| 370 |
+
- **file_path** (*Optional[str]*): Path to the generated artifact for download,
|
| 371 |
+
or ``None`` on error.
|
| 372 |
+
:rtype: Tuple[str, str, Optional[str]]
|
| 373 |
+
|
| 374 |
+
:raises Exception: (caught internally) Unexpected processing errors are surfaced
|
| 375 |
+
as a user-facing status with details in the preview.
|
|
|
|
| 376 |
"""
|
| 377 |
progress(0, desc="Initializing...")
|
| 378 |
raw_content = ""
|
|
|
|
| 413 |
if output_format_selection == "JSON":
|
| 414 |
preview_content = convert_to_json(raw_content, url_or_id)
|
| 415 |
elif output_format_selection == "CSV" and output_file_path:
|
| 416 |
+
# Show a small preview of the CSV
|
| 417 |
try:
|
| 418 |
with open(output_file_path, "r", encoding="utf-8") as f_csv:
|
| 419 |
csv_preview_lines = [next(f_csv) for _ in range(5)]
|
|
|
|
| 426 |
elif output_format_selection == "CSV" and not output_file_path:
|
| 427 |
preview_content = "[CSV file path not available for preview]"
|
| 428 |
elif output_format_selection == "PDF":
|
| 429 |
+
# Can't render PDF in text preview
|
| 430 |
preview_content = (
|
| 431 |
f"[PDF generated. Download to view: "
|
| 432 |
f"{os.path.basename(output_file_path) if output_file_path else 'file.pdf'}]"
|
|
|
|
| 520 |
|
| 521 |
**Notes**
|
| 522 |
- PDF generation requires the `markdown-pdf` library.
|
| 523 |
+
- Designed for Docker/Hugging Face Spaces.
|
| 524 |
- MCP SSE endpoint is available at: `/gradio_api/mcp/sse`.
|
| 525 |
"""
|
| 526 |
)
|
|
|
|
| 532 |
)
|
| 533 |
|
| 534 |
if __name__ == "__main__":
|
| 535 |
+
# Enable queuing for concurrency; Spaces generally manage hosting.
|
| 536 |
iface.queue().launch(share=True)
|