Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,10 +19,8 @@ import os
|
|
| 19 |
def scrape_website_content(url: str) -> Tuple[str, str]:
|
| 20 |
"""
|
| 21 |
Scrape a website and return its main content formatted as markdown and a downloadable file path.
|
| 22 |
-
|
| 23 |
Args:
|
| 24 |
url (str): The URL to scrape (can include or omit http/https protocol)
|
| 25 |
-
|
| 26 |
Returns:
|
| 27 |
Tuple[str, str]: The scraped content formatted as markdown, and a file path for download
|
| 28 |
"""
|
|
@@ -444,6 +442,7 @@ def fibwatch_latest_to_originals(url: str) -> Tuple[str, str]:
|
|
| 444 |
except Exception as e:
|
| 445 |
return str(e), None
|
| 446 |
# Create Gradio interfaces for each function
|
|
|
|
| 447 |
def create_mcp_interface():
|
| 448 |
"""Create Gradio interface that exposes web scraping tools as MCP functions."""
|
| 449 |
# Create individual interfaces for each tool
|
|
@@ -465,7 +464,8 @@ def create_mcp_interface():
|
|
| 465 |
],
|
| 466 |
title="Website Content Scraper",
|
| 467 |
description="Extract and format website content as markdown",
|
| 468 |
-
api_name="scrape_content"
|
|
|
|
| 469 |
|
| 470 |
sitemap_interface = gr.Interface(
|
| 471 |
fn=generate_sitemap_for_ui,
|
|
@@ -487,6 +487,7 @@ def create_mcp_interface():
|
|
| 487 |
description="Generate a sitemap of all links found on a webpage",
|
| 488 |
api_name="generate_sitemap"
|
| 489 |
)
|
|
|
|
| 490 |
bulk_extract_interface = gr.Interface(
|
| 491 |
fn=extract_all_content_for_ui,
|
| 492 |
inputs=gr.Textbox(
|
|
@@ -505,9 +506,9 @@ def create_mcp_interface():
|
|
| 505 |
],
|
| 506 |
title="Bulk Content Extractor",
|
| 507 |
description="Extract text content from all internal links and download as ZIP",
|
| 508 |
-
api_name="extract_all_content"
|
|
|
|
| 509 |
|
| 510 |
-
# Enhanced sitemap interface with configurable limits
|
| 511 |
sitemap_limited_interface = gr.Interface(
|
| 512 |
fn=generate_sitemap_with_limit,
|
| 513 |
inputs=[
|
|
@@ -538,20 +539,7 @@ def create_mcp_interface():
|
|
| 538 |
api_name="generate_sitemap_limited"
|
| 539 |
)
|
| 540 |
|
| 541 |
-
# Enhanced bulk extract interface with configurable limits
|
| 542 |
bulk_limited_interface = gr.Interface(
|
| 543 |
-
fibwatch_interface = gr.Interface(
|
| 544 |
-
fn=fibwatch_latest_to_originals,
|
| 545 |
-
inputs=gr.Textbox(
|
| 546 |
-
label="Fibwatch Page",
|
| 547 |
-
placeholder="https://fibwatch.art/videos/latest?page_id=1"
|
| 548 |
-
),
|
| 549 |
-
outputs=[
|
| 550 |
-
gr.Textbox(label="Original Links", lines=10),
|
| 551 |
-
gr.File(label="Download")
|
| 552 |
-
],
|
| 553 |
-
title="Fibwatch Scraper"
|
| 554 |
-
)
|
| 555 |
fn=extract_limited_content_as_zip,
|
| 556 |
inputs=[
|
| 557 |
gr.Textbox(
|
|
@@ -581,29 +569,41 @@ def create_mcp_interface():
|
|
| 581 |
api_name="extract_limited_content"
|
| 582 |
)
|
| 583 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
# Combine into tabbed interface
|
| 585 |
-
demo = gr.TabbedInterface(
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
"
|
| 601 |
-
|
| 602 |
-
|
| 603 |
-
)
|
| 604 |
return demo
|
| 605 |
|
| 606 |
-
|
| 607 |
if __name__ == "__main__":
|
| 608 |
# Create and launch the MCP server
|
| 609 |
app = create_mcp_interface()
|
|
|
|
| 19 |
def scrape_website_content(url: str) -> Tuple[str, str]:
|
| 20 |
"""
|
| 21 |
Scrape a website and return its main content formatted as markdown and a downloadable file path.
|
|
|
|
| 22 |
Args:
|
| 23 |
url (str): The URL to scrape (can include or omit http/https protocol)
|
|
|
|
| 24 |
Returns:
|
| 25 |
Tuple[str, str]: The scraped content formatted as markdown, and a file path for download
|
| 26 |
"""
|
|
|
|
| 442 |
except Exception as e:
|
| 443 |
return str(e), None
|
| 444 |
# Create Gradio interfaces for each function
|
| 445 |
+
# Create Gradio interfaces for each function
|
| 446 |
def create_mcp_interface():
|
| 447 |
"""Create Gradio interface that exposes web scraping tools as MCP functions."""
|
| 448 |
# Create individual interfaces for each tool
|
|
|
|
| 464 |
],
|
| 465 |
title="Website Content Scraper",
|
| 466 |
description="Extract and format website content as markdown",
|
| 467 |
+
api_name="scrape_content"
|
| 468 |
+
)
|
| 469 |
|
| 470 |
sitemap_interface = gr.Interface(
|
| 471 |
fn=generate_sitemap_for_ui,
|
|
|
|
| 487 |
description="Generate a sitemap of all links found on a webpage",
|
| 488 |
api_name="generate_sitemap"
|
| 489 |
)
|
| 490 |
+
|
| 491 |
bulk_extract_interface = gr.Interface(
|
| 492 |
fn=extract_all_content_for_ui,
|
| 493 |
inputs=gr.Textbox(
|
|
|
|
| 506 |
],
|
| 507 |
title="Bulk Content Extractor",
|
| 508 |
description="Extract text content from all internal links and download as ZIP",
|
| 509 |
+
api_name="extract_all_content"
|
| 510 |
+
)
|
| 511 |
|
|
|
|
| 512 |
sitemap_limited_interface = gr.Interface(
|
| 513 |
fn=generate_sitemap_with_limit,
|
| 514 |
inputs=[
|
|
|
|
| 539 |
api_name="generate_sitemap_limited"
|
| 540 |
)
|
| 541 |
|
|
|
|
| 542 |
bulk_limited_interface = gr.Interface(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
fn=extract_limited_content_as_zip,
|
| 544 |
inputs=[
|
| 545 |
gr.Textbox(
|
|
|
|
| 569 |
api_name="extract_limited_content"
|
| 570 |
)
|
| 571 |
|
| 572 |
+
fibwatch_interface = gr.Interface(
|
| 573 |
+
fn=fibwatch_latest_to_originals,
|
| 574 |
+
inputs=gr.Textbox(
|
| 575 |
+
label="Fibwatch Page",
|
| 576 |
+
placeholder="https://fibwatch.art/videos/latest?page_id=1"
|
| 577 |
+
),
|
| 578 |
+
outputs=[
|
| 579 |
+
gr.Textbox(label="Original Links", lines=10),
|
| 580 |
+
gr.File(label="Download")
|
| 581 |
+
],
|
| 582 |
+
title="Fibwatch Scraper",
|
| 583 |
+
description="Extract original b-cdn.net links from Fibwatch pages"
|
| 584 |
+
)
|
| 585 |
+
|
| 586 |
# Combine into tabbed interface
|
| 587 |
+
demo = gr.TabbedInterface(
|
| 588 |
+
[
|
| 589 |
+
scrape_interface,
|
| 590 |
+
sitemap_interface,
|
| 591 |
+
bulk_extract_interface,
|
| 592 |
+
bulk_limited_interface,
|
| 593 |
+
fibwatch_interface,
|
| 594 |
+
],
|
| 595 |
+
[
|
| 596 |
+
"Content Scraper",
|
| 597 |
+
"All Links Sitemap",
|
| 598 |
+
"Bulk Extractor",
|
| 599 |
+
"Limited Bulk Extractor",
|
| 600 |
+
"Fibwatch Scraper",
|
| 601 |
+
],
|
| 602 |
+
title="🕷️ Web Scraper MCP Server"
|
| 603 |
+
)
|
| 604 |
+
|
|
|
|
| 605 |
return demo
|
| 606 |
|
|
|
|
| 607 |
if __name__ == "__main__":
|
| 608 |
# Create and launch the MCP server
|
| 609 |
app = create_mcp_interface()
|