Spaces:
Sleeping
Sleeping
added fibwatch
Browse files
app.py
CHANGED
|
@@ -388,6 +388,69 @@ def extract_all_content_for_ui(url: str) -> Tuple[str, str]:
|
|
| 388 |
|
| 389 |
|
| 390 |
def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 391 |
"""
|
| 392 |
Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
|
| 393 |
|
|
@@ -499,6 +562,26 @@ def create_mcp_interface():
|
|
| 499 |
|
| 500 |
# Enhanced bulk extract interface with configurable limits
|
| 501 |
bulk_limited_interface = gr.Interface(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 502 |
fn=extract_limited_content_as_zip,
|
| 503 |
inputs=[
|
| 504 |
gr.Textbox(
|
|
@@ -529,11 +612,25 @@ def create_mcp_interface():
|
|
| 529 |
)
|
| 530 |
|
| 531 |
# Combine into tabbed interface
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
|
| 538 |
return demo
|
| 539 |
|
|
@@ -543,4 +640,4 @@ if __name__ == "__main__":
|
|
| 543 |
app = create_mcp_interface()
|
| 544 |
app.launch(
|
| 545 |
mcp_server=True
|
| 546 |
-
)
|
|
|
|
| 388 |
|
| 389 |
|
| 390 |
def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
|
| 391 |
+
|
| 392 |
+
def fibwatch_latest_to_originals(url: str) -> Tuple[str, str]:
    """Collect "Original" CDN links from a Fibwatch listing page.

    The listing page at ``url`` is fetched and scanned for relative
    ``/watch/*.html`` links. Each watch page is then fetched and scanned for
    anchors that either point at a ``b-cdn.net`` host or are labelled
    "original" with an absolute ``http`` URL.

    Args:
        url: Listing page URL. ``https://`` is prepended when no scheme is
            given.

    Returns:
        A ``(markdown, file_path)`` pair. ``markdown`` is a Markdown summary
        listing the links found; ``file_path`` is the path of a temporary
        ``.txt`` file holding one link per line. On any failure (or when
        nothing is found) the first element is a human-readable message and
        the second element is ``None``.
    """
    # Watch links on the listing page are site-relative; they are resolved
    # against this fixed host.
    base_url = "https://fibwatch.art"

    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # The context manager releases the session's pooled connections even
        # when a request raises.
        with requests.Session() as session:
            session.headers.update({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
            })

            # Fetch the listing page.
            res = session.get(url, timeout=10)
            res.raise_for_status()
            soup = BeautifulSoup(res.text, "html.parser")

            # Collect watch links; sorted so the crawl order is deterministic.
            watch_links = sorted({
                base_url + a["href"]
                for a in soup.find_all("a", href=True)
                if a["href"].startswith("/watch/")
                and a["href"].endswith(".html")
            })

            if not watch_links:
                return "❌ No watch links found.", None

            original_links = set()

            # Visit each watch page; a page that fails is skipped so one bad
            # page does not abort the whole run.
            for watch_url in watch_links:
                try:
                    r = session.get(watch_url, timeout=10)
                    r.raise_for_status()
                    s = BeautifulSoup(r.text, "html.parser")

                    for a in s.find_all("a", href=True):
                        h = a["href"]
                        txt = a.get_text(strip=True).lower()

                        if "b-cdn.net" in h:
                            original_links.add(h)
                        elif txt == "original" and h.startswith("http"):
                            original_links.add(h)
                except Exception:
                    # Best-effort: unlike a bare ``except:``, this still lets
                    # KeyboardInterrupt / SystemExit propagate.
                    continue

        if not original_links:
            return "❌ No Original CDN links found.", None

        # Sort once so the Markdown and the TXT file list links in the same,
        # reproducible order.
        ordered_links = sorted(original_links)

        md_out = (
            "# Fibwatch Original Links\n\n"
            f"Found {len(ordered_links)} links:\n\n"
            + "".join(f"- {link}\n" for link in ordered_links)
        )

        # delete=False: the file must outlive this function so the caller can
        # serve it for download.
        with tempfile.NamedTemporaryFile(
            delete=False, suffix=".txt", mode="w", encoding="utf-8"
        ) as f:
            f.write("\n".join(ordered_links))
            file_path = f.name

        return md_out, file_path

    except Exception as e:
        return f"Error: {str(e)}", None
|
| 454 |
"""
|
| 455 |
Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
|
| 456 |
|
|
|
|
| 562 |
|
| 563 |
# Enhanced bulk extract interface with configurable limits
|
| 564 |
bulk_limited_interface = gr.Interface(
|
| 565 |
+
|
| 566 |
+
fibwatch_interface = gr.Interface(
|
| 567 |
+
fn=fibwatch_latest_to_originals,
|
| 568 |
+
inputs=gr.Textbox(
|
| 569 |
+
label="Fibwatch Listing Page URL",
|
| 570 |
+
placeholder="https://fibwatch.art/videos/latest?page_id=1"
|
| 571 |
+
),
|
| 572 |
+
outputs=[
|
| 573 |
+
gr.Textbox(
|
| 574 |
+
label="Original CDN Links",
|
| 575 |
+
lines=15,
|
| 576 |
+
show_copy_button=True
|
| 577 |
+
),
|
| 578 |
+
gr.File(label="Download TXT")
|
| 579 |
+
],
|
| 580 |
+
title="Fibwatch Latest → Originals",
|
| 581 |
+
description="Extract all Original CDN links from Fibwatch listing page",
|
| 582 |
+
api_name="fibwatch_latest_scraper"
|
| 583 |
+
)
|
| 584 |
+
|
| 585 |
fn=extract_limited_content_as_zip,
|
| 586 |
inputs=[
|
| 587 |
gr.Textbox(
|
|
|
|
| 612 |
)
|
| 613 |
|
| 614 |
# Combine into tabbed interface
|
| 615 |
+
demo = gr.TabbedInterface(
|
| 616 |
+
[
|
| 617 |
+
scrape_interface,
|
| 618 |
+
sitemap_interface,
|
| 619 |
+
sitemap_limited_interface,
|
| 620 |
+
bulk_extract_interface,
|
| 621 |
+
bulk_limited_interface,
|
| 622 |
+
fibwatch_interface
|
| 623 |
+
],
|
| 624 |
+
[
|
| 625 |
+
"Content Scraper",
|
| 626 |
+
"All Links Sitemap",
|
| 627 |
+
"Limited Sitemap",
|
| 628 |
+
"Bulk Extractor",
|
| 629 |
+
"Limited Bulk Extractor",
|
| 630 |
+
"Fibwatch Scraper"
|
| 631 |
+
],
|
| 632 |
+
title="🕷️ Web Scraper MCP Server"
|
| 633 |
+
)
|
| 634 |
|
| 635 |
return demo
|
| 636 |
|
|
|
|
| 640 |
app = create_mcp_interface()
|
| 641 |
app.launch(
|
| 642 |
mcp_server=True
|
| 643 |
+
)
|