siam3310 committed on
Commit
029a1bc
·
verified ·
1 Parent(s): 11116c6

added fibwatch

Browse files
Files changed (1) hide show
  1. app.py +103 -6
app.py CHANGED
@@ -388,6 +388,69 @@ def extract_all_content_for_ui(url: str) -> Tuple[str, str]:
388
 
389
 
390
  def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
  """
392
  Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
393
 
@@ -499,6 +562,26 @@ def create_mcp_interface():
499
 
500
  # Enhanced bulk extract interface with configurable limits
501
  bulk_limited_interface = gr.Interface(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502
  fn=extract_limited_content_as_zip,
503
  inputs=[
504
  gr.Textbox(
@@ -529,11 +612,25 @@ def create_mcp_interface():
529
  )
530
 
531
  # Combine into tabbed interface
532
- demo = gr.TabbedInterface(
533
- [scrape_interface, sitemap_interface, sitemap_limited_interface, bulk_extract_interface, bulk_limited_interface],
534
- ["Content Scraper", "All Links Sitemap", "Limited Sitemap", "Bulk Extractor", "Limited Bulk Extractor"],
535
- title="🕷️ Web Scraper MCP Server"
536
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
537
 
538
  return demo
539
 
@@ -543,4 +640,4 @@ if __name__ == "__main__":
543
  app = create_mcp_interface()
544
  app.launch(
545
  mcp_server=True
546
- )
 
388
 
389
 
390
  def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
391
+
392
def fibwatch_latest_to_originals(url: str) -> Tuple[str, str]:
    """
    Scrape a Fibwatch listing page and collect "Original" CDN links.

    Fetches the listing page, follows every ``/watch/*.html`` link found on
    it, and harvests anchors that either point at a ``b-cdn.net`` host or
    whose visible link text is exactly "original".

    Args:
        url: Listing page URL; a missing scheme defaults to ``https://``.

    Returns:
        ``(markdown_summary, txt_file_path)``. The second element is ``None``
        when nothing was found or an error occurred (note: despite the
        declared annotation, callers must handle ``None``).
    """
    from urllib.parse import urlsplit  # local: only needed by this helper

    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        })

        # Fetch the listing page.
        res = session.get(url, timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        # Resolve relative /watch/ links against the page's own host —
        # generalizes the previously hard-coded "https://fibwatch.art"
        # while behaving identically for fibwatch.art URLs.
        parts = urlsplit(res.url or url)
        base = f"{parts.scheme}://{parts.netloc}"

        watch_links = set()
        for a in soup.find_all("a", href=True):
            href = a["href"]
            if href.startswith("/watch/") and href.endswith(".html"):
                watch_links.add(base + href)

        if not watch_links:
            return "❌ No watch links found.", None

        original_links = set()

        # Visit each watch page; a page that fails to download is skipped
        # rather than aborting the whole run. Sorted for a deterministic
        # crawl order (sets iterate in arbitrary order).
        for watch_url in sorted(watch_links):
            try:
                r = session.get(watch_url, timeout=10)
                r.raise_for_status()
                s = BeautifulSoup(r.text, "html.parser")
            except requests.RequestException:
                # Narrowed from a bare `except:` so real bugs still surface.
                continue

            for a in s.find_all("a", href=True):
                h = a["href"]
                txt = a.get_text(strip=True).lower()

                if "b-cdn.net" in h:
                    original_links.add(h)
                elif txt == "original" and h.startswith("http"):
                    original_links.add(h)

        if not original_links:
            return "❌ No Original CDN links found.", None

        # Deterministic output order for both the markdown and the file.
        ordered = sorted(original_links)

        md_out = "# Fibwatch Original Links\n\n"
        md_out += f"Found {len(ordered)} links:\n\n"
        for link in ordered:
            md_out += f"- {link}\n"

        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt", mode="w") as f:
            f.write("\n".join(ordered))
            file_path = f.name

        return md_out, file_path

    except Exception as e:
        # Top-level UI boundary: report the error instead of raising.
        return f"Error: {str(e)}", None
454
  """
455
  Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
456
 
 
562
 
563
  # Enhanced bulk extract interface with configurable limits
564
  bulk_limited_interface = gr.Interface(
565
+
566
# Tab: scrape a Fibwatch listing page and return the Original CDN links.
fibwatch_outputs = [
    gr.Textbox(
        label="Original CDN Links",
        lines=15,
        show_copy_button=True,
    ),
    gr.File(label="Download TXT"),
]
fibwatch_interface = gr.Interface(
    fn=fibwatch_latest_to_originals,
    inputs=gr.Textbox(
        label="Fibwatch Listing Page URL",
        placeholder="https://fibwatch.art/videos/latest?page_id=1",
    ),
    outputs=fibwatch_outputs,
    title="Fibwatch Latest → Originals",
    description="Extract all Original CDN links from Fibwatch listing page",
    api_name="fibwatch_latest_scraper",
)
584
+
585
  fn=extract_limited_content_as_zip,
586
  inputs=[
587
  gr.Textbox(
 
612
  )
613
 
614
  # Combine into tabbed interface
615
# Pair each tab's interface with its display title, then split the pairs
# into the two parallel lists gr.TabbedInterface expects — keeps interface
# and label visibly in sync when tabs are added or removed.
tab_specs = [
    (scrape_interface, "Content Scraper"),
    (sitemap_interface, "All Links Sitemap"),
    (sitemap_limited_interface, "Limited Sitemap"),
    (bulk_extract_interface, "Bulk Extractor"),
    (bulk_limited_interface, "Limited Bulk Extractor"),
    (fibwatch_interface, "Fibwatch Scraper"),
]
demo = gr.TabbedInterface(
    [iface for iface, _ in tab_specs],
    [label for _, label in tab_specs],
    title="🕷️ Web Scraper MCP Server",
)
634
 
635
  return demo
636
 
 
640
  app = create_mcp_interface()
641
  app.launch(
642
  mcp_server=True
643
+ )