web-scraper

Sleeping

App Files Files Community

siam3310 committed on Jan 17

Commit

029a1bc

verified ·

1 Parent(s): 11116c6

added fibwatch

Browse files

Files changed (1) hide show

app.py +103 -6

app.py CHANGED Viewed

@@ -388,6 +388,69 @@ def extract_all_content_for_ui(url: str) -> Tuple[str, str]:
 def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
     """
     Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
@@ -499,6 +562,26 @@ def create_mcp_interface():
     # Enhanced bulk extract interface with configurable limits
     bulk_limited_interface = gr.Interface(
         fn=extract_limited_content_as_zip,
         inputs=[
             gr.Textbox(
@@ -529,11 +612,25 @@ def create_mcp_interface():
     )
     # Combine into tabbed interface
-    demo = gr.TabbedInterface(
-        [scrape_interface, sitemap_interface, sitemap_limited_interface, bulk_extract_interface, bulk_limited_interface],
-        ["Content Scraper", "All Links Sitemap", "Limited Sitemap", "Bulk Extractor", "Limited Bulk Extractor"],
-        title="🕷️ Web Scraper MCP Server"
-    )
     return demo
@@ -543,4 +640,4 @@ if __name__ == "__main__":
     app = create_mcp_interface()
     app.launch(
        mcp_server=True
-    )

 def extract_limited_content_as_zip(url: str, max_links: int) -> Tuple[str, str]:
+def fibwatch_latest_to_originals(url: str) -> Tuple[str, str]:
+    """
+    Collect "Original" CDN links from the watch pages linked on a Fibwatch listing page.
+    The listing page is fetched, every relative link of the form /watch/*.html found
+    on it is visited, and each anchor whose href contains "b-cdn.net" or whose text
+    is "original" (with an http(s) href) is collected.
+    Args:
+        url: Listing page URL; "https://" is prepended when no scheme is given.
+    Returns:
+        A (markdown, file_path) pair. file_path names a temporary .txt file holding
+        one link per line. When nothing is found or an error occurs, the first
+        element carries the message and file_path is None.
+    """
+    # Local import so this block does not depend on a file-level import being present.
+    from urllib.parse import urljoin
+    try:
+        url = url.strip()
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url
+        original_links = set()
+        # The context manager closes the session's pooled connections on every exit path.
+        with requests.Session() as session:
+            session.headers.update({
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
+            })
+            # fetch listing page
+            res = session.get(url, timeout=10)
+            res.raise_for_status()
+            soup = BeautifulSoup(res.text, "html.parser")
+            # collect watch links, resolved against the page actually served
+            # (after redirects) rather than a hard-coded host
+            watch_links = {
+                urljoin(res.url, a["href"])
+                for a in soup.find_all("a", href=True)
+                if a["href"].startswith("/watch/") and a["href"].endswith(".html")
+            }
+            if not watch_links:
+                return "❌ No watch links found.", None
+            # visit watch pages in a stable order
+            for watch_url in sorted(watch_links):
+                try:
+                    r = session.get(watch_url, timeout=10)
+                    r.raise_for_status()
+                    s = BeautifulSoup(r.text, "html.parser")
+                    for a in s.find_all("a", href=True):
+                        h = a["href"]
+                        txt = a.get_text(strip=True).lower()
+                        if "b-cdn.net" in h or (txt == "original" and h.startswith("http")):
+                            original_links.add(h)
+                except Exception:
+                    # Best effort: one failing watch page must not abort the whole run.
+                    # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
+                    continue
+        if not original_links:
+            return "❌ No Original CDN links found.", None
+        # Sort so the markdown and the TXT file list links in the same, repeatable order.
+        links = sorted(original_links)
+        md_out = (
+            "# Fibwatch Original Links\n\n"
+            f"Found {len(links)} links:\n\n"
+            + "".join(f"- {link}\n" for link in links)
+        )
+        # delete=False keeps the file on disk after the handle closes, since only
+        # its path is returned to the caller.
+        with tempfile.NamedTemporaryFile(
+            delete=False, suffix=".txt", mode="w", encoding="utf-8"
+        ) as f:
+            f.write("\n".join(links))
+            file_path = f.name
+        return md_out, file_path
+    except Exception as e:
+        return f"Error: {str(e)}", None
     """
     Wrapper function for Gradio UI that allows configurable link limits for bulk extraction.
     # Enhanced bulk extract interface with configurable limits
     bulk_limited_interface = gr.Interface(
+    # Fibwatch tab: takes a listing-page URL, shows the extracted links as text
+    # and offers them as a downloadable TXT file.
+    # Kept at function-body indentation so it stays inside create_mcp_interface().
+    fibwatch_interface = gr.Interface(
+        fn=fibwatch_latest_to_originals,
+        inputs=gr.Textbox(
+            label="Fibwatch Listing Page URL",
+            placeholder="https://fibwatch.art/videos/latest?page_id=1"
+        ),
+        outputs=[
+            gr.Textbox(
+                label="Original CDN Links",
+                lines=15,
+                show_copy_button=True
+            ),
+            gr.File(label="Download TXT")
+        ],
+        title="Fibwatch Latest → Originals",
+        description="Extract all Original CDN links from Fibwatch listing page",
+        api_name="fibwatch_latest_scraper"
+    )
         fn=extract_limited_content_as_zip,
         inputs=[
             gr.Textbox(
     )
     # Combine into tabbed interface
+    # One tab per interface; the two lists are positional, so the order of the
+    # interfaces must match the order of the tab titles.
+    # Kept at function-body indentation: the `return demo` that follows belongs
+    # to the same function and would otherwise be an unexpected indent.
+    demo = gr.TabbedInterface(
+        [
+            scrape_interface,
+            sitemap_interface,
+            sitemap_limited_interface,
+            bulk_extract_interface,
+            bulk_limited_interface,
+            fibwatch_interface
+        ],
+        [
+            "Content Scraper",
+            "All Links Sitemap",
+            "Limited Sitemap",
+            "Bulk Extractor",
+            "Limited Bulk Extractor",
+            "Fibwatch Scraper"
+        ],
+        title="🕷️ Web Scraper MCP Server"
+    )
     return demo
     app = create_mcp_interface()
     app.launch(
        mcp_server=True
+    )