wuhp committed on
Commit
9ab61fb
·
verified ·
1 Parent(s): 4e52cce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -47
app.py CHANGED
@@ -6,7 +6,7 @@ import subprocess
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
- from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from functools import lru_cache
11
  import networkx as nx
12
  from pyvis.network import Network
@@ -21,17 +21,13 @@ session.headers.update({
21
  # --- VirusTotal helper (optional) ---
22
  def scan_url_vt(url, api_key):
23
  headers = {"x-apikey": api_key}
24
- resp = session.post(
25
- "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
26
- )
27
  resp.raise_for_status()
28
  analysis_id = resp.json()["data"]["id"]
29
  # Poll until complete
30
  while True:
31
  time.sleep(5)
32
- st = session.get(
33
- f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
34
- )
35
  st.raise_for_status()
36
  attr = st.json()["data"]["attributes"]
37
  if attr.get("status") == "completed":
@@ -39,11 +35,7 @@ def scan_url_vt(url, api_key):
39
 
40
  # --- FFprobe metadata extraction ---
41
  def extract_ffprobe_metadata(url_or_path):
42
- cmd = [
43
- "ffprobe", "-v", "error", "-print_format", "json",
44
- "-show_format", "-show_streams",
45
- url_or_path
46
- ]
47
  out = subprocess.check_output(cmd)
48
  return json.loads(out)
49
 
@@ -54,9 +46,8 @@ def fetch_page_metadata(url):
54
  resp = session.get(url, timeout=5)
55
  resp.raise_for_status()
56
  soup = BeautifulSoup(resp.text, "html.parser")
57
- meta = {"url": url, "title": soup.title.string if soup.title else None}
58
- # OpenGraph & twitter tags
59
- for tag in soup.find_all("meta"):
60
  prop = tag.get("property") or tag.get("name")
61
  if prop and prop.startswith(("og:", "twitter:")):
62
  meta[prop] = tag.get("content")
@@ -64,7 +55,7 @@ def fetch_page_metadata(url):
64
  except Exception as e:
65
  return {"url": url, "error": str(e)}
66
 
67
- # --- Fetch favicon for clickable graph nodes ---
68
  @lru_cache(maxsize=256)
69
  def fetch_favicon(url):
70
  try:
@@ -77,37 +68,42 @@ def fetch_favicon(url):
77
  except Exception:
78
  return None
79
 
80
- # --- Trace origins recursively up to max depth ---
81
- def trace_origins(description, max_depth=2):
82
  graph = nx.DiGraph()
 
 
 
83
  def recurse(url, depth):
84
  if depth > max_depth or url in graph:
85
  return
86
  info = fetch_page_metadata(url)
87
  favicon = fetch_favicon(url)
88
  graph.add_node(url, title=info.get("title"), favicon=favicon)
89
- links = []
90
- if "og:url" in info:
91
- links.append(info["og:url"])
 
92
  else:
93
  try:
94
- soup = BeautifulSoup(session.get(url, timeout=5).text, "html.parser")
 
95
  for a in soup.find_all("a", href=True):
96
- if a["href"].startswith("http"):
97
- links.append(a["href"])
 
98
  except:
99
  pass
100
- for link in set(links):
101
  graph.add_edge(link, url)
102
  recurse(link, depth + 1)
103
- seeds = re.findall(r"https?://[^\s\"<]+", description)
104
  for seed in seeds:
105
  recurse(seed, 1)
106
  return graph
107
 
108
- # --- Build PyVis network HTML ---
109
  def build_graph_html(graph):
110
- net = Network(height="500px", width="100%", directed=True)
111
  for url, data in graph.nodes(data=True):
112
  if data.get("favicon"):
113
  net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
@@ -117,27 +113,33 @@ def build_graph_html(graph):
117
  net.add_edge(src, dst)
118
  return net.generate_html()
119
 
120
- # --- Fetch IA items (movies) ---
121
  def fetch_clean_videos(keywords, api_key, scan_enabled):
122
- query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
 
123
  ia_query = f"mediatype:(movies) AND ({query})"
124
- results = list(search_items(ia_query))[:20]
125
- clean_urls = []
126
- for res in results:
127
- identifier = res["identifier"]
128
- item = get_item(identifier)
 
 
 
 
 
129
  for f in item.files:
130
  fmt = f.get("format", "").lower()
131
  if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
132
- url = f"https://archive.org/download/{identifier}/{f['name']}"
133
  if scan_enabled and api_key:
134
  try:
135
- if not scan_url_vt(url, api_key):
136
  continue
137
  except:
138
  continue
139
- clean_urls.append(url)
140
- return clean_urls
141
 
142
  # --- Gradio UI ---
143
  with gr.Blocks() as demo:
@@ -155,7 +157,7 @@ with gr.Blocks() as demo:
155
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
156
  origins_graph = gr.HTML(label="► Source‑Origin Graph")
157
 
158
- executor = ThreadPoolExecutor(max_workers=10)
159
 
160
  def search_and_populate(keywords, api_key, scan_enabled):
161
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
@@ -163,14 +165,20 @@ with gr.Blocks() as demo:
163
 
164
  def update_all(selected_url, ff_on, api_key):
165
  if not selected_url:
166
- return None, {}, {}, ""
167
- identifier = selected_url.split("/")[4]
 
 
 
168
  # 1) IA metadata
169
  try:
170
  item = get_item(identifier)
171
- raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
172
- except:
173
- raw_ia = {"error": "Could not fetch IA metadata"}
 
 
 
174
  # 2) FFprobe
175
  ff_md = {}
176
  if ff_on:
@@ -179,8 +187,8 @@ with gr.Blocks() as demo:
179
  except Exception as e:
180
  ff_md = {"error": str(e)}
181
  # 3) Origins
182
- desc = raw_ia.get("metadata", {}).get("description", "")
183
- graph = trace_origins(desc, max_depth=2)
184
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
185
  return selected_url, raw_ia, ff_md, graph_html
186
 
 
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
+ from concurrent.futures import ThreadPoolExecutor
10
  from functools import lru_cache
11
  import networkx as nx
12
  from pyvis.network import Network
 
21
  # --- VirusTotal helper (optional) ---
22
  def scan_url_vt(url, api_key):
23
  headers = {"x-apikey": api_key}
24
+ resp = session.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
 
 
25
  resp.raise_for_status()
26
  analysis_id = resp.json()["data"]["id"]
27
  # Poll until complete
28
  while True:
29
  time.sleep(5)
30
+ st = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
 
 
31
  st.raise_for_status()
32
  attr = st.json()["data"]["attributes"]
33
  if attr.get("status") == "completed":
 
35
 
36
# --- FFprobe metadata extraction ---
def extract_ffprobe_metadata(url_or_path):
    """Run ffprobe against a local path or URL and return its JSON metadata.

    Invokes the ``ffprobe`` binary with JSON output enabled and returns the
    parsed format/stream information as a dict.  Raises
    ``subprocess.CalledProcessError`` if ffprobe exits non-zero and
    ``FileNotFoundError`` if the binary is not installed.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-print_format", "json",
        "-show_format", "-show_streams",
        url_or_path,
    ]
    raw = subprocess.check_output(probe_cmd)
    return json.loads(raw)
41
 
 
46
  resp = session.get(url, timeout=5)
47
  resp.raise_for_status()
48
  soup = BeautifulSoup(resp.text, "html.parser")
49
+ meta = {"url": url, "title": soup.title.string.strip() if soup.title and soup.title.string else url}
50
+ for tag in soup.find_all("meta"): # OpenGraph & Twitter tags
 
51
  prop = tag.get("property") or tag.get("name")
52
  if prop and prop.startswith(("og:", "twitter:")):
53
  meta[prop] = tag.get("content")
 
55
  except Exception as e:
56
  return {"url": url, "error": str(e)}
57
 
58
+ # --- Fetch favicon for graph nodes ---
59
  @lru_cache(maxsize=256)
60
  def fetch_favicon(url):
61
  try:
 
68
  except Exception:
69
  return None
70
 
71
# --- Recursive origin tracing ---
def trace_origins(description_html, max_depth=2):
    """Build a directed graph of source origins for URLs found in a description.

    The description (HTML allowed) is flattened to plain text, seed URLs are
    extracted with a regex, and each seed is crawled recursively up to
    ``max_depth`` levels.  Edges point from a discovered link to the page it
    was found on.

    Args:
        description_html: HTML (or plain text) that may contain http(s) URLs.
        max_depth: maximum recursion depth per seed URL.

    Returns:
        ``networkx.DiGraph`` with one node per visited URL (annotated with
        ``title`` and ``favicon``) and edges link -> referring page.
    """
    graph = nx.DiGraph()
    # Strip markup so URL extraction is not confused by HTML attributes.
    desc_text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
    seeds = re.findall(r"https?://[^\s]+", desc_text)

    def recurse(url, depth):
        # Stop at the depth limit; skipping already-seen nodes also breaks
        # cycles between mutually-linking pages.
        if depth > max_depth or url in graph:
            return
        info = fetch_page_metadata(url)
        favicon = fetch_favicon(url)
        graph.add_node(url, title=info.get("title"), favicon=favicon)
        # Prefer the canonical og:url; otherwise fall back to scraping anchors.
        next_links = []
        if info.get("og:url"):
            next_links.append(info["og:url"])
        else:
            try:
                page = session.get(url, timeout=5).text
                soup = BeautifulSoup(page, "html.parser")
                for a in soup.find_all("a", href=True):
                    href = a["href"].strip()
                    if href.startswith("http"):
                        next_links.append(href)
            except Exception:
                # Best-effort crawl: a page that cannot be fetched or parsed
                # simply contributes no outgoing links.  (Was a bare ``except:``,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                pass
        for link in set(next_links):
            graph.add_edge(link, url)
            recurse(link, depth + 1)

    for seed in seeds:
        recurse(seed, 1)
    return graph
103
 
104
+ # --- Generate PyVis graph HTML ---
105
  def build_graph_html(graph):
106
+ net = Network(height="500px", width="100%", directed=True, notebook=False)
107
  for url, data in graph.nodes(data=True):
108
  if data.get("favicon"):
109
  net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
 
113
  net.add_edge(src, dst)
114
  return net.generate_html()
115
 
116
# --- Search and filter IA videos ---
def fetch_clean_videos(keywords, api_key, scan_enabled):
    """Search the Internet Archive for video files matching the keywords.

    Args:
        keywords: comma-separated search terms.
        api_key: VirusTotal API key (only used when ``scan_enabled``).
        scan_enabled: when truthy (and an api_key is given), each candidate
            download URL is scanned with VirusTotal and skipped if the scan
            fails or reports it unclean.

    Returns:
        List of direct archive.org download URLs for video files.
    """
    terms = [kw.strip() for kw in keywords.split(",")]
    query = " OR ".join(term.replace(" ", "+") for term in terms)
    ia_query = f"mediatype:(movies) AND ({query})"
    items = list(search_items(ia_query))[:20]  # cap the result set at 20 items
    urls = []
    for res in items:
        identifier = res.get("identifier")
        if not identifier:
            continue
        try:
            item = get_item(identifier)
        except Exception:
            continue  # unreachable/invalid item: skip it rather than abort
        for f in item.files:
            fmt = f.get("format", "").lower()
            if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
                video_url = f"https://archive.org/download/{identifier}/{f['name']}"
                if scan_enabled and api_key:
                    try:
                        if not scan_url_vt(video_url, api_key):
                            continue
                    except Exception:
                        # Was a bare ``except:``; treat any scan error as
                        # "unsafe" and skip the file (fail-closed).
                        continue
                urls.append(video_url)
    return urls
143
 
144
  # --- Gradio UI ---
145
  with gr.Blocks() as demo:
 
157
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
158
  origins_graph = gr.HTML(label="► Source‑Origin Graph")
159
 
160
+ executor = ThreadPoolExecutor(max_workers=5)
161
 
162
  def search_and_populate(keywords, api_key, scan_enabled):
163
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
 
165
 
166
  def update_all(selected_url, ff_on, api_key):
167
  if not selected_url:
168
+ return None, {}, {}, "<p>No data.</p>"
169
+ # extract identifier robustly
170
+ parsed = urlparse(selected_url)
171
+ parts = parsed.path.strip("/").split("/")
172
+ identifier = parts[1] if len(parts) > 1 else None
173
  # 1) IA metadata
174
  try:
175
  item = get_item(identifier)
176
+ raw_ia = {
177
+ "metadata": item.metadata,
178
+ "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]
179
+ }
180
+ except Exception as e:
181
+ raw_ia = {"error": f"Could not fetch IA metadata: {e}"}
182
  # 2) FFprobe
183
  ff_md = {}
184
  if ff_on:
 
187
  except Exception as e:
188
  ff_md = {"error": str(e)}
189
  # 3) Origins
190
+ desc_html = raw_ia.get("metadata", {}).get("description", "")
191
+ graph = trace_origins(desc_html, max_depth=2)
192
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
193
  return selected_url, raw_ia, ff_md, graph_html
194