wuhp committed on
Commit
d4356c2
·
verified ·
1 Parent(s): ffdb027

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -70
app.py CHANGED
@@ -6,27 +6,35 @@ import subprocess
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
 
 
 
 
 
 
 
 
 
 
9
 
10
- # --- VirusTotal helper functions ---
11
  def scan_url_vt(url, api_key):
12
  headers = {"x-apikey": api_key}
13
- resp = requests.post(
14
  "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
15
  )
16
  resp.raise_for_status()
17
  analysis_id = resp.json()["data"]["id"]
 
18
  while True:
19
  time.sleep(5)
20
- status_resp = requests.get(
21
- f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
22
- )
23
- status_resp.raise_for_status()
24
- attr = status_resp.json()["data"]["attributes"]
25
  if attr.get("status") == "completed":
26
- stats = attr.get("stats", {})
27
- return stats.get("malicious", 0) == 0
28
 
29
- # --- FFprobe metadata extraction ---
30
  def extract_ffprobe_metadata(url_or_path):
31
  cmd = [
32
  "ffprobe", "-v", "error", "-print_format", "json",
@@ -36,15 +44,15 @@ def extract_ffprobe_metadata(url_or_path):
36
  out = subprocess.check_output(cmd)
37
  return json.loads(out)
38
 
39
- # --- Scrape basic page metadata (title + og: tags) ---
 
40
  def fetch_page_metadata(url):
41
  try:
42
- resp = requests.get(url, timeout=5)
43
  resp.raise_for_status()
44
- html = resp.text
45
- soup = BeautifulSoup(html, "html.parser")
46
  meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- # grab OpenGraph tags
48
  for tag in soup.find_all("meta"):
49
  prop = tag.get("property") or tag.get("name")
50
  if prop and prop.startswith(("og:", "twitter:")):
@@ -53,38 +61,86 @@ def fetch_page_metadata(url):
53
  except Exception as e:
54
  return {"url": url, "error": str(e)}
55
 
56
- # --- Core search & scan logic ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def fetch_clean_videos(keywords, api_key, scan_enabled):
58
  query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
59
  ia_query = f"mediatype:(movies) AND ({query})"
60
- results = list(search_items(ia_query))[:50]
61
-
62
  clean_urls = []
63
  for res in results:
64
- identifier = res["identifier"]
65
  item = get_item(identifier)
66
  for f in item.files:
67
- fmt = f.get("format", "").lower()
68
- if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
69
  url = f"https://archive.org/download/{identifier}/{f['name']}"
70
  if scan_enabled and api_key:
71
  try:
72
- is_clean = scan_url_vt(url, api_key)
73
- except Exception:
 
74
  continue
75
- else:
76
- is_clean = True
77
- if is_clean:
78
- clean_urls.append(url)
79
  return clean_urls
80
 
81
- # --- Gradio UI setup ---
82
  with gr.Blocks() as demo:
83
- gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
84
  with gr.Row():
85
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
86
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
87
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
88
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
89
  run_btn = gr.Button("Search & Scan")
90
 
@@ -92,57 +148,37 @@ with gr.Blocks() as demo:
92
  video_player = gr.Video(label="Video Player")
93
  ia_meta_json = gr.JSON(label="► Raw IA Metadata")
94
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
95
- origins_json = gr.JSON(label="► Source‑Origin Metadata")
 
 
96
 
97
  def search_and_populate(keywords, api_key, scan_enabled):
98
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
99
  return gr.update(choices=urls, value=urls[0] if urls else None)
100
 
101
  def update_all(selected_url, ff_on, api_key):
102
- # no selection guard
103
  if not selected_url:
104
- return None, {}, {}, []
105
-
106
- # 1) IA metadata + file list
107
- parts = selected_url.split("/")
108
- identifier = parts[4] if len(parts) > 4 else None
109
- raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
110
- if identifier:
111
- try:
112
- item = get_item(identifier)
113
- raw_ia["metadata"] = item.metadata
114
- raw_ia["files"] = [
115
- {
116
- "name": f.get("name"),
117
- "format": f.get("format"),
118
- "size": f.get("size"),
119
- "md5": f.get("md5"),
120
- **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
121
- }
122
- for f in item.files
123
- ]
124
- except Exception:
125
- raw_ia["error"] = "could not fetch IA metadata"
126
-
127
- # 2) FFprobe metadata if toggled
128
  ff_md = {}
129
  if ff_on:
130
  try:
131
  ff_md = extract_ffprobe_metadata(selected_url)
132
  except Exception as e:
133
  ff_md = {"error": str(e)}
134
-
135
- # 3) Origin tracing: scrape each URL in description
136
- origins = []
137
- desc = raw_ia["metadata"].get("description", "")
138
- urls_found = re.findall(r'https?://[^\s"<]+', desc)
139
- for url in urls_found:
140
- meta = fetch_page_metadata(url)
141
- origins.append(meta)
142
- # stop at first “real” origin (you can remove this break to collect all)
143
- break
144
-
145
- return selected_url, raw_ia, ff_md, origins
146
 
147
  run_btn.click(
148
  fn=search_and_populate,
@@ -152,8 +188,8 @@ with gr.Blocks() as demo:
152
  url_dropdown.change(
153
  fn=update_all,
154
  inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
155
- outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
156
  )
157
 
158
  if __name__ == "__main__":
159
- demo.launch()
 
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
+ from concurrent.futures import ThreadPoolExecutor, as_completed
10
+ from functools import lru_cache
11
+ import networkx as nx
12
+ from pyvis.network import Network
13
+ from urllib.parse import urlparse
14
# --- Shared HTTP session: connection reuse plus a stable User-Agent ---
session = requests.Session()
session.headers.update(
    {"User-Agent": "Mozilla/5.0 (compatible; IA-Video-Meta-Explorer/1.0)"}
)
19
 
20
+ \# --- VirusTotal helper (optional) ---
21
def scan_url_vt(url, api_key, max_wait=300):
    """Submit *url* to VirusTotal and poll until the analysis completes.

    Args:
        url: The URL to scan.
        api_key: VirusTotal API key (sent as the ``x-apikey`` header).
        max_wait: Maximum seconds to poll before giving up (new, defaulted;
            the previous implementation could poll forever).

    Returns:
        True when the completed analysis reports zero "malicious" verdicts.

    Raises:
        requests.HTTPError: If the submit or poll request fails.
        TimeoutError: If the analysis does not complete within *max_wait*.
    """
    headers = {"x-apikey": api_key}
    resp = session.post(
        "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
    )
    resp.raise_for_status()
    analysis_id = resp.json()["data"]["id"]
    # Poll until the analysis finishes; VT reports status "queued" until then.
    deadline = time.monotonic() + max_wait
    while True:
        time.sleep(5)
        st = session.get(
            f"https://www.virustotal.com/api/v3/analyses/{analysis_id}",
            headers=headers,
        )
        st.raise_for_status()
        attr = st.json()["data"]["attributes"]
        if attr.get("status") == "completed":
            return attr.get("stats", {}).get("malicious", 0) == 0
        if time.monotonic() > deadline:
            # The old loop had no exit condition other than completion.
            raise TimeoutError(f"VirusTotal analysis {analysis_id} did not complete")
 
36
 
37
+ \# --- FFprobe metadata extraction ---
38
  def extract_ffprobe_metadata(url_or_path):
39
  cmd = [
40
  "ffprobe", "-v", "error", "-print_format", "json",
 
44
  out = subprocess.check_output(cmd)
45
  return json.loads(out)
46
 
47
+ \# --- Caching page metadata ---
48
+ @lru_cache(maxsize=256)
49
  def fetch_page_metadata(url):
50
  try:
51
+ resp = session.get(url, timeout=5)
52
  resp.raise_for_status()
53
+ soup = BeautifulSoup(resp.text, "html.parser")
 
54
  meta = {"url": url, "title": soup.title.string if soup.title else None}
55
+ # OpenGraph & twitter
56
  for tag in soup.find_all("meta"):
57
  prop = tag.get("property") or tag.get("name")
58
  if prop and prop.startswith(("og:", "twitter:")):
 
61
  except Exception as e:
62
  return {"url": url, "error": str(e)}
63
 
64
+ \# --- Fetch favicon for clickable graph nodes ---
65
+ @lru_cache(maxsize=256)
66
@lru_cache(maxsize=256)
def fetch_favicon(url):
    """Return the ``/favicon.ico`` URL for *url*'s host, or None on any failure.

    Results are cached (lru_cache) so repeated graph nodes on the same host
    do not re-fetch the icon.
    """
    try:
        # Parse once instead of calling urlparse() twice as before.
        parts = urlparse(url)
        ico_url = f"{parts.scheme}://{parts.netloc}/favicon.ico"
        resp = session.get(ico_url, timeout=3)
        resp.raise_for_status()
        return ico_url
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return None
75
+
76
+ \# --- Trace origins recursively up to a max depth ---
77
def trace_origins(description, max_depth=2, executor=None):
    """Build a directed graph of candidate source pages for an IA item.

    URLs found in *description* are the seeds; each visited page contributes
    either its ``og:url`` (preferred) or its outbound ``<a href>`` links as
    parent nodes pointing at the page that referenced them, recursing up to
    *max_depth* hops from each seed.

    Args:
        description: Free-text IA item description mined for seed URLs.
        max_depth: Maximum recursion depth from each seed.
        executor: Unused; accepted for call-site compatibility.

    Returns:
        networkx.DiGraph whose nodes carry ``title`` and ``favicon`` attrs;
        an edge (src, dst) means "src appears to be an origin of dst".
    """
    graph = nx.DiGraph()
    visited = set()

    def _recurse(url, depth):
        # BUG FIX: the old guard was `url in graph`, but add_edge(link, url)
        # inserts `link` as a node *before* _recurse(link) runs, so every
        # node past depth 1 looked already-seen and was never fetched.
        # Track visits explicitly instead.
        if depth > max_depth or url in visited:
            return
        visited.add(url)
        info = fetch_page_metadata(url)
        graph.add_node(url, title=info.get("title"), favicon=fetch_favicon(url))
        # Prefer the canonical og:url; otherwise scrape outbound anchors.
        links = []
        if "og:url" in info:
            links.append(info["og:url"])
        else:
            try:
                soup = BeautifulSoup(session.get(url, timeout=5).text, "html.parser")
                links = [
                    a["href"]
                    for a in soup.find_all("a", href=True)
                    if a["href"].startswith("http")
                ]
            except Exception:
                # Narrowed from bare `except:`; unreachable page keeps its
                # node but contributes no edges.
                pass
        for link in set(links):
            graph.add_edge(link, url)
            _recurse(link, depth + 1)

    for seed in re.findall(r'https?://[^\s"<]+', description):
        _recurse(seed, 1)
    return graph
105
+
106
+ \# --- Build PyVis network HTML ---
107
def build_graph_html(graph):
    """Render an origin graph as embeddable PyVis HTML (500px tall, directed)."""
    net = Network(height="500px", width="100%", directed=True)
    # Nodes: favicon image when available, plain ellipse otherwise.
    for node_url, attrs in graph.nodes(data=True):
        icon = attrs.get("favicon")
        net.add_node(
            node_url,
            label=attrs.get("title") or node_url,
            title=node_url,
            shape="image" if icon else "ellipse",
            image=icon,
        )
    for source, target in graph.edges():
        net.add_edge(source, target)
    return net.generate_html()
114
+
115
+ \# --- Fetch IA items (movies) ---
116
def fetch_clean_videos(keywords, api_key, scan_enabled):
    """Search archive.org movies for *keywords* and return playable file URLs.

    Args:
        keywords: Comma-separated search terms, OR-combined in the IA query.
        api_key: VirusTotal key; only consulted when *scan_enabled* is true.
        scan_enabled: When true (and a key is given), drop any URL that
            VirusTotal flags as malicious or that fails to scan.

    Returns:
        List of direct ``https://archive.org/download/...`` video URLs.
    """
    video_formats = ("mpeg", "mp4", "avi", "mov", "webm", "m4v")
    # The old code wrapped each term in a redundant f-string.
    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
    ia_query = f"mediatype:(movies) AND ({query})"
    results = list(search_items(ia_query))[:20]  # cap to keep the UI responsive
    clean_urls = []
    for res in results:
        identifier = res['identifier']
        item = get_item(identifier)
        for f in item.files:
            fmt = f.get('format', '').lower()
            if not fmt.startswith(video_formats):
                continue
            url = f"https://archive.org/download/{identifier}/{f['name']}"
            if scan_enabled and api_key:
                try:
                    if not scan_url_vt(url, api_key):
                        continue  # flagged as malicious
                except Exception:
                    # Narrowed from a bare `except:`; a failed scan means
                    # the URL is skipped, matching prior best-effort intent.
                    continue
            clean_urls.append(url)
    return clean_urls
136
 
137
+ \# --- Gradio UI ---
138
  with gr.Blocks() as demo:
139
+ gr.Markdown("# 📼 IA Drone‑Strike Explorer Enhanced Metadata & Origin Tracing")
140
  with gr.Row():
141
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
142
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
143
+ scan_toggle = gr.Checkbox(label="Enable VT scan", value=False)
144
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
145
  run_btn = gr.Button("Search & Scan")
146
 
 
148
  video_player = gr.Video(label="Video Player")
149
  ia_meta_json = gr.JSON(label="► Raw IA Metadata")
150
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
151
+ origins_graph = gr.HTML(label="► Source‑Origin Graph")
152
+
153
+ executor = ThreadPoolExecutor(max_workers=10)
154
 
155
  def search_and_populate(keywords, api_key, scan_enabled):
156
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
157
  return gr.update(choices=urls, value=urls[0] if urls else None)
158
 
159
  def update_all(selected_url, ff_on, api_key):
 
160
  if not selected_url:
161
+ return None, {}, {}, ""
162
+ identifier = selected_url.split("/")[4]
163
+ # 1) IA metadata
164
+ raw_ia = {}
165
+ try:
166
+ item = get_item(identifier)
167
+ raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
168
+ except:
169
+ raw_ia = {"error": "Could not fetch IA metadata"}
170
+ # 2) FFprobe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
  ff_md = {}
172
  if ff_on:
173
  try:
174
  ff_md = extract_ffprobe_metadata(selected_url)
175
  except Exception as e:
176
  ff_md = {"error": str(e)}
177
+ # 3) Origins
178
+ desc = raw_ia.get("metadata", {}).get("description", "")
179
+ graph = trace_origins(desc, max_depth=2, executor=executor)
180
+ graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
181
+ return selected_url, raw_ia, ff_md, graph_html
 
 
 
 
 
 
 
182
 
183
  run_btn.click(
184
  fn=search_and_populate,
 
188
  url_dropdown.change(
189
  fn=update_all,
190
  inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
191
+ outputs=[video_player, ia_meta_json, ffprobe_json, origins_graph]
192
  )
193
 
194
  if __name__ == "__main__":
195
+ demo.launch(server_port=7860, share=False)