wuhp committed on
Commit ffdb027 · verified · 1 Parent(s): 44936dc

Update app.py

Files changed (1)
  1. app.py +109 -102
app.py CHANGED
@@ -1,15 +1,13 @@
  import gradio as gr
  from internetarchive import search_items, get_item
- import requests, time, subprocess, json, re, tempfile, os
+ import requests
+ import time
+ import subprocess
+ import json
+ import re
  from bs4 import BeautifulSoup
- import networkx as nx
- from pyvis.network import Network

- # --- SETTINGS ---
- NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
- THEME = "gradio/soft"
-
- # --- VirusTotal scan ---
+ # --- VirusTotal helper functions ---
  def scan_url_vt(url, api_key):
      headers = {"x-apikey": api_key}
      resp = requests.post(
@@ -19,133 +17,142 @@ def scan_url_vt(url, api_key):
      analysis_id = resp.json()["data"]["id"]
      while True:
          time.sleep(5)
-         status = requests.get(
+         status_resp = requests.get(
              f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
          )
-         attr = status.json()["data"]["attributes"]
+         status_resp.raise_for_status()
+         attr = status_resp.json()["data"]["attributes"]
          if attr.get("status") == "completed":
-             return attr["stats"].get("malicious", 0) == 0
+             stats = attr.get("stats", {})
+             return stats.get("malicious", 0) == 0

- # --- FFprobe metadata ---
- def extract_ffprobe_metadata(path):
-     output = subprocess.check_output([
-         "ffprobe", "-v", "error", "-print_format", "json", "-show_format", "-show_streams", path
-     ])
-     return json.loads(output)
+ # --- FFprobe metadata extraction ---
+ def extract_ffprobe_metadata(url_or_path):
+     cmd = [
+         "ffprobe", "-v", "error", "-print_format", "json",
+         "-show_format", "-show_streams",
+         url_or_path
+     ]
+     out = subprocess.check_output(cmd)
+     return json.loads(out)

- # --- Fetch page metadata + favicon ---
+ # --- Scrape basic page metadata (title + og: tags) ---
  def fetch_page_metadata(url):
      try:
-         r = requests.get(url, timeout=5)
-         r.raise_for_status()
-         soup = BeautifulSoup(r.text, "html.parser")
-         meta = {"url": url, "title": soup.title.string if soup.title else url}
+         resp = requests.get(url, timeout=5)
+         resp.raise_for_status()
+         html = resp.text
+         soup = BeautifulSoup(html, "html.parser")
+         meta = {"url": url, "title": soup.title.string if soup.title else None}
+         # grab OpenGraph tags
          for tag in soup.find_all("meta"):
-             key = tag.get("property") or tag.get("name")
-             if key and (key.startswith("og:") or key.startswith("twitter:")):
-                 meta[key] = tag.get("content")
-         icon = soup.find("link", rel=lambda v: v and "icon" in v.lower())
-         meta["favicon"] = icon.get("href") if icon else ""
+             prop = tag.get("property") or tag.get("name")
+             if prop and prop.startswith(("og:", "twitter:")):
+                 meta[prop] = tag.get("content")
          return meta
      except Exception as e:
-         return {"url": url, "error": str(e), "favicon": ""}
+         return {"url": url, "error": str(e)}

- # --- IA search + filter ---
+ # --- Core search & scan logic ---
  def fetch_clean_videos(keywords, api_key, scan_enabled):
-     query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
-     items = list(search_items(f"mediatype:(movies) AND ({query})"))[:50]
-     results = []
-     for item_meta in items:
-         title = item_meta.get("title", "").lower()
-         if any(re.search(p, title) for p in NEWS_FILTER):
-             continue
-         item = get_item(item_meta['identifier'])
+     query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
+     ia_query = f"mediatype:(movies) AND ({query})"
+     results = list(search_items(ia_query))[:50]
+
+     clean_urls = []
+     for res in results:
+         identifier = res["identifier"]
+         item = get_item(identifier)
          for f in item.files:
-             fmt = f.get('format', '').lower()
+             fmt = f.get("format", "").lower()
              if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
-                 url = f"https://archive.org/download/{item_meta['identifier']}/{f['name']}"
+                 url = f"https://archive.org/download/{identifier}/{f['name']}"
                  if scan_enabled and api_key:
                      try:
-                         if not scan_url_vt(url, api_key):
-                             continue
-                     except:
+                         is_clean = scan_url_vt(url, api_key)
+                     except Exception:
                          continue
-                 results.append(url)
-     return results
+                 else:
+                     is_clean = True
+                 if is_clean:
+                     clean_urls.append(url)
+     return clean_urls

- # --- Build graph HTML (using write_html) ---
- def build_graph_html(chain):
-     net = Network(height="300px", width="100%", directed=True)
-     for hop in chain:
-         url = hop['url']
-         meta = hop['metadata']
-         title = meta.get('title', url)
-         favicon = meta.get('favicon', '')
-         if favicon:
-             net.add_node(url, label="", shape='image', image=favicon, title=title)
-         else:
-             net.add_node(url, label=title, title=title)
-     for i in range(len(chain)-1):
-         net.add_edge(chain[i]['url'], chain[i+1]['url'])
-     tmp_path = tempfile.mktemp(suffix='.html')
-     net.write_html(tmp_path, notebook=False, open_browser=False)
-     with open(tmp_path, 'r', encoding='utf8') as f:
-         html = f.read()
-     os.remove(tmp_path)
-     return html
-
- # --- Gradio UI ---
- with gr.Blocks(theme=THEME) as demo:
-     gr.Markdown("# 📼 IA Drone‑Strike Explorer")
+ # --- Gradio UI setup ---
+ with gr.Blocks() as demo:
+     gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
      with gr.Row():
-         kw_input = gr.Textbox(label="Search keywords (comma-separated)", value="drone strike, military uav")
-         vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
-         scan_toggle = gr.Checkbox(label="Enable VirusTotal scan", value=True)
+         kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
+         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
+         scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
          ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
      run_btn = gr.Button("Search & Scan")

      url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
      video_player = gr.Video(label="Video Player")
-     ia_meta = gr.JSON(label="► Raw IA Metadata")
-     ff_meta = gr.JSON(label="► FFprobe Metadata")
-     graph_panel = gr.HTML(label="► Reupload Chain Graph")
-     origin_meta = gr.JSON(label="► Origin Node Metadata")
+     ia_meta_json = gr.JSON(label="► Raw IA Metadata")
+     ffprobe_json = gr.JSON(label="► FFprobe Metadata")
+     origins_json = gr.JSON(label="► Source‑Origin Metadata")

-     def search_and_populate(keywords, api_key, scan_on):
-         urls = fetch_clean_videos(keywords, api_key, scan_on)
+     def search_and_populate(keywords, api_key, scan_enabled):
+         urls = fetch_clean_videos(keywords, api_key, scan_enabled)
          return gr.update(choices=urls, value=urls[0] if urls else None)

-     def on_url_select(selected_url, ff_on, api_key):
+     def update_all(selected_url, ff_on, api_key):
+         # no selection guard
          if not selected_url:
-             return None, {}, {}, "", {}
-         # IA metadata + files
-         parts = selected_url.split('/')
-         ident = parts[4]
-         item = get_item(ident)
-         ia_data = {'metadata': item.metadata, 'files': item.files}
-         # FFprobe
-         ff_data = extract_ffprobe_metadata(selected_url) if ff_on else {}
-         # origin chain
-         desc = item.metadata.get('description', '')
-         urls = re.findall(r'https?://[^\s"<]+', desc)
-         chain = []
-         for u in urls:
-             chain.append({'url': u, 'metadata': fetch_page_metadata(u)})
-         # append IA as last hop
-         chain.append({'url': selected_url, 'metadata': {'title': item.metadata.get('title','')}})
-         graph_html = build_graph_html(chain)
-         origin_node = chain[0]['metadata'] if chain else {}
-         return selected_url, ia_data, ff_data, graph_html, origin_node
+             return None, {}, {}, []
+
+         # 1) IA metadata + file list
+         parts = selected_url.split("/")
+         identifier = parts[4] if len(parts) > 4 else None
+         raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
+         if identifier:
+             try:
+                 item = get_item(identifier)
+                 raw_ia["metadata"] = item.metadata
+                 raw_ia["files"] = [
+                     {
+                         "name": f.get("name"),
+                         "format": f.get("format"),
+                         "size": f.get("size"),
+                         "md5": f.get("md5"),
+                         **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
+                     }
+                     for f in item.files
+                 ]
+             except Exception:
+                 raw_ia["error"] = "could not fetch IA metadata"
+
+         # 2) FFprobe metadata if toggled
+         ff_md = {}
+         if ff_on:
+             try:
+                 ff_md = extract_ffprobe_metadata(selected_url)
+             except Exception as e:
+                 ff_md = {"error": str(e)}
+
+         # 3) Origin tracing: scrape each URL in description
+         origins = []
+         desc = raw_ia["metadata"].get("description", "")
+         urls_found = re.findall(r'https?://[^\s"<]+', desc)
+         for url in urls_found:
+             meta = fetch_page_metadata(url)
+             origins.append(meta)
+             # stop at first “real” origin (you can remove this break to collect all)
+             break
+
+         return selected_url, raw_ia, ff_md, origins

      run_btn.click(
-         search_and_populate,
-         inputs=[kw_input, vt_key, scan_toggle],
+         fn=search_and_populate,
+         inputs=[kw_input, vt_key_input, scan_toggle],
          outputs=[url_dropdown]
      )
      url_dropdown.change(
-         on_url_select,
-         inputs=[url_dropdown, ffprobe_toggle, vt_key],
-         outputs=[video_player, ia_meta, ff_meta, graph_panel, origin_meta]
+         fn=update_all,
+         inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
+         outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
      )

  if __name__ == "__main__":
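
One behavioral note on the updated scan_url_vt: the `while True` polling loop still has no upper bound, so a URL whose analysis never reaches "completed" will block a search indefinitely. Below is a minimal sketch of a bounded variant; it is not part of this commit, the function name and the `max_wait`/`poll_interval` parameters are hypothetical, and the POST submission call is assumed to be the standard VirusTotal v3 `/api/v3/urls` endpoint (the actual `requests.post` arguments in app.py fall outside the displayed hunks).

import time
import requests

def scan_url_vt_bounded(url, api_key, max_wait=120, poll_interval=5):
    # Hypothetical bounded variant of scan_url_vt; not part of this commit.
    headers = {"x-apikey": api_key}
    # Assumption: the app submits the URL via the standard v3 submission endpoint.
    resp = requests.post(
        "https://www.virustotal.com/api/v3/urls",
        headers=headers,
        data={"url": url},
        timeout=30,
    )
    resp.raise_for_status()
    analysis_id = resp.json()["data"]["id"]
    deadline = time.monotonic() + max_wait
    while time.monotonic() < deadline:
        time.sleep(poll_interval)
        status_resp = requests.get(
            f"https://www.virustotal.com/api/v3/analyses/{analysis_id}",
            headers=headers,
            timeout=30,
        )
        status_resp.raise_for_status()
        attr = status_resp.json()["data"]["attributes"]
        if attr.get("status") == "completed":
            # Treat the URL as clean only if no engine flagged it as malicious.
            return attr.get("stats", {}).get("malicious", 0) == 0
    raise TimeoutError(f"VirusTotal analysis {analysis_id} did not complete within {max_wait}s")

A caller such as fetch_clean_videos could catch TimeoutError alongside the existing `except Exception` and simply skip that URL, which keeps a single stuck analysis from stalling the whole search.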