Spaces:

wuhp
/

internetscrape

Sleeping

App Files Community

wuhp committed on Jul 18, 2025

Commit

40f1e90

verified ·

1 Parent(s): 34285ab

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -97

app.py CHANGED Viewed

@@ -7,9 +7,9 @@ from pyvis.network import Network
 # --- SETTINGS ---
 NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
-THEME = "gradio/soft"  # bring back the default soft theme
-# --- VirusTotal scan (unchanged) ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
     resp = requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
@@ -17,130 +17,125 @@ def scan_url_vt(url, api_key):
     analysis_id = resp.json()["data"]["id"]
     while True:
         time.sleep(5)
-        st = requests.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
-        attr = st.json()["data"]["attributes"]
-        if attr.get("status")=="completed":
-            return attr["stats"].get("malicious",0)==0
-# --- FFprobe metadata (unchanged) ---
 def extract_ffprobe_metadata(path):
-    out = subprocess.check_output([
-        "ffprobe","-v","error","-print_format","json","-show_format","-show_streams", path
     ])
-    return json.loads(out)
-# --- Fetch page metadata + favicon URL ---
 def fetch_page_metadata(url):
     try:
-        r = requests.get(url, timeout=5); r.raise_for_status()
-        bs = BeautifulSoup(r.text,"html.parser")
-        meta = {"url":url, "title": bs.title.string if bs.title else ""}
-        # og: and twitter:
-        for m in bs.find_all("meta"):
-            p = m.get("property") or m.get("name")
-            if p and p.startswith(("og:","twitter:")):
-                meta[p] = m.get("content")
-        # find favicon
-        icon = bs.find("link", rel=lambda v:v and "icon" in v.lower())
-        meta["favicon"] = icon["href"] if icon else ""
         return meta
     except Exception as e:
-        return {"url":url, "error":str(e), "favicon":""}
-# --- Core IA search + filter ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
-    # build query
-    q = " OR ".join(kw.strip().replace(" ","+") for kw in keywords.split(","))
-    items = list(search_items(f"mediatype:(movies) AND ({q})"))[:50]
-    clean = []
-    for it in items:
-        title = it.get("title","").lower()
-        # filter out news
         if any(re.search(p, title) for p in NEWS_FILTER):
             continue
-        # find video files
-        for f in get_item(it["identifier"]).files:
-            fmt = f.get("format","").lower()
             if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
-                url = f"https://archive.org/download/{it['identifier']}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
-                        ok = scan_url_vt(url, api_key)
                     except:
                         continue
-                else:
-                    ok = True
-                if ok:
-                    clean.append(url)
-    return clean
-# --- Build a PyVis graph and return its HTML path ---
-def build_graph(chain):
-    G = Network(height="300px", width="100%", directed=True)
-    for node in chain:
-        label = node.get("metadata",{}).get("title","origin")
-        icon = node.get("metadata",{}).get("favicon","")
-        G.add_node(node["url"], label="", shape="image", image=icon or None, title=label)
-    # link them in order
     for i in range(len(chain)-1):
-        G.add_edge(chain[i]["url"], chain[i+1]["url"])
-    tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
-    G.show(tmp.name)
-    return tmp.name
-# --- UI ---
 with gr.Blocks(theme=THEME) as demo:
-    gr.Markdown("## 📼 Raw-Footage Chain Explorer")
     with gr.Row():
-        kw  = gr.Textbox("Keywords (comma-sep)", value="drone strike, military uav")
-        vt  = gr.Textbox("VT API Key", type="password")
-    scan_toggle  = gr.Checkbox("Enable VT scan", True)
-    ff_toggle    = gr.Checkbox("Enable FFprobe", False)
-    run_btn      = gr.Button("Search & Scan")
-    url_dd       = gr.Dropdown("Clean Video URLs", choices=[])
-    vid_player   = gr.Video()
-    ia_json      = gr.JSON()
-    ff_json      = gr.JSON()
-    graph_html   = gr.HTML()
-    origin_meta  = gr.JSON()
-    def search_and_populate(k, api, s):
-        urls = fetch_clean_videos(k, api, s)
         return gr.update(choices=urls, value=urls[0] if urls else None)
-    def update_all(sel, ff_on, api_key):
-        if not sel:
             return None, {}, {}, "", {}
-        # 1) IA metadata + files
-        parts = sel.split("/"); ident = parts[4]
         item = get_item(ident)
-        raw = {"metadata":item.metadata, "files": [
-            {"name":f["name"], "format":f["format"], "size":f.get("size")}
-            for f in item.files
-        ]}
-        # 2) FFprobe
-        ffm = extract_ffprobe_metadata(sel) if ff_on else {}
-        # 3) trace origins
-        desc = raw["metadata"].get("description","")
-        urls = re.findall(r"https?://[^\s\"']+", desc)
         chain = []
         for u in urls:
-            m = fetch_page_metadata(u)
-            chain.append({"url":u, "metadata":m})
-        # finally add IA itself as last hop
-        chain.append({"url":sel, "metadata": {"title": raw["metadata"].get("title"), "favicon": ""}})
-        # 4) graph
-        gfile = build_graph(chain)
-        # 5) default show first origin metadata
-        om = chain[0]["metadata"] if chain else {}
-        # embed graph HTML
-        graph_data = open(gfile,"r",encoding="utf8").read()
-        os.unlink(gfile)
-        return sel, raw, ffm, graph_data, om
-    run_btn.click(search_and_populate, [kw, vt, scan_toggle], [url_dd])
-    url_dd.change(update_all, [url_dd, ff_toggle, vt],
-                  [vid_player, ia_json, ff_json, graph_html, origin_meta])
-demo.launch()

 # --- SETTINGS ---
 NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
+THEME = "gradio/soft"  # Default Gradio soft theme
+# --- VirusTotal scan ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
     resp = requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
     analysis_id = resp.json()["data"]["id"]
     while True:
         time.sleep(5)
+        status = requests.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
+        attr = status.json()["data"]["attributes"]
+        if attr.get("status") == "completed":
+            return attr["stats"].get("malicious", 0) == 0
+# --- FFprobe metadata ---
 def extract_ffprobe_metadata(path):
+    output = subprocess.check_output([
+        "ffprobe", "-v", "error", "-print_format", "json", "-show_format", "-show_streams", path
     ])
+    return json.loads(output)
+# --- Fetch page metadata + favicon ---
 def fetch_page_metadata(url):
     try:
+        r = requests.get(url, timeout=5)
+        r.raise_for_status()
+        soup = BeautifulSoup(r.text, "html.parser")
+        meta = {"url": url, "title": soup.title.string if soup.title else ""}
+        for tag in soup.find_all("meta"):
+            key = tag.get("property") or tag.get("name")
+            if key and (key.startswith("og:") or key.startswith("twitter:")):
+                meta[key] = tag.get("content")
+        icon = soup.find("link", rel=lambda v: v and "icon" in v.lower())
+        meta["favicon"] = icon.get("href") if icon else ""
         return meta
     except Exception as e:
+        return {"url": url, "error": str(e), "favicon": ""}
+# --- IA search + filter ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
+    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
+    items = list(search_items(f"mediatype:(movies) AND ({query})"))[:50]
+    results = []
+    for item_meta in items:
+        title = item_meta.get("title", "").lower()
         if any(re.search(p, title) for p in NEWS_FILTER):
             continue
+        item = get_item(item_meta['identifier'])
+        for f in item.files:
+            fmt = f.get('format', '').lower()
             if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
+                url = f"https://archive.org/download/{item_meta['identifier']}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
+                        if not scan_url_vt(url, api_key):
+                            continue
                     except:
                         continue
+                results.append(url)
+    return results
+# --- Build graph HTML ---
+def build_graph_html(chain):
+    net = Network(height="300px", width="100%", directed=True)
+    for hop in chain:
+        url = hop['url']
+        meta = hop['metadata']
+        title = meta.get('title', url)
+        favicon = meta.get('favicon', '')
+        if favicon:
+            net.add_node(url, label="", shape='image', image=favicon, title=title)
+        else:
+            net.add_node(url, label=title, title=title)
     for i in range(len(chain)-1):
+        net.add_edge(chain[i]['url'], chain[i+1]['url'])
+    tmpf = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
+    net.show(tmpf.name)
+    html = open(tmpf.name, 'r', encoding='utf8').read()
+    os.unlink(tmpf.name)
+    return html
+# --- Gradio UI ---
 with gr.Blocks(theme=THEME) as demo:
+    gr.Markdown("# 📼 IA Drone‑Strike Explorer")
     with gr.Row():
+        kw_input = gr.Textbox(label="Search keywords (comma-separated)", value="drone strike, military uav")
+        vt_key   = gr.Textbox(label="VirusTotal API Key", type="password")
+    scan_toggle    = gr.Checkbox(label="Enable VirusTotal scan", value=True)
+    ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
+    run_btn        = gr.Button(label="Search & Scan")
+    url_dropdown   = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
+    video_player   = gr.Video(label="Video Player")
+    ia_meta        = gr.JSON(label="► Raw IA Metadata")
+    ff_meta        = gr.JSON(label="► FFprobe Metadata")
+    graph_panel    = gr.HTML(label="► Reupload Chain Graph")
+    origin_meta    = gr.JSON(label="► Origin Node Metadata")
+    def search_and_populate(keywords, api_key, scan_on):
+        urls = fetch_clean_videos(keywords, api_key, scan_on)
         return gr.update(choices=urls, value=urls[0] if urls else None)
+    def on_url_select(selected_url, ff_on, api_key):
+        if not selected_url:
             return None, {}, {}, "", {}
+        # IA metadata + files
+        parts = selected_url.split('/')
+        ident = parts[4]
         item = get_item(ident)
+        ia_data = {'metadata': item.metadata, 'files': item.files}
+        # FFprobe
+        ff_data = extract_ffprobe_metadata(selected_url) if ff_on else {}
+        # origin chain
+        desc = item.metadata.get('description', '')
+        urls = re.findall(r'https?://[^\s"<]+', desc)
         chain = []
         for u in urls:
+            meta = fetch_page_metadata(u)
+            chain.append({'url': u, 'metadata': meta})
+        # append IA as last hop
+        chain.append({'url': selected_url, 'metadata': {'title': item.metadata.get('title','')} })
+        graph_html = build_graph_html(chain)
+        origin_node = chain[0]['metadata'] if chain else {}
+        return selected_url, ia_data, ff_data, graph_html, origin_node
+    run_btn.click(search_and_populate, [kw_input, vt_key, scan_toggle], [url_dropdown])
+    url_dropdown.change(on_url_select, [url_dropdown, ffprobe_toggle, vt_key],
+                       [video_player, ia_meta, ff_meta, graph_panel, origin_meta])
+if __name__ == "__main__":
+    demo.launch()