wuhp committed on
Commit
34285ab
·
verified ·
1 Parent(s): 07fb168

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -129
app.py CHANGED
@@ -1,163 +1,146 @@
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
- import requests
4
- import time
5
- import subprocess
6
- import json
7
- import re
8
  import networkx as nx
9
  from pyvis.network import Network
10
- from bs4 import BeautifulSoup
11
 
12
- # --- VirusTotal helper ---
 
 
 
 
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
- resp = requests.post(
16
- "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
17
- )
18
  resp.raise_for_status()
19
  analysis_id = resp.json()["data"]["id"]
20
  while True:
21
  time.sleep(5)
22
- status_resp = requests.get(
23
- f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
24
- )
25
- status_resp.raise_for_status()
26
- attr = status_resp.json()["data"]["attributes"]
27
- if attr.get("status") == "completed":
28
- return attr.get("stats", {}).get("malicious", 0) == 0
29
 
30
- # --- FFprobe ---
31
- def extract_ffprobe_metadata(url_or_path):
32
- cmd = [
33
- "ffprobe", "-v", "error", "-print_format", "json",
34
- "-show_format", "-show_streams",
35
- url_or_path
36
- ]
37
- out = subprocess.check_output(cmd)
38
  return json.loads(out)
39
 
40
- # --- Fetch page metadata + favicon ---
41
  def fetch_page_metadata(url):
42
  try:
43
- resp = requests.get(url, timeout=5)
44
- resp.raise_for_status()
45
- soup = BeautifulSoup(resp.text, "html.parser")
46
- meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- for tag in soup.find_all("meta"):
48
- prop = tag.get("property") or tag.get("name")
49
- if prop and prop.startswith(("og:", "twitter:")):
50
- meta[prop] = tag.get("content")
51
- # favicon
52
- icon = soup.find("link", rel=lambda x: x and "icon" in x)
53
- meta["favicon"] = requests.compat.urljoin(url, icon.get("href")) if icon else None
54
  return meta
55
  except Exception as e:
56
- return {"url": url, "error": str(e)}
57
 
58
- # --- IA search & filter raw footage ---
59
- NEWS_STATIONS = ["cnn", "fox", "bbc", "nbc", "al jazeera", "rt "]
60
- def fetch_raw_footage_urls(keywords, api_key, scan_enabled):
61
- query = " OR ".join([kw.strip().replace(' ', '+') for kw in keywords.split(",")])
62
- ia_query = f"mediatype:(movies) AND ({query})"
63
- results = list(search_items(ia_query))[:50]
64
- urls = []
65
- for res in results:
66
- item = get_item(res['identifier'])
67
- title = item.metadata.get("title", "").lower()
68
- if any(ns in title for ns in NEWS_STATIONS):
69
  continue
70
- for f in item.files:
71
- fmt = f.get('format','').lower()
72
- if fmt.startswith(('mpeg','mp4','avi','mov','webm','m4v')):
73
- url = f"https://archive.org/download/{res['identifier']}/{f['name']}"
 
74
  if scan_enabled and api_key:
75
  try:
76
- if not scan_url_vt(url, api_key):
77
- continue
78
  except:
79
  continue
80
- urls.append(url)
81
- return urls
 
 
 
82
 
83
- # --- Recursive origin tracing ---
84
- def trace_origins(description, depth=0, max_depth=3, visited=None):
85
- if visited is None: visited = set()
86
- nodes = []
87
- links = []
88
- urls = re.findall(r'https?://[^\s"<]+', description)
89
- for url in urls:
90
- if url in visited: continue
91
- visited.add(url)
92
- meta = fetch_page_metadata(url)
93
- nodes.append((url, meta))
94
- if depth < max_depth and 'description' in meta:
95
- sub_nodes, sub_links = trace_origins(meta.get('description',''), depth+1, max_depth, visited)
96
- links.extend(sub_links)
97
- nodes.extend(sub_nodes)
98
- # link from origin to IA later
99
- links.append((url, 'internet_archive'))
100
- return nodes, links
101
 
102
- # --- Build graph HTML via pyvis ---
103
- def build_graph(nodes, links):
104
- net = Network(height="400px", width="100%", directed=True)
105
- for url, meta in nodes + [('internet_archive', {'title':'Internet Archive'})]:
106
- label = meta.get('title') or url
107
- favicon = meta.get('favicon')
108
- net.add_node(url, label=label, title=json.dumps(meta), shape='image' if favicon else 'dot',
109
- image=favicon if favicon else None)
110
- for src, dst in links:
111
- net.add_edge(src, dst)
112
- net.force_atlas_2based()
113
- return net.generate_html()
114
-
115
- # --- Gradio UI ---
116
- with gr.Blocks() as demo:
117
- gr.Markdown("# IA Drone‑Strike Chain Explorer")
118
  with gr.Row():
119
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
120
- vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
121
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
122
- ffprobe_toggle = gr.Checkbox(label="Enable FFprobe", value=False)
123
- run_btn = gr.Button("Search & Scan")
124
 
125
- url_dd = gr.Dropdown(label="Raw Footage URLs", choices=[], interactive=True)
126
- video = gr.Video(label="Player")
127
- ia_meta = gr.JSON(label="IA Metadata")
128
- ff_meta = gr.JSON(label="FFprobe Metadata")
129
- graph_html = gr.HTML(label="Reupload Chain Graph")
130
- origin_meta = gr.JSON(label="Clicked Origin Metadata")
131
 
132
- def search_populate(kw, api_key, scan_on):
133
- urls = fetch_raw_footage_urls(kw, api_key, scan_on)
134
  return gr.update(choices=urls, value=urls[0] if urls else None)
135
 
136
- def on_select(url, ff_on, api_key):
137
- if not url: return None, {}, {}, "", {}
138
- # IA meta
139
- parts = url.split('/')
140
- ident = parts[4]
141
  item = get_item(ident)
142
- raw = {'metadata': item.metadata, 'files': [{k:v for k,v in f.items()} for f in item.files]}
143
- # ffprobe
144
- ff = extract_ffprobe_metadata(url) if ff_on else {}
145
- # origin trace
146
- nodes, links = trace_origins(item.metadata.get('description',''))
147
- nodes.append(('internet_archive', {'title':'Internet Archive'}))
148
- links = [(n[0],'internet_archive') for n in nodes if n[0] != 'internet_archive']
149
- html = build_graph(nodes, links)
150
- return url, raw, ff, html, {}
151
-
152
- def on_click_node(node_id):
153
- # find metadata in nodes list
154
- # simplistic: refetch page
155
- meta = fetch_page_metadata(node_id) if node_id != 'internet_archive' else {'title':'Internet Archive'}
156
- return meta
 
 
 
 
 
 
 
 
157
 
158
- run_btn.click(search_populate, [kw_input, vt_key, scan_toggle], [url_dd])
159
- url_dd.change(on_select, [url_dd, ffprobe_toggle, vt_key], [video, ia_meta, ff_meta, graph_html, origin_meta])
160
- graph_html.click(on_click_node, None, origin_meta)
161
 
162
- if __name__ == '__main__':
163
- demo.launch()
 
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
+ import requests, time, subprocess, json, re, tempfile, os
4
+ from bs4 import BeautifulSoup
 
 
 
5
  import networkx as nx
6
  from pyvis.network import Network
 
7
 
8
+ # --- SETTINGS ---
9
+ NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
10
+ THEME = "gradio/soft" # bring back the default soft theme
11
+
12
+ # --- VirusTotal scan (unchanged) ---
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
+ resp = requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
 
 
16
  resp.raise_for_status()
17
  analysis_id = resp.json()["data"]["id"]
18
  while True:
19
  time.sleep(5)
20
+ st = requests.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
21
+ attr = st.json()["data"]["attributes"]
22
+ if attr.get("status")=="completed":
23
+ return attr["stats"].get("malicious",0)==0
 
 
 
24
 
25
+ # --- FFprobe metadata (unchanged) ---
26
+ def extract_ffprobe_metadata(path):
27
+ out = subprocess.check_output([
28
+ "ffprobe","-v","error","-print_format","json","-show_format","-show_streams", path
29
+ ])
 
 
 
30
  return json.loads(out)
31
 
32
+ # --- Fetch page metadata + favicon URL ---
33
  def fetch_page_metadata(url):
34
  try:
35
+ r = requests.get(url, timeout=5); r.raise_for_status()
36
+ bs = BeautifulSoup(r.text,"html.parser")
37
+ meta = {"url":url, "title": bs.title.string if bs.title else ""}
38
+ # og: and twitter:
39
+ for m in bs.find_all("meta"):
40
+ p = m.get("property") or m.get("name")
41
+ if p and p.startswith(("og:","twitter:")):
42
+ meta[p] = m.get("content")
43
+ # find favicon
44
+ icon = bs.find("link", rel=lambda v:v and "icon" in v.lower())
45
+ meta["favicon"] = icon["href"] if icon else ""
46
  return meta
47
  except Exception as e:
48
+ return {"url":url, "error":str(e), "favicon":""}
49
 
50
+ # --- Core IA search + filter ---
51
+ def fetch_clean_videos(keywords, api_key, scan_enabled):
52
+ # build query
53
+ q = " OR ".join(kw.strip().replace(" ","+") for kw in keywords.split(","))
54
+ items = list(search_items(f"mediatype:(movies) AND ({q})"))[:50]
55
+ clean = []
56
+ for it in items:
57
+ title = it.get("title","").lower()
58
+ # filter out news
59
+ if any(re.search(p, title) for p in NEWS_FILTER):
 
60
  continue
61
+ # find video files
62
+ for f in get_item(it["identifier"]).files:
63
+ fmt = f.get("format","").lower()
64
+ if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
65
+ url = f"https://archive.org/download/{it['identifier']}/{f['name']}"
66
  if scan_enabled and api_key:
67
  try:
68
+ ok = scan_url_vt(url, api_key)
 
69
  except:
70
  continue
71
+ else:
72
+ ok = True
73
+ if ok:
74
+ clean.append(url)
75
+ return clean
76
 
77
+ # --- Build a PyVis graph and return its HTML path ---
78
+ def build_graph(chain):
79
+ G = Network(height="300px", width="100%", directed=True)
80
+ for node in chain:
81
+ label = node.get("metadata",{}).get("title","origin")
82
+ icon = node.get("metadata",{}).get("favicon","")
83
+ G.add_node(node["url"], label="", shape="image", image=icon or None, title=label)
84
+ # link them in order
85
+ for i in range(len(chain)-1):
86
+ G.add_edge(chain[i]["url"], chain[i+1]["url"])
87
+ tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
88
+ G.show(tmp.name)
89
+ return tmp.name
 
 
 
 
 
90
 
91
+ # --- UI ---
92
+ with gr.Blocks(theme=THEME) as demo:
93
+ gr.Markdown("## 📼 Raw-Footage Chain Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  with gr.Row():
95
+ kw = gr.Textbox("Keywords (comma-sep)", value="drone strike, military uav")
96
+ vt = gr.Textbox("VT API Key", type="password")
97
+ scan_toggle = gr.Checkbox("Enable VT scan", True)
98
+ ff_toggle = gr.Checkbox("Enable FFprobe", False)
99
+ run_btn = gr.Button("Search & Scan")
100
 
101
+ url_dd = gr.Dropdown("Clean Video URLs", choices=[])
102
+ vid_player = gr.Video()
103
+ ia_json = gr.JSON()
104
+ ff_json = gr.JSON()
105
+ graph_html = gr.HTML()
106
+ origin_meta = gr.JSON()
107
 
108
+ def search_and_populate(k, api, s):
109
+ urls = fetch_clean_videos(k, api, s)
110
  return gr.update(choices=urls, value=urls[0] if urls else None)
111
 
112
+ def update_all(sel, ff_on, api_key):
113
+ if not sel:
114
+ return None, {}, {}, "", {}
115
+ # 1) IA metadata + files
116
+ parts = sel.split("/"); ident = parts[4]
117
  item = get_item(ident)
118
+ raw = {"metadata":item.metadata, "files": [
119
+ {"name":f["name"], "format":f["format"], "size":f.get("size")}
120
+ for f in item.files
121
+ ]}
122
+ # 2) FFprobe
123
+ ffm = extract_ffprobe_metadata(sel) if ff_on else {}
124
+ # 3) trace origins
125
+ desc = raw["metadata"].get("description","")
126
+ urls = re.findall(r"https?://[^\s\"']+", desc)
127
+ chain = []
128
+ for u in urls:
129
+ m = fetch_page_metadata(u)
130
+ chain.append({"url":u, "metadata":m})
131
+ # finally add IA itself as last hop
132
+ chain.append({"url":sel, "metadata": {"title": raw["metadata"].get("title"), "favicon": ""}})
133
+ # 4) graph
134
+ gfile = build_graph(chain)
135
+ # 5) default show first origin metadata
136
+ om = chain[0]["metadata"] if chain else {}
137
+ # embed graph HTML
138
+ graph_data = open(gfile,"r",encoding="utf8").read()
139
+ os.unlink(gfile)
140
+ return sel, raw, ffm, graph_data, om
141
 
142
+ run_btn.click(search_and_populate, [kw, vt, scan_toggle], [url_dd])
143
+ url_dd.change(update_all, [url_dd, ff_toggle, vt],
144
+ [vid_player, ia_json, ff_json, graph_html, origin_meta])
145
 
146
+ demo.launch()