Spaces:

wuhp
/

internetscrape

Sleeping

App Files Community

wuhp committed on Jul 18, 2025

Commit

de6b885

verified ·

1 Parent(s): e90bbf9

Update app.py

Browse files

Files changed (1) hide show

app.py +82 -80

app.py CHANGED Viewed

@@ -8,8 +8,10 @@ import re
 import concurrent.futures
 from bs4 import BeautifulSoup
 from functools import lru_cache
-# Reuse HTTP session for performance
 session = requests.Session()
 session.headers.update({
     "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
@@ -66,50 +68,27 @@ def fetch_ia_metadata(identifier):
     return {
         "metadata": item.metadata,
         "files": [
-            {
-                "name": f.get("name"),
-                "format": f.get("format"),
-                "size": f.get("size"),
-                "md5": f.get("md5"),
-                **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
-            }
             for f in item.files
         ]
     }
-# --- Search IA and optionally VT-scan in parallel ---
-def fetch_clean_videos(keywords, api_key, scan_enabled):
     query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
     ia_query = f"mediatype:(movies) AND ({query})"
-    results = list(search_items(ia_query))[:50]
-    candidate_urls = []
-    for res in results:
-        identifier = res["identifier"]
-        # only list video files; full metadata fetched later
-        for f in get_item(identifier).files:
-            fmt = f.get("format", "").lower()
-            if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
-                candidate_urls.append(
-                    f"https://archive.org/download/{identifier}/{f['name']}"
-                )
-    if scan_enabled and api_key:
-        clean_urls = []
-        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-            future_to_url = {executor.submit(scan_url_vt, url, api_key): url for url in candidate_urls}
-            for fut in concurrent.futures.as_completed(future_to_url):
-                url = future_to_url[fut]
-                try:
-                    if fut.result():
-                        clean_urls.append(url)
-                except Exception:
-                    pass
-        return clean_urls
-    return candidate_urls
-# --- Gradio UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📼 IA Drone‑Strike Explorer")
     with gr.Row():
@@ -117,65 +96,88 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
     scan_toggle    = gr.Checkbox(label="Enable VT scan", value=True)
     ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
-    run_btn        = gr.Button("🔍 Search & Scan", variant="primary")
-    url_dropdown   = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
-    video_player   = gr.Video(label="Video Preview")
     with gr.Tabs():
         with gr.TabItem("IA Metadata"):
-            ia_meta_json   = gr.JSON(label="Raw IA Metadata")
         with gr.TabItem("FFprobe"):
             ffprobe_json = gr.JSON(label="FFprobe Metadata")
-        with gr.TabItem("Origins"):
-            origins_json = gr.JSON(label="Source Origins")
-    def search_and_populate(keywords, api_key, scan_enabled):
-        urls = fetch_clean_videos(keywords, api_key, scan_enabled)
-        return gr.update(choices=urls, value=urls[0] if urls else None)
-    def update_all(selected_url, ff_on, api_key):
-        if not selected_url:
-            return None, {}, {}, []
-        parts = selected_url.split("/")
-        identifier = parts[4] if len(parts) > 4 else None
-        raw_ia = {"identifier": identifier}
-        if identifier:
-            try:
-                data = fetch_ia_metadata(identifier)
-                raw_ia.update(data)
-            except Exception:
-                raw_ia["error"] = "could not fetch IA metadata"
         ff_md = {}
         if ff_on:
             try:
-                ff_md = extract_ffprobe_metadata(selected_url)
             except Exception as e:
                 ff_md = {"error": str(e)}
-        desc = raw_ia.get("metadata", {}).get("description", "")
-        urls_found = re.findall(r'https?://[^\s"<]+' , desc)
-        origins = []
-        if urls_found:
-            with concurrent.futures.ThreadPoolExecutor() as executor:
-                for meta in executor.map(fetch_page_metadata, urls_found[:5]):
-                    origins.append(meta)
-        return selected_url, raw_ia, ff_md, origins
-    run_btn.click(
-        search_and_populate,
-        inputs=[kw_input, vt_key_input, scan_toggle],
-        outputs=[url_dropdown]
-    )
-    url_dropdown.change(
         update_all,
-        inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
-        outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
     )
 if __name__ == "__main__":
-    demo.launch()

 import concurrent.futures
 from bs4 import BeautifulSoup
 from functools import lru_cache
+from pyvis.network import Network
+from urllib.parse import urlparse
+# Persistent HTTP session for performance
 session = requests.Session()
 session.headers.update({
     "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
     return {
         "metadata": item.metadata,
         "files": [
+            {k: v for k, v in f.items() if k != "_checksum"}
             for f in item.files
         ]
     }
+# --- Search IA and return identifiers ---
+def fetch_identifiers(keywords):
     query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
     ia_query = f"mediatype:(movies) AND ({query})"
+    results = list(search_items(ia_query, fields=["identifier"]))[:50]
+    return [r["identifier"] for r in results]
+# --- List video files for a given item ---
+def list_files_for_identifier(identifier):
+    data = fetch_ia_metadata(identifier)
+    return [
+        f["name"] for f in data["files"]
+        if f.get("format", "").lower().startswith(("mpeg","mp4","avi","mov","webm","m4v"))
+    ]
+# --- Gradio UI setup ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 📼 IA Drone‑Strike Explorer")
     with gr.Row():
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
     scan_toggle    = gr.Checkbox(label="Enable VT scan", value=True)
     ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
+    run_btn        = gr.Button("🔍 Search Items", variant="primary")
+    id_dropdown   = gr.Dropdown(label="IA Item Identifiers", choices=[], interactive=True)
+    file_dropdown = gr.Dropdown(label="Video Files", choices=[], interactive=True)
+    video_player  = gr.Video(label="Video Preview")
     with gr.Tabs():
         with gr.TabItem("IA Metadata"):
+            ia_meta_json = gr.JSON(label="Raw IA Metadata")
         with gr.TabItem("FFprobe"):
             ffprobe_json = gr.JSON(label="FFprobe Metadata")
+        with gr.TabItem("Origins Graph"):
+            origins_graph = gr.HTML(label="Source Origins Graph")
+            origins_meta  = gr.JSON(label="Origins Metadata")
+    # 1) Fetch identifiers for search keywords
+    run_btn.click(
+        lambda kws: gr.update(choices=fetch_identifiers(kws), value=None),
+        inputs=[kw_input],
+        outputs=[id_dropdown]
+    )
+    # 2) Populate video files dropdown when an identifier is selected
+    id_dropdown.change(
+        lambda ident: gr.update(choices=list_files_for_identifier(ident), value=None),
+        inputs=[id_dropdown],
+        outputs=[file_dropdown]
+    )
+    # 3) When a file is selected, fetch metadata, run FFprobe (if toggled),
+    #    and build the clickable origins graph with circular favicon nodes.
+    def update_all(identifier, file_name, ff_on, api_key):
+        if not identifier or not file_name:
+            return None, {}, {}, "", []
+        url = f"https://archive.org/download/{identifier}/{file_name}"
+        # IA metadata (cached)
+        data = fetch_ia_metadata(identifier)
+        raw_ia = {"identifier": identifier, **data}
+        # FFprobe metadata
         ff_md = {}
         if ff_on:
             try:
+                ff_md = extract_ffprobe_metadata(url)
             except Exception as e:
                 ff_md = {"error": str(e)}
+        # Origins graph
+        desc = data["metadata"].get("description", "") or ""
+        urls = re.findall(r"https?://[^\s\"<]+", desc)
+        origins_list = []
+        net = Network(height="300px", width="100%", directed=True)
+        net.set_options('{"edges":{"arrows":"to"}}')
+        net.add_node(identifier, label=identifier, shape="ellipse")
+        for u in urls[:10]:
+            meta = fetch_page_metadata(u)
+            origins_list.append(meta)
+            dom = urlparse(u).netloc
+            fav = f"https://www.google.com/s2/favicons?domain={dom}"
+            net.add_node(
+                u,
+                label=dom,
+                shape="circularImage",
+                image=fav,
+                title=json.dumps(meta, indent=2),
+                href=u
+            )
+            net.add_edge(identifier, u)
+        graph_html = net.generate_html()
+        return url, raw_ia, ff_md, graph_html, origins_list
+    file_dropdown.change(
         update_all,
+        inputs=[id_dropdown, file_dropdown, ffprobe_toggle, vt_key_input],
+        outputs=[video_player, ia_meta_json, ffprobe_json, origins_graph, origins_meta]
     )
 if __name__ == "__main__":
+    demo.launch()