Spaces:

wuhp
/

internetscrape

Sleeping

App Files Community

wuhp committed on Jul 18, 2025

Commit

e90bbf9

verified ·

1 Parent(s): 05c15d4

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -111

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from internetarchive import search_items
 import requests
 import time
 import subprocess
@@ -15,35 +15,36 @@ session.headers.update({
     "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
 })
-# Cache IA metadata to avoid redundant requests
-@lru_cache(maxsize=128)
-def get_ia_metadata(identifier):
-    resp = session.get(f"https://archive.org/metadata/{identifier}", timeout=10)
-    resp.raise_for_status()
-    return resp.json()
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
-    resp = session.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
     resp.raise_for_status()
     analysis_id = resp.json()["data"]["id"]
-    # Poll until complete
     while True:
         time.sleep(5)
-        status_resp = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
         status_resp.raise_for_status()
         attr = status_resp.json()["data"]["attributes"]
         if attr.get("status") == "completed":
-            return attr.get("stats", {}).get("malicious", 0) == 0
 def extract_ffprobe_metadata(url_or_path):
-    cmd = ["ffprobe", "-v", "error", "-print_format", "json", "-show_format", "-show_streams", url_or_path]
     out = subprocess.check_output(cmd)
     return json.loads(out)
 def fetch_page_metadata(url):
     try:
         resp = session.get(url, timeout=5)
@@ -58,103 +59,123 @@ def fetch_page_metadata(url):
     except Exception as e:
         return {"url": url, "error": str(e)}
-def fetch_clean_videos(keywords, api_key=None, scan_enabled=False, max_results=30):
-    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
-    ia_query = f'mediatype:(movies) AND ({query})'
-    items = list(search_items(ia_query))[:max_results]
-    clean_urls = []
-    def process_item(res):
-        identifier = res["identifier"]
-        try:
-            data = get_ia_metadata(identifier)
-            for f in data.get("files", []):
-                fmt = f.get("format", "").lower()
-                if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
-                    url = f"https://archive.org/download/{identifier}/{f['name']}"
-                    if scan_enabled and api_key:
-                        if not scan_url_vt(url, api_key):
-                            continue
-                    return url
-        except Exception:
-            return None
-        return None
-    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
-        for url in executor.map(process_item, items):
-            if url:
-                clean_urls.append(url)
-    return clean_urls
-def search_and_populate(keywords, api_key, scan_enabled):
-    urls = fetch_clean_videos(keywords, api_key, scan_enabled)
-    return gr.update(choices=urls, value=urls[0] if urls else None)
-def update_all(selected_url, ff_on, api_key):
-    if not selected_url:
-        return None, {}, {}, []
-    parts = selected_url.split("/")
-    identifier = parts[4] if len(parts) > 4 else None
-    # IA metadata
-    raw_ia = {}
-    if identifier:
-        try:
-            data = get_ia_metadata(identifier)
-            raw_ia = {
-                "metadata": data.get("metadata", {}),
-                "files": [
-                    {k: v for k, v in f.items() if k in ("name", "format", "size", "md5")}
-                    for f in data.get("files", [])
-                ]
             }
-        except Exception as e:
-            raw_ia = {"error": str(e)}
-    # FFprobe
-    ff_md = {}
-    if ff_on:
-        try:
-            ff_md = extract_ffprobe_metadata(selected_url)
-        except Exception as e:
-            ff_md = {"error": str(e)}
-    # Origin tracing: first URL only
-    origins = []
-    description = raw_ia.get("metadata", {}).get("description", "")
-    urls_found = re.findall(r'https?://[^\s"<]+'", description)
-    if urls_found:
-        origins.append(fetch_page_metadata(urls_found[0]))
-    return selected_url, raw_ia, ff_md, origins
-with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 900px;}") as demo:
-    gr.Markdown("## 📼 IA Drone‑Strike Explorer")
     with gr.Row():
-        with gr.Column(scale=1):
-            kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
-            vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
-            scan_toggle  = gr.Checkbox(label="Enable VT scan", value=True)
-            ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
-            run_btn      = gr.Button("Search & Scan", variant="primary")
-            url_dropdown = gr.Dropdown(label="Select Video", choices=[], interactive=True)
-        with gr.Column(scale=2):
-            with gr.Tab("Video"):
-                video_player = gr.Video(label="Video Player")
-            with gr.Tab("IA Metadata"):
-                ia_meta_json = gr.JSON(label="► Raw IA Metadata")
-            with gr.Tab("FFprobe"):
-                ffprobe_json = gr.JSON(label="► FFprobe Metadata")
-            with gr.Tab("Origins"):
-                origins_json = gr.JSON(label="► Source‑Origin Metadata")
-    run_btn.click(search_and_populate, [kw_input, vt_key_input, scan_toggle], [url_dropdown], show_progress=True)
-    url_dropdown.change(update_all, [url_dropdown, ffprobe_toggle, vt_key_input],
-                        [video_player, ia_meta_json, ffprobe_json, origins_json])
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
+from internetarchive import search_items, get_item
 import requests
 import time
 import subprocess
     "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
 })
+# --- VirusTotal scan ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
+    resp = session.post(
+        "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
+    )
     resp.raise_for_status()
     analysis_id = resp.json()["data"]["id"]
     while True:
         time.sleep(5)
+        status_resp = session.get(
+            f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
+        )
         status_resp.raise_for_status()
         attr = status_resp.json()["data"]["attributes"]
         if attr.get("status") == "completed":
+            stats = attr.get("stats", {})
+            return stats.get("malicious", 0) == 0
+# --- FFprobe metadata extraction ---
 def extract_ffprobe_metadata(url_or_path):
+    cmd = [
+        "ffprobe", "-v", "error", "-print_format", "json",
+        "-show_format", "-show_streams",
+        url_or_path
+    ]
     out = subprocess.check_output(cmd)
     return json.loads(out)
+# --- Scrape page metadata (OpenGraph + title) ---
 def fetch_page_metadata(url):
     try:
         resp = session.get(url, timeout=5)
     except Exception as e:
         return {"url": url, "error": str(e)}
+# --- Cache IA metadata to speed repeated fetches ---
+@lru_cache(maxsize=128)
+def fetch_ia_metadata(identifier):
+    item = get_item(identifier)
+    return {
+        "metadata": item.metadata,
+        "files": [
+            {
+                "name": f.get("name"),
+                "format": f.get("format"),
+                "size": f.get("size"),
+                "md5": f.get("md5"),
+                **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
             }
+            for f in item.files
+        ]
+    }
+# --- Search IA and optionally VT-scan in parallel ---
+def fetch_clean_videos(keywords, api_key, scan_enabled):
+    query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
+    ia_query = f"mediatype:(movies) AND ({query})"
+    results = list(search_items(ia_query))[:50]
+    candidate_urls = []
+    for res in results:
+        identifier = res["identifier"]
+        # only list video files; full metadata fetched later
+        for f in get_item(identifier).files:
+            fmt = f.get("format", "").lower()
+            if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
+                candidate_urls.append(
+                    f"https://archive.org/download/{identifier}/{f['name']}"
+                )
+    if scan_enabled and api_key:
+        clean_urls = []
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            future_to_url = {executor.submit(scan_url_vt, url, api_key): url for url in candidate_urls}
+            for fut in concurrent.futures.as_completed(future_to_url):
+                url = future_to_url[fut]
+                try:
+                    if fut.result():
+                        clean_urls.append(url)
+                except Exception:
+                    pass
+        return clean_urls
+    return candidate_urls
+# --- Gradio UI ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📼 IA Drone‑Strike Explorer")
     with gr.Row():
+        kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
+        vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
+    scan_toggle    = gr.Checkbox(label="Enable VT scan", value=True)
+    ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
+    run_btn        = gr.Button("🔍 Search & Scan", variant="primary")
+    url_dropdown   = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
+    video_player   = gr.Video(label="Video Preview")
+    with gr.Tabs():
+        with gr.TabItem("IA Metadata"):
+            ia_meta_json   = gr.JSON(label="Raw IA Metadata")
+        with gr.TabItem("FFprobe"):
+            ffprobe_json = gr.JSON(label="FFprobe Metadata")
+        with gr.TabItem("Origins"):
+            origins_json = gr.JSON(label="Source Origins")
+    def search_and_populate(keywords, api_key, scan_enabled):
+        urls = fetch_clean_videos(keywords, api_key, scan_enabled)
+        return gr.update(choices=urls, value=urls[0] if urls else None)
+    def update_all(selected_url, ff_on, api_key):
+        if not selected_url:
+            return None, {}, {}, []
+        parts = selected_url.split("/")
+        identifier = parts[4] if len(parts) > 4 else None
+        raw_ia = {"identifier": identifier}
+        if identifier:
+            try:
+                data = fetch_ia_metadata(identifier)
+                raw_ia.update(data)
+            except Exception:
+                raw_ia["error"] = "could not fetch IA metadata"
+        ff_md = {}
+        if ff_on:
+            try:
+                ff_md = extract_ffprobe_metadata(selected_url)
+            except Exception as e:
+                ff_md = {"error": str(e)}
+        desc = raw_ia.get("metadata", {}).get("description", "")
+        urls_found = re.findall(r'https?://[^\s"<]+' , desc)
+        origins = []
+        if urls_found:
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                for meta in executor.map(fetch_page_metadata, urls_found[:5]):
+                    origins.append(meta)
+        return selected_url, raw_ia, ff_md, origins
+    run_btn.click(
+        search_and_populate,
+        inputs=[kw_input, vt_key_input, scan_toggle],
+        outputs=[url_dropdown]
+    )
+    url_dropdown.change(
+        update_all,
+        inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
+        outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
+    )
 if __name__ == "__main__":
     demo.launch()