wuhp committed on
Commit
e90bbf9
·
verified ·
1 Parent(s): 05c15d4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -111
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from internetarchive import search_items
3
  import requests
4
  import time
5
  import subprocess
@@ -15,35 +15,36 @@ session.headers.update({
15
  "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
16
  })
17
 
18
- # Cache IA metadata to avoid redundant requests
19
- @lru_cache(maxsize=128)
20
- def get_ia_metadata(identifier):
21
- resp = session.get(f"https://archive.org/metadata/{identifier}", timeout=10)
22
- resp.raise_for_status()
23
- return resp.json()
24
-
25
-
26
  def scan_url_vt(url, api_key):
27
  headers = {"x-apikey": api_key}
28
- resp = session.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
 
 
29
  resp.raise_for_status()
30
  analysis_id = resp.json()["data"]["id"]
31
- # Poll until complete
32
  while True:
33
  time.sleep(5)
34
- status_resp = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
 
 
35
  status_resp.raise_for_status()
36
  attr = status_resp.json()["data"]["attributes"]
37
  if attr.get("status") == "completed":
38
- return attr.get("stats", {}).get("malicious", 0) == 0
39
-
40
 
 
41
  def extract_ffprobe_metadata(url_or_path):
42
- cmd = ["ffprobe", "-v", "error", "-print_format", "json", "-show_format", "-show_streams", url_or_path]
 
 
 
 
43
  out = subprocess.check_output(cmd)
44
  return json.loads(out)
45
 
46
-
47
  def fetch_page_metadata(url):
48
  try:
49
  resp = session.get(url, timeout=5)
@@ -58,103 +59,123 @@ def fetch_page_metadata(url):
58
  except Exception as e:
59
  return {"url": url, "error": str(e)}
60
 
61
-
62
- def fetch_clean_videos(keywords, api_key=None, scan_enabled=False, max_results=30):
63
- query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
64
- ia_query = f'mediatype:(movies) AND ({query})'
65
- items = list(search_items(ia_query))[:max_results]
66
- clean_urls = []
67
-
68
- def process_item(res):
69
- identifier = res["identifier"]
70
- try:
71
- data = get_ia_metadata(identifier)
72
- for f in data.get("files", []):
73
- fmt = f.get("format", "").lower()
74
- if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
75
- url = f"https://archive.org/download/{identifier}/{f['name']}"
76
- if scan_enabled and api_key:
77
- if not scan_url_vt(url, api_key):
78
- continue
79
- return url
80
- except Exception:
81
- return None
82
- return None
83
-
84
- with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
85
- for url in executor.map(process_item, items):
86
- if url:
87
- clean_urls.append(url)
88
- return clean_urls
89
-
90
-
91
- def search_and_populate(keywords, api_key, scan_enabled):
92
- urls = fetch_clean_videos(keywords, api_key, scan_enabled)
93
- return gr.update(choices=urls, value=urls[0] if urls else None)
94
-
95
-
96
- def update_all(selected_url, ff_on, api_key):
97
- if not selected_url:
98
- return None, {}, {}, []
99
-
100
- parts = selected_url.split("/")
101
- identifier = parts[4] if len(parts) > 4 else None
102
-
103
- # IA metadata
104
- raw_ia = {}
105
- if identifier:
106
- try:
107
- data = get_ia_metadata(identifier)
108
- raw_ia = {
109
- "metadata": data.get("metadata", {}),
110
- "files": [
111
- {k: v for k, v in f.items() if k in ("name", "format", "size", "md5")}
112
- for f in data.get("files", [])
113
- ]
114
  }
115
- except Exception as e:
116
- raw_ia = {"error": str(e)}
117
-
118
- # FFprobe
119
- ff_md = {}
120
- if ff_on:
121
- try:
122
- ff_md = extract_ffprobe_metadata(selected_url)
123
- except Exception as e:
124
- ff_md = {"error": str(e)}
125
-
126
- # Origin tracing: first URL only
127
- origins = []
128
- description = raw_ia.get("metadata", {}).get("description", "")
129
- urls_found = re.findall(r'https?://[^\s"<]+'", description)
130
- if urls_found:
131
- origins.append(fetch_page_metadata(urls_found[0]))
132
-
133
- return selected_url, raw_ia, ff_md, origins
134
-
135
- with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 900px;}") as demo:
136
- gr.Markdown("## 📼 IA Drone‑Strike Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  with gr.Row():
138
- with gr.Column(scale=1):
139
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
140
- vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
141
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
142
- ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
143
- run_btn = gr.Button("Search & Scan", variant="primary")
144
- url_dropdown = gr.Dropdown(label="Select Video", choices=[], interactive=True)
145
- with gr.Column(scale=2):
146
- with gr.Tab("Video"):
147
- video_player = gr.Video(label="Video Player")
148
- with gr.Tab("IA Metadata"):
149
- ia_meta_json = gr.JSON(label="Raw IA Metadata")
150
- with gr.Tab("FFprobe"):
151
- ffprobe_json = gr.JSON(label="FFprobe Metadata")
152
- with gr.Tab("Origins"):
153
- origins_json = gr.JSON(label="Source‑Origin Metadata")
154
-
155
- run_btn.click(search_and_populate, [kw_input, vt_key_input, scan_toggle], [url_dropdown], show_progress=True)
156
- url_dropdown.change(update_all, [url_dropdown, ffprobe_toggle, vt_key_input],
157
- [video_player, ia_meta_json, ffprobe_json, origins_json])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
  if __name__ == "__main__":
160
  demo.launch()
 
1
  import gradio as gr
2
+ from internetarchive import search_items, get_item
3
  import requests
4
  import time
5
  import subprocess
 
15
  "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
16
  })
17
 
18
+ # --- VirusTotal scan ---
 
 
 
 
 
 
 
19
def scan_url_vt(url, api_key, max_polls=60, poll_interval=5):
    """Submit *url* to VirusTotal and block until the analysis completes.

    Parameters
    ----------
    url : str
        URL to scan.
    api_key : str
        VirusTotal v3 API key, sent as the ``x-apikey`` header.
    max_polls : int, optional
        Upper bound on status polls. The original version polled in an
        unbounded ``while True`` loop and could hang forever if VT never
        reported "completed"; the default (60 polls x 5 s = 5 min) bounds
        the wait while preserving normal behavior.
    poll_interval : float, optional
        Seconds to sleep between status polls.

    Returns
    -------
    bool
        True when the completed analysis reports zero "malicious" verdicts.

    Raises
    ------
    requests.HTTPError
        If the submission or a status request fails.
    TimeoutError
        If the analysis does not complete within ``max_polls`` polls.
    """
    headers = {"x-apikey": api_key}
    resp = session.post(
        "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
    )
    resp.raise_for_status()
    analysis_id = resp.json()["data"]["id"]
    for _ in range(max_polls):
        time.sleep(poll_interval)
        status_resp = session.get(
            f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
        )
        status_resp.raise_for_status()
        attr = status_resp.json()["data"]["attributes"]
        if attr.get("status") == "completed":
            stats = attr.get("stats", {})
            return stats.get("malicious", 0) == 0
    raise TimeoutError(f"VirusTotal analysis {analysis_id} did not complete in time")
36
 
37
+ # --- FFprobe metadata extraction ---
38
def extract_ffprobe_metadata(url_or_path, timeout=None):
    """Run ffprobe on a local path or URL and return its JSON report as a dict.

    Parameters
    ----------
    url_or_path : str
        File path or (http) URL that ffprobe can read.
    timeout : float | None, optional
        Seconds to allow ffprobe to run. ``None`` (default) preserves the
        original unbounded behavior; callers probing remote URLs should pass
        a finite value so a stalled download cannot hang the app.

    Returns
    -------
    dict
        Parsed ``-show_format``/``-show_streams`` output.

    Raises
    ------
    subprocess.CalledProcessError
        If ffprobe exits non-zero.
    subprocess.TimeoutExpired
        If ``timeout`` is given and exceeded.
    json.JSONDecodeError
        If ffprobe emits non-JSON output.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-print_format", "json",
        "-show_format", "-show_streams",
        url_or_path,
    ]
    # List form (shell=False): url_or_path is never interpreted by a shell.
    out = subprocess.check_output(cmd, timeout=timeout)
    return json.loads(out)
46
 
47
+ # --- Scrape page metadata (OpenGraph + title) ---
48
  def fetch_page_metadata(url):
49
  try:
50
  resp = session.get(url, timeout=5)
 
59
  except Exception as e:
60
  return {"url": url, "error": str(e)}
61
 
62
+ # --- Cache IA metadata to speed repeated fetches ---
63
# --- Cache IA metadata to speed repeated fetches ---
@lru_cache(maxsize=128)
def fetch_ia_metadata(identifier):
    """Fetch (and memoize) Internet Archive metadata for *identifier*.

    Returns a dict with:
      - "metadata": the item's raw IA metadata mapping
      - "files": file entries trimmed to the four fields the UI displays
        (name, format, size, md5)

    Fix: the previous comprehension explicitly selected name/format/size/md5
    and then re-spread every *other* key via
    ``**{k: v for k, v in f.items() if k not in (...)}`` — which made the
    selection a no-op and returned all keys. The explicit field list (and the
    pre-rewrite version, which filtered with ``if k in (...)``) shows the
    intent was to keep only those four keys, so the filter is restored here.

    Raises whatever ``internetarchive.get_item`` raises on network/API errors;
    callers are expected to catch.
    """
    item = get_item(identifier)
    return {
        "metadata": item.metadata,
        "files": [
            {key: f.get(key) for key in ("name", "format", "size", "md5")}
            for f in item.files
        ],
    }
79
+
80
+ # --- Search IA and optionally VT-scan in parallel ---
81
# --- Search IA and optionally VT-scan in parallel ---
def fetch_clean_videos(keywords, api_key, scan_enabled, max_results=50):
    """Search IA "movies" items for video files and return download URLs.

    Parameters
    ----------
    keywords : str
        Comma-separated search terms; OR-combined into one IA query.
    api_key : str
        VirusTotal API key (only used when ``scan_enabled``).
    scan_enabled : bool
        When True (and a key is given), keep only URLs whose VT scan
        reports no malicious verdicts.
    max_results : int, optional
        Cap on IA search results to inspect (generalizes the previously
        hard-coded 50; default preserves old behavior).

    Returns
    -------
    list[str]
        archive.org download URLs, VT-filtered when scanning is on.
    """
    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
    ia_query = f"mediatype:(movies) AND ({query})"
    results = list(search_items(ia_query))[:max_results]

    video_formats = ("mpeg", "mp4", "avi", "mov", "webm", "m4v")
    candidate_urls = []
    for res in results:
        identifier = res["identifier"]
        try:
            # Only list video files here; full metadata is fetched later.
            files = get_item(identifier).files
        except Exception:
            # Fix: one broken/unreachable IA item must not abort the whole
            # search (the pre-rewrite version guarded per item too).
            continue
        for f in files:
            fmt = f.get("format", "").lower()
            if fmt.startswith(video_formats):
                candidate_urls.append(
                    f"https://archive.org/download/{identifier}/{f['name']}"
                )

    if not (scan_enabled and api_key):
        return candidate_urls

    clean_urls = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_url = {
            executor.submit(scan_url_vt, url, api_key): url for url in candidate_urls
        }
        for fut in concurrent.futures.as_completed(future_to_url):
            try:
                if fut.result():
                    clean_urls.append(future_to_url[fut])
            except Exception:
                # A failed scan means "not verified clean" — skip, don't crash.
                pass
    return clean_urls
111
+
112
+ # --- Gradio UI ---
113
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📼 IA Drone‑Strike Explorer")
    with gr.Row():
        kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
        vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
        scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
        ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
        run_btn = gr.Button("🔍 Search & Scan", variant="primary")

    url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
    video_player = gr.Video(label="Video Preview")

    with gr.Tabs():
        with gr.TabItem("IA Metadata"):
            ia_meta_json = gr.JSON(label="Raw IA Metadata")
        with gr.TabItem("FFprobe"):
            ffprobe_json = gr.JSON(label="FFprobe Metadata")
        with gr.TabItem("Origins"):
            origins_json = gr.JSON(label="Source Origins")

    def search_and_populate(keywords, api_key, scan_enabled):
        """Run the IA search (+ optional VT scan) and refill the dropdown."""
        urls = fetch_clean_videos(keywords, api_key, scan_enabled)
        return gr.update(choices=urls, value=urls[0] if urls else None)

    def update_all(selected_url, ff_on, api_key):
        """Refresh the player and all three metadata panels for one URL."""
        if not selected_url:
            return None, {}, {}, []

        # Download URLs look like https://archive.org/download/<id>/<file>,
        # so index 4 of the "/" split is the IA identifier.
        parts = selected_url.split("/")
        identifier = parts[4] if len(parts) > 4 else None

        raw_ia = {"identifier": identifier}
        if identifier:
            try:
                raw_ia.update(fetch_ia_metadata(identifier))
            except Exception:
                raw_ia["error"] = "could not fetch IA metadata"

        ff_md = {}
        if ff_on:
            try:
                ff_md = extract_ffprobe_metadata(selected_url)
            except Exception as e:
                ff_md = {"error": str(e)}

        # Fix: IA "description" metadata is not guaranteed to be a string
        # (it can be a list of strings); re.findall raises TypeError on a
        # non-str and previously broke the whole Origins panel. Coerce first.
        desc = raw_ia.get("metadata", {}).get("description", "")
        if isinstance(desc, (list, tuple)):
            desc = " ".join(map(str, desc))
        elif not isinstance(desc, str):
            desc = str(desc)
        urls_found = re.findall(r'https?://[^\s"<]+', desc)

        origins = []
        if urls_found:
            # Trace at most 5 origin pages, fetched concurrently.
            with concurrent.futures.ThreadPoolExecutor() as executor:
                origins = list(executor.map(fetch_page_metadata, urls_found[:5]))

        return selected_url, raw_ia, ff_md, origins

    run_btn.click(
        search_and_populate,
        inputs=[kw_input, vt_key_input, scan_toggle],
        outputs=[url_dropdown],
    )
    url_dropdown.change(
        update_all,
        inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
        outputs=[video_player, ia_meta_json, ffprobe_json, origins_json],
    )
179
 
180
# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()