wuhp committed on
Commit
44936dc
·
verified ·
1 Parent(s): de6b885

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +118 -149
app.py CHANGED
@@ -1,183 +1,152 @@
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
- import requests
4
- import time
5
- import subprocess
6
- import json
7
- import re
8
- import concurrent.futures
9
  from bs4 import BeautifulSoup
10
- from functools import lru_cache
11
  from pyvis.network import Network
12
- from urllib.parse import urlparse
13
 
14
- # Persistent HTTP session for performance
15
- session = requests.Session()
16
- session.headers.update({
17
- "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
18
- })
19
 
20
  # --- VirusTotal scan ---
21
  def scan_url_vt(url, api_key):
22
  headers = {"x-apikey": api_key}
23
- resp = session.post(
24
  "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
25
  )
26
  resp.raise_for_status()
27
  analysis_id = resp.json()["data"]["id"]
28
  while True:
29
  time.sleep(5)
30
- status_resp = session.get(
31
  f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
32
  )
33
- status_resp.raise_for_status()
34
- attr = status_resp.json()["data"]["attributes"]
35
  if attr.get("status") == "completed":
36
- stats = attr.get("stats", {})
37
- return stats.get("malicious", 0) == 0
38
 
39
- # --- FFprobe metadata extraction ---
40
- def extract_ffprobe_metadata(url_or_path):
41
- cmd = [
42
- "ffprobe", "-v", "error", "-print_format", "json",
43
- "-show_format", "-show_streams",
44
- url_or_path
45
- ]
46
- out = subprocess.check_output(cmd)
47
- return json.loads(out)
48
 
49
- # --- Scrape page metadata (OpenGraph + title) ---
50
  def fetch_page_metadata(url):
51
  try:
52
- resp = session.get(url, timeout=5)
53
- resp.raise_for_status()
54
- soup = BeautifulSoup(resp.text, "html.parser")
55
- meta = {"url": url, "title": soup.title.string if soup.title else None}
56
  for tag in soup.find_all("meta"):
57
- prop = tag.get("property") or tag.get("name")
58
- if prop and prop.startswith(("og:", "twitter:")):
59
- meta[prop] = tag.get("content")
 
 
60
  return meta
61
  except Exception as e:
62
- return {"url": url, "error": str(e)}
63
-
64
- # --- Cache IA metadata to speed repeated fetches ---
65
- @lru_cache(maxsize=128)
66
- def fetch_ia_metadata(identifier):
67
- item = get_item(identifier)
68
- return {
69
- "metadata": item.metadata,
70
- "files": [
71
- {k: v for k, v in f.items() if k != "_checksum"}
72
- for f in item.files
73
- ]
74
- }
75
-
76
- # --- Search IA and return identifiers ---
77
- def fetch_identifiers(keywords):
78
- query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
79
- ia_query = f"mediatype:(movies) AND ({query})"
80
- results = list(search_items(ia_query, fields=["identifier"]))[:50]
81
- return [r["identifier"] for r in results]
82
-
83
- # --- List video files for a given item ---
84
- def list_files_for_identifier(identifier):
85
- data = fetch_ia_metadata(identifier)
86
- return [
87
- f["name"] for f in data["files"]
88
- if f.get("format", "").lower().startswith(("mpeg","mp4","avi","mov","webm","m4v"))
89
- ]
90
-
91
- # --- Gradio UI setup ---
92
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  gr.Markdown("# 📼 IA Drone‑Strike Explorer")
94
  with gr.Row():
95
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
96
- vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
97
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
98
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
99
- run_btn = gr.Button("🔍 Search Items", variant="primary")
100
-
101
- id_dropdown = gr.Dropdown(label="IA Item Identifiers", choices=[], interactive=True)
102
- file_dropdown = gr.Dropdown(label="Video Files", choices=[], interactive=True)
103
- video_player = gr.Video(label="Video Preview")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- with gr.Tabs():
106
- with gr.TabItem("IA Metadata"):
107
- ia_meta_json = gr.JSON(label="Raw IA Metadata")
108
- with gr.TabItem("FFprobe"):
109
- ffprobe_json = gr.JSON(label="FFprobe Metadata")
110
- with gr.TabItem("Origins Graph"):
111
- origins_graph = gr.HTML(label="Source Origins Graph")
112
- origins_meta = gr.JSON(label="Origins Metadata")
113
-
114
- # 1) Fetch identifiers for search keywords
115
  run_btn.click(
116
- lambda kws: gr.update(choices=fetch_identifiers(kws), value=None),
117
- inputs=[kw_input],
118
- outputs=[id_dropdown]
119
- )
120
-
121
- # 2) Populate video files dropdown when an identifier is selected
122
- id_dropdown.change(
123
- lambda ident: gr.update(choices=list_files_for_identifier(ident), value=None),
124
- inputs=[id_dropdown],
125
- outputs=[file_dropdown]
126
  )
127
-
128
- # 3) When a file is selected, fetch metadata, run FFprobe (if toggled),
129
- # and build the clickable origins graph with circular favicon nodes.
130
- def update_all(identifier, file_name, ff_on, api_key):
131
- if not identifier or not file_name:
132
- return None, {}, {}, "", []
133
-
134
- url = f"https://archive.org/download/{identifier}/{file_name}"
135
-
136
- # IA metadata (cached)
137
- data = fetch_ia_metadata(identifier)
138
- raw_ia = {"identifier": identifier, **data}
139
-
140
- # FFprobe metadata
141
- ff_md = {}
142
- if ff_on:
143
- try:
144
- ff_md = extract_ffprobe_metadata(url)
145
- except Exception as e:
146
- ff_md = {"error": str(e)}
147
-
148
- # Origins graph
149
- desc = data["metadata"].get("description", "") or ""
150
- urls = re.findall(r"https?://[^\s\"<]+", desc)
151
- origins_list = []
152
-
153
- net = Network(height="300px", width="100%", directed=True)
154
- net.set_options('{"edges":{"arrows":"to"}}')
155
- net.add_node(identifier, label=identifier, shape="ellipse")
156
-
157
- for u in urls[:10]:
158
- meta = fetch_page_metadata(u)
159
- origins_list.append(meta)
160
- dom = urlparse(u).netloc
161
- fav = f"https://www.google.com/s2/favicons?domain={dom}"
162
- net.add_node(
163
- u,
164
- label=dom,
165
- shape="circularImage",
166
- image=fav,
167
- title=json.dumps(meta, indent=2),
168
- href=u
169
- )
170
- net.add_edge(identifier, u)
171
-
172
- graph_html = net.generate_html()
173
-
174
- return url, raw_ia, ff_md, graph_html, origins_list
175
-
176
- file_dropdown.change(
177
- update_all,
178
- inputs=[id_dropdown, file_dropdown, ffprobe_toggle, vt_key_input],
179
- outputs=[video_player, ia_meta_json, ffprobe_json, origins_graph, origins_meta]
180
  )
181
 
182
  if __name__ == "__main__":
183
- demo.launch()
 
1
import json
import os
import re
import subprocess
import tempfile
import time
from urllib.parse import urljoin

import gradio as gr
import networkx as nx
import requests
from bs4 import BeautifulSoup
from internetarchive import search_items, get_item
from pyvis.network import Network
 
7
 
8
+ # --- SETTINGS ---
9
+ NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
10
+ THEME = "gradio/soft"
 
 
11
 
12
  # --- VirusTotal scan ---
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
+ resp = requests.post(
16
  "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
17
  )
18
  resp.raise_for_status()
19
  analysis_id = resp.json()["data"]["id"]
20
  while True:
21
  time.sleep(5)
22
+ status = requests.get(
23
  f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
24
  )
25
+ attr = status.json()["data"]["attributes"]
 
26
  if attr.get("status") == "completed":
27
+ return attr["stats"].get("malicious", 0) == 0
 
28
 
29
+ # --- FFprobe metadata ---
30
+ def extract_ffprobe_metadata(path):
31
+ output = subprocess.check_output([
32
+ "ffprobe", "-v", "error", "-print_format", "json", "-show_format", "-show_streams", path
33
+ ])
34
+ return json.loads(output)
 
 
 
35
 
36
+ # --- Fetch page metadata + favicon ---
37
  def fetch_page_metadata(url):
38
  try:
39
+ r = requests.get(url, timeout=5)
40
+ r.raise_for_status()
41
+ soup = BeautifulSoup(r.text, "html.parser")
42
+ meta = {"url": url, "title": soup.title.string if soup.title else url}
43
  for tag in soup.find_all("meta"):
44
+ key = tag.get("property") or tag.get("name")
45
+ if key and (key.startswith("og:") or key.startswith("twitter:")):
46
+ meta[key] = tag.get("content")
47
+ icon = soup.find("link", rel=lambda v: v and "icon" in v.lower())
48
+ meta["favicon"] = icon.get("href") if icon else ""
49
  return meta
50
  except Exception as e:
51
+ return {"url": url, "error": str(e), "favicon": ""}
52
+
53
+ # --- IA search + filter ---
54
+ def fetch_clean_videos(keywords, api_key, scan_enabled):
55
+ query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
56
+ items = list(search_items(f"mediatype:(movies) AND ({query})"))[:50]
57
+ results = []
58
+ for item_meta in items:
59
+ title = item_meta.get("title", "").lower()
60
+ if any(re.search(p, title) for p in NEWS_FILTER):
61
+ continue
62
+ item = get_item(item_meta['identifier'])
63
+ for f in item.files:
64
+ fmt = f.get('format', '').lower()
65
+ if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
66
+ url = f"https://archive.org/download/{item_meta['identifier']}/{f['name']}"
67
+ if scan_enabled and api_key:
68
+ try:
69
+ if not scan_url_vt(url, api_key):
70
+ continue
71
+ except:
72
+ continue
73
+ results.append(url)
74
+ return results
75
+
76
+ # --- Build graph HTML (using write_html) ---
77
+ def build_graph_html(chain):
78
+ net = Network(height="300px", width="100%", directed=True)
79
+ for hop in chain:
80
+ url = hop['url']
81
+ meta = hop['metadata']
82
+ title = meta.get('title', url)
83
+ favicon = meta.get('favicon', '')
84
+ if favicon:
85
+ net.add_node(url, label="", shape='image', image=favicon, title=title)
86
+ else:
87
+ net.add_node(url, label=title, title=title)
88
+ for i in range(len(chain)-1):
89
+ net.add_edge(chain[i]['url'], chain[i+1]['url'])
90
+ tmp_path = tempfile.mktemp(suffix='.html')
91
+ net.write_html(tmp_path, notebook=False, open_browser=False)
92
+ with open(tmp_path, 'r', encoding='utf8') as f:
93
+ html = f.read()
94
+ os.remove(tmp_path)
95
+ return html
96
+
97
+ # --- Gradio UI ---
98
+ with gr.Blocks(theme=THEME) as demo:
99
  gr.Markdown("# 📼 IA Drone‑Strike Explorer")
100
  with gr.Row():
101
+ kw_input = gr.Textbox(label="Search keywords (comma-separated)", value="drone strike, military uav")
102
+ vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
103
+ scan_toggle = gr.Checkbox(label="Enable VirusTotal scan", value=True)
104
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
105
+ run_btn = gr.Button("Search & Scan")
106
+
107
+ url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
108
+ video_player = gr.Video(label="Video Player")
109
+ ia_meta = gr.JSON(label=" Raw IA Metadata")
110
+ ff_meta = gr.JSON(label="► FFprobe Metadata")
111
+ graph_panel = gr.HTML(label="► Reupload Chain Graph")
112
+ origin_meta = gr.JSON(label="► Origin Node Metadata")
113
+
114
+ def search_and_populate(keywords, api_key, scan_on):
115
+ urls = fetch_clean_videos(keywords, api_key, scan_on)
116
+ return gr.update(choices=urls, value=urls[0] if urls else None)
117
+
118
+ def on_url_select(selected_url, ff_on, api_key):
119
+ if not selected_url:
120
+ return None, {}, {}, "", {}
121
+ # IA metadata + files
122
+ parts = selected_url.split('/')
123
+ ident = parts[4]
124
+ item = get_item(ident)
125
+ ia_data = {'metadata': item.metadata, 'files': item.files}
126
+ # FFprobe
127
+ ff_data = extract_ffprobe_metadata(selected_url) if ff_on else {}
128
+ # origin chain
129
+ desc = item.metadata.get('description', '')
130
+ urls = re.findall(r'https?://[^\s"<]+', desc)
131
+ chain = []
132
+ for u in urls:
133
+ chain.append({'url': u, 'metadata': fetch_page_metadata(u)})
134
+ # append IA as last hop
135
+ chain.append({'url': selected_url, 'metadata': {'title': item.metadata.get('title','')}})
136
+ graph_html = build_graph_html(chain)
137
+ origin_node = chain[0]['metadata'] if chain else {}
138
+ return selected_url, ia_data, ff_data, graph_html, origin_node
139
 
 
 
 
 
 
 
 
 
 
 
140
  run_btn.click(
141
+ search_and_populate,
142
+ inputs=[kw_input, vt_key, scan_toggle],
143
+ outputs=[url_dropdown]
 
 
 
 
 
 
 
144
  )
145
+ url_dropdown.change(
146
+ on_url_select,
147
+ inputs=[url_dropdown, ffprobe_toggle, vt_key],
148
+ outputs=[video_player, ia_meta, ff_meta, graph_panel, origin_meta]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  )
150
 
151
  if __name__ == "__main__":
152
+ demo.launch()