wuhp committed on
Commit
de6b885
·
verified ·
1 Parent(s): e90bbf9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -80
app.py CHANGED
@@ -8,8 +8,10 @@ import re
8
  import concurrent.futures
9
  from bs4 import BeautifulSoup
10
  from functools import lru_cache
 
 
11
 
12
- # Reuse HTTP session for performance
13
  session = requests.Session()
14
  session.headers.update({
15
  "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
@@ -66,50 +68,27 @@ def fetch_ia_metadata(identifier):
66
  return {
67
  "metadata": item.metadata,
68
  "files": [
69
- {
70
- "name": f.get("name"),
71
- "format": f.get("format"),
72
- "size": f.get("size"),
73
- "md5": f.get("md5"),
74
- **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
75
- }
76
  for f in item.files
77
  ]
78
  }
79
 
80
- # --- Search IA and optionally VT-scan in parallel ---
81
- def fetch_clean_videos(keywords, api_key, scan_enabled):
82
  query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
83
  ia_query = f"mediatype:(movies) AND ({query})"
84
- results = list(search_items(ia_query))[:50]
85
-
86
- candidate_urls = []
87
- for res in results:
88
- identifier = res["identifier"]
89
- # only list video files; full metadata fetched later
90
- for f in get_item(identifier).files:
91
- fmt = f.get("format", "").lower()
92
- if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
93
- candidate_urls.append(
94
- f"https://archive.org/download/{identifier}/{f['name']}"
95
- )
96
-
97
- if scan_enabled and api_key:
98
- clean_urls = []
99
- with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
100
- future_to_url = {executor.submit(scan_url_vt, url, api_key): url for url in candidate_urls}
101
- for fut in concurrent.futures.as_completed(future_to_url):
102
- url = future_to_url[fut]
103
- try:
104
- if fut.result():
105
- clean_urls.append(url)
106
- except Exception:
107
- pass
108
- return clean_urls
109
-
110
- return candidate_urls
111
-
112
- # --- Gradio UI ---
113
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
114
  gr.Markdown("# 📼 IA Drone‑Strike Explorer")
115
  with gr.Row():
@@ -117,65 +96,88 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
117
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
118
  scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
119
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
120
- run_btn = gr.Button("🔍 Search & Scan", variant="primary")
121
 
122
- url_dropdown = gr.Dropdown(label="Clean Video URLs", choices=[], interactive=True)
123
- video_player = gr.Video(label="Video Preview")
 
124
 
125
  with gr.Tabs():
126
  with gr.TabItem("IA Metadata"):
127
- ia_meta_json = gr.JSON(label="Raw IA Metadata")
128
  with gr.TabItem("FFprobe"):
129
  ffprobe_json = gr.JSON(label="FFprobe Metadata")
130
- with gr.TabItem("Origins"):
131
- origins_json = gr.JSON(label="Source Origins")
 
132
 
133
- def search_and_populate(keywords, api_key, scan_enabled):
134
- urls = fetch_clean_videos(keywords, api_key, scan_enabled)
135
- return gr.update(choices=urls, value=urls[0] if urls else None)
 
 
 
136
 
137
- def update_all(selected_url, ff_on, api_key):
138
- if not selected_url:
139
- return None, {}, {}, []
 
 
 
140
 
141
- parts = selected_url.split("/")
142
- identifier = parts[4] if len(parts) > 4 else None
 
 
 
143
 
144
- raw_ia = {"identifier": identifier}
145
- if identifier:
146
- try:
147
- data = fetch_ia_metadata(identifier)
148
- raw_ia.update(data)
149
- except Exception:
150
- raw_ia["error"] = "could not fetch IA metadata"
151
 
 
 
 
 
 
152
  ff_md = {}
153
  if ff_on:
154
  try:
155
- ff_md = extract_ffprobe_metadata(selected_url)
156
  except Exception as e:
157
  ff_md = {"error": str(e)}
158
 
159
- desc = raw_ia.get("metadata", {}).get("description", "")
160
- urls_found = re.findall(r'https?://[^\s"<]+' , desc)
161
- origins = []
162
- if urls_found:
163
- with concurrent.futures.ThreadPoolExecutor() as executor:
164
- for meta in executor.map(fetch_page_metadata, urls_found[:5]):
165
- origins.append(meta)
166
-
167
- return selected_url, raw_ia, ff_md, origins
168
-
169
- run_btn.click(
170
- search_and_populate,
171
- inputs=[kw_input, vt_key_input, scan_toggle],
172
- outputs=[url_dropdown]
173
- )
174
- url_dropdown.change(
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  update_all,
176
- inputs=[url_dropdown, ffprobe_toggle, vt_key_input],
177
- outputs=[video_player, ia_meta_json, ffprobe_json, origins_json]
178
  )
179
 
180
  if __name__ == "__main__":
181
- demo.launch()
 
8
  import concurrent.futures
9
  from bs4 import BeautifulSoup
10
  from functools import lru_cache
11
+ from pyvis.network import Network
12
+ from urllib.parse import urlparse
13
 
14
+ # Persistent HTTP session for performance
15
  session = requests.Session()
16
  session.headers.update({
17
  "User-Agent": "Mozilla/5.0 (compatible; IA-Drone-Explorer/1.0)"
 
68
  return {
69
  "metadata": item.metadata,
70
  "files": [
71
+ {k: v for k, v in f.items() if k != "_checksum"}
 
 
 
 
 
 
72
  for f in item.files
73
  ]
74
  }
75
 
76
+ # --- Search IA and return identifiers ---
77
+ def fetch_identifiers(keywords):
78
  query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
79
  ia_query = f"mediatype:(movies) AND ({query})"
80
+ results = list(search_items(ia_query, fields=["identifier"]))[:50]
81
+ return [r["identifier"] for r in results]
82
+
83
+ # --- List video files for a given item ---
84
+ def list_files_for_identifier(identifier):
85
+ data = fetch_ia_metadata(identifier)
86
+ return [
87
+ f["name"] for f in data["files"]
88
+ if f.get("format", "").lower().startswith(("mpeg","mp4","avi","mov","webm","m4v"))
89
+ ]
90
+
91
+ # --- Gradio UI setup ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
93
  gr.Markdown("# 📼 IA Drone‑Strike Explorer")
94
  with gr.Row():
 
96
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
97
  scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
98
  ffprobe_toggle = gr.Checkbox(label="Enable FFprobe metadata", value=False)
99
+ run_btn = gr.Button("🔍 Search Items", variant="primary")
100
 
101
+ id_dropdown = gr.Dropdown(label="IA Item Identifiers", choices=[], interactive=True)
102
+ file_dropdown = gr.Dropdown(label="Video Files", choices=[], interactive=True)
103
+ video_player = gr.Video(label="Video Preview")
104
 
105
  with gr.Tabs():
106
  with gr.TabItem("IA Metadata"):
107
+ ia_meta_json = gr.JSON(label="Raw IA Metadata")
108
  with gr.TabItem("FFprobe"):
109
  ffprobe_json = gr.JSON(label="FFprobe Metadata")
110
+ with gr.TabItem("Origins Graph"):
111
+ origins_graph = gr.HTML(label="Source Origins Graph")
112
+ origins_meta = gr.JSON(label="Origins Metadata")
113
 
114
+ # 1) Fetch identifiers for search keywords
115
+ run_btn.click(
116
+ lambda kws: gr.update(choices=fetch_identifiers(kws), value=None),
117
+ inputs=[kw_input],
118
+ outputs=[id_dropdown]
119
+ )
120
 
121
+ # 2) Populate video files dropdown when an identifier is selected
122
+ id_dropdown.change(
123
+ lambda ident: gr.update(choices=list_files_for_identifier(ident), value=None),
124
+ inputs=[id_dropdown],
125
+ outputs=[file_dropdown]
126
+ )
127
 
128
+ # 3) When a file is selected, fetch metadata, run FFprobe (if toggled),
129
+ # and build the clickable origins graph with circular favicon nodes.
130
+ def update_all(identifier, file_name, ff_on, api_key):
131
+ if not identifier or not file_name:
132
+ return None, {}, {}, "", []
133
 
134
+ url = f"https://archive.org/download/{identifier}/{file_name}"
 
 
 
 
 
 
135
 
136
+ # IA metadata (cached)
137
+ data = fetch_ia_metadata(identifier)
138
+ raw_ia = {"identifier": identifier, **data}
139
+
140
+ # FFprobe metadata
141
  ff_md = {}
142
  if ff_on:
143
  try:
144
+ ff_md = extract_ffprobe_metadata(url)
145
  except Exception as e:
146
  ff_md = {"error": str(e)}
147
 
148
+ # Origins graph
149
+ desc = data["metadata"].get("description", "") or ""
150
+ urls = re.findall(r"https?://[^\s\"<]+", desc)
151
+ origins_list = []
152
+
153
+ net = Network(height="300px", width="100%", directed=True)
154
+ net.set_options('{"edges":{"arrows":"to"}}')
155
+ net.add_node(identifier, label=identifier, shape="ellipse")
156
+
157
+ for u in urls[:10]:
158
+ meta = fetch_page_metadata(u)
159
+ origins_list.append(meta)
160
+ dom = urlparse(u).netloc
161
+ fav = f"https://www.google.com/s2/favicons?domain={dom}"
162
+ net.add_node(
163
+ u,
164
+ label=dom,
165
+ shape="circularImage",
166
+ image=fav,
167
+ title=json.dumps(meta, indent=2),
168
+ href=u
169
+ )
170
+ net.add_edge(identifier, u)
171
+
172
+ graph_html = net.generate_html()
173
+
174
+ return url, raw_ia, ff_md, graph_html, origins_list
175
+
176
+ file_dropdown.change(
177
  update_all,
178
+ inputs=[id_dropdown, file_dropdown, ffprobe_toggle, vt_key_input],
179
+ outputs=[video_player, ia_meta_json, ffprobe_json, origins_graph, origins_meta]
180
  )
181
 
182
  if __name__ == "__main__":
183
+ demo.launch()