wuhp committed on
Commit
34285ab
·
verified ·
1 Parent(s): 07fb168

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -129
app.py CHANGED
@@ -1,163 +1,146 @@
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
- import requests
4
- import time
5
- import subprocess
6
- import json
7
- import re
8
  import networkx as nx
9
  from pyvis.network import Network
10
- from bs4 import BeautifulSoup
11
 
12
- # --- VirusTotal helper ---
 
 
 
 
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
- resp = requests.post(
16
- "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
17
- )
18
  resp.raise_for_status()
19
  analysis_id = resp.json()["data"]["id"]
20
  while True:
21
  time.sleep(5)
22
- status_resp = requests.get(
23
- f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
24
- )
25
- status_resp.raise_for_status()
26
- attr = status_resp.json()["data"]["attributes"]
27
- if attr.get("status") == "completed":
28
- return attr.get("stats", {}).get("malicious", 0) == 0
29
 
30
- # --- FFprobe ---
31
- def extract_ffprobe_metadata(url_or_path):
32
- cmd = [
33
- "ffprobe", "-v", "error", "-print_format", "json",
34
- "-show_format", "-show_streams",
35
- url_or_path
36
- ]
37
- out = subprocess.check_output(cmd)
38
  return json.loads(out)
39
 
40
- # --- Fetch page metadata + favicon ---
41
  def fetch_page_metadata(url):
42
  try:
43
- resp = requests.get(url, timeout=5)
44
- resp.raise_for_status()
45
- soup = BeautifulSoup(resp.text, "html.parser")
46
- meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- for tag in soup.find_all("meta"):
48
- prop = tag.get("property") or tag.get("name")
49
- if prop and prop.startswith(("og:", "twitter:")):
50
- meta[prop] = tag.get("content")
51
- # favicon
52
- icon = soup.find("link", rel=lambda x: x and "icon" in x)
53
- meta["favicon"] = requests.compat.urljoin(url, icon.get("href")) if icon else None
54
  return meta
55
  except Exception as e:
56
- return {"url": url, "error": str(e)}
57
 
58
- # --- IA search & filter raw footage ---
59
- NEWS_STATIONS = ["cnn", "fox", "bbc", "nbc", "al jazeera", "rt "]
60
- def fetch_raw_footage_urls(keywords, api_key, scan_enabled):
61
- query = " OR ".join([kw.strip().replace(' ', '+') for kw in keywords.split(",")])
62
- ia_query = f"mediatype:(movies) AND ({query})"
63
- results = list(search_items(ia_query))[:50]
64
- urls = []
65
- for res in results:
66
- item = get_item(res['identifier'])
67
- title = item.metadata.get("title", "").lower()
68
- if any(ns in title for ns in NEWS_STATIONS):
69
  continue
70
- for f in item.files:
71
- fmt = f.get('format','').lower()
72
- if fmt.startswith(('mpeg','mp4','avi','mov','webm','m4v')):
73
- url = f"https://archive.org/download/{res['identifier']}/{f['name']}"
 
74
  if scan_enabled and api_key:
75
  try:
76
- if not scan_url_vt(url, api_key):
77
- continue
78
  except:
79
  continue
80
- urls.append(url)
81
- return urls
 
 
 
82
 
83
- # --- Recursive origin tracing ---
84
- def trace_origins(description, depth=0, max_depth=3, visited=None):
85
- if visited is None: visited = set()
86
- nodes = []
87
- links = []
88
- urls = re.findall(r'https?://[^\s"<]+', description)
89
- for url in urls:
90
- if url in visited: continue
91
- visited.add(url)
92
- meta = fetch_page_metadata(url)
93
- nodes.append((url, meta))
94
- if depth < max_depth and 'description' in meta:
95
- sub_nodes, sub_links = trace_origins(meta.get('description',''), depth+1, max_depth, visited)
96
- links.extend(sub_links)
97
- nodes.extend(sub_nodes)
98
- # link from origin to IA later
99
- links.append((url, 'internet_archive'))
100
- return nodes, links
101
 
102
- # --- Build graph HTML via pyvis ---
103
- def build_graph(nodes, links):
104
- net = Network(height="400px", width="100%", directed=True)
105
- for url, meta in nodes + [('internet_archive', {'title':'Internet Archive'})]:
106
- label = meta.get('title') or url
107
- favicon = meta.get('favicon')
108
- net.add_node(url, label=label, title=json.dumps(meta), shape='image' if favicon else 'dot',
109
- image=favicon if favicon else None)
110
- for src, dst in links:
111
- net.add_edge(src, dst)
112
- net.force_atlas_2based()
113
- return net.generate_html()
114
-
115
- # --- Gradio UI ---
116
- with gr.Blocks() as demo:
117
- gr.Markdown("# IA Drone‑Strike Chain Explorer")
118
  with gr.Row():
119
- kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
120
- vt_key = gr.Textbox(label="VirusTotal API Key", type="password")
121
- scan_toggle = gr.Checkbox(label="Enable VT scan", value=True)
122
- ffprobe_toggle = gr.Checkbox(label="Enable FFprobe", value=False)
123
- run_btn = gr.Button("Search & Scan")
124
 
125
- url_dd = gr.Dropdown(label="Raw Footage URLs", choices=[], interactive=True)
126
- video = gr.Video(label="Player")
127
- ia_meta = gr.JSON(label="IA Metadata")
128
- ff_meta = gr.JSON(label="FFprobe Metadata")
129
- graph_html = gr.HTML(label="Reupload Chain Graph")
130
- origin_meta = gr.JSON(label="Clicked Origin Metadata")
131
 
132
- def search_populate(kw, api_key, scan_on):
133
- urls = fetch_raw_footage_urls(kw, api_key, scan_on)
134
  return gr.update(choices=urls, value=urls[0] if urls else None)
135
 
136
- def on_select(url, ff_on, api_key):
137
- if not url: return None, {}, {}, "", {}
138
- # IA meta
139
- parts = url.split('/')
140
- ident = parts[4]
141
  item = get_item(ident)
142
- raw = {'metadata': item.metadata, 'files': [{k:v for k,v in f.items()} for f in item.files]}
143
- # ffprobe
144
- ff = extract_ffprobe_metadata(url) if ff_on else {}
145
- # origin trace
146
- nodes, links = trace_origins(item.metadata.get('description',''))
147
- nodes.append(('internet_archive', {'title':'Internet Archive'}))
148
- links = [(n[0],'internet_archive') for n in nodes if n[0] != 'internet_archive']
149
- html = build_graph(nodes, links)
150
- return url, raw, ff, html, {}
151
-
152
- def on_click_node(node_id):
153
- # find metadata in nodes list
154
- # simplistic: refetch page
155
- meta = fetch_page_metadata(node_id) if node_id != 'internet_archive' else {'title':'Internet Archive'}
156
- return meta
 
 
 
 
 
 
 
 
157
 
158
- run_btn.click(search_populate, [kw_input, vt_key, scan_toggle], [url_dd])
159
- url_dd.change(on_select, [url_dd, ffprobe_toggle, vt_key], [video, ia_meta, ff_meta, graph_html, origin_meta])
160
- graph_html.click(on_click_node, None, origin_meta)
161
 
162
- if __name__ == '__main__':
163
- demo.launch()
 
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
+ import requests, time, subprocess, json, re, tempfile, os
4
+ from bs4 import BeautifulSoup
 
 
 
5
  import networkx as nx
6
  from pyvis.network import Network
 
7
 
8
+ # --- SETTINGS ---
9
+ NEWS_FILTER = [r"\bcnn\b", r"\bfox\b", r"\bbbc\b", r"\bmsnbc\b", r"\breuters\b"]
10
+ THEME = "gradio/soft" # bring back the default soft theme
11
+
12
+ # --- VirusTotal scan (unchanged) ---
13
  def scan_url_vt(url, api_key):
14
  headers = {"x-apikey": api_key}
15
+ resp = requests.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
 
 
16
  resp.raise_for_status()
17
  analysis_id = resp.json()["data"]["id"]
18
  while True:
19
  time.sleep(5)
20
+ st = requests.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
21
+ attr = st.json()["data"]["attributes"]
22
+ if attr.get("status")=="completed":
23
+ return attr["stats"].get("malicious",0)==0
 
 
 
24
 
25
+ # --- FFprobe metadata (unchanged) ---
26
+ def extract_ffprobe_metadata(path):
27
+ out = subprocess.check_output([
28
+ "ffprobe","-v","error","-print_format","json","-show_format","-show_streams", path
29
+ ])
 
 
 
30
  return json.loads(out)
31
 
32
+ # --- Fetch page metadata + favicon URL ---
33
  def fetch_page_metadata(url):
34
  try:
35
+ r = requests.get(url, timeout=5); r.raise_for_status()
36
+ bs = BeautifulSoup(r.text,"html.parser")
37
+ meta = {"url":url, "title": bs.title.string if bs.title else ""}
38
+ # og: and twitter:
39
+ for m in bs.find_all("meta"):
40
+ p = m.get("property") or m.get("name")
41
+ if p and p.startswith(("og:","twitter:")):
42
+ meta[p] = m.get("content")
43
+ # find favicon
44
+ icon = bs.find("link", rel=lambda v:v and "icon" in v.lower())
45
+ meta["favicon"] = icon["href"] if icon else ""
46
  return meta
47
  except Exception as e:
48
+ return {"url":url, "error":str(e), "favicon":""}
49
 
50
+ # --- Core IA search + filter ---
51
+ def fetch_clean_videos(keywords, api_key, scan_enabled):
52
+ # build query
53
+ q = " OR ".join(kw.strip().replace(" ","+") for kw in keywords.split(","))
54
+ items = list(search_items(f"mediatype:(movies) AND ({q})"))[:50]
55
+ clean = []
56
+ for it in items:
57
+ title = it.get("title","").lower()
58
+ # filter out news
59
+ if any(re.search(p, title) for p in NEWS_FILTER):
 
60
  continue
61
+ # find video files
62
+ for f in get_item(it["identifier"]).files:
63
+ fmt = f.get("format","").lower()
64
+ if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
65
+ url = f"https://archive.org/download/{it['identifier']}/{f['name']}"
66
  if scan_enabled and api_key:
67
  try:
68
+ ok = scan_url_vt(url, api_key)
 
69
  except:
70
  continue
71
+ else:
72
+ ok = True
73
+ if ok:
74
+ clean.append(url)
75
+ return clean
76
 
77
+ # --- Build a PyVis graph and return its HTML path ---
78
+ def build_graph(chain):
79
+ G = Network(height="300px", width="100%", directed=True)
80
+ for node in chain:
81
+ label = node.get("metadata",{}).get("title","origin")
82
+ icon = node.get("metadata",{}).get("favicon","")
83
+ G.add_node(node["url"], label="", shape="image", image=icon or None, title=label)
84
+ # link them in order
85
+ for i in range(len(chain)-1):
86
+ G.add_edge(chain[i]["url"], chain[i+1]["url"])
87
+ tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
88
+ G.show(tmp.name)
89
+ return tmp.name
 
 
 
 
 
90
 
91
+ # --- UI ---
92
+ with gr.Blocks(theme=THEME) as demo:
93
+ gr.Markdown("## 📼 Raw-Footage Chain Explorer")
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  with gr.Row():
95
+ kw = gr.Textbox("Keywords (comma-sep)", value="drone strike, military uav")
96
+ vt = gr.Textbox("VT API Key", type="password")
97
+ scan_toggle = gr.Checkbox("Enable VT scan", True)
98
+ ff_toggle = gr.Checkbox("Enable FFprobe", False)
99
+ run_btn = gr.Button("Search & Scan")
100
 
101
+ url_dd = gr.Dropdown("Clean Video URLs", choices=[])
102
+ vid_player = gr.Video()
103
+ ia_json = gr.JSON()
104
+ ff_json = gr.JSON()
105
+ graph_html = gr.HTML()
106
+ origin_meta = gr.JSON()
107
 
108
+ def search_and_populate(k, api, s):
109
+ urls = fetch_clean_videos(k, api, s)
110
  return gr.update(choices=urls, value=urls[0] if urls else None)
111
 
112
+ def update_all(sel, ff_on, api_key):
113
+ if not sel:
114
+ return None, {}, {}, "", {}
115
+ # 1) IA metadata + files
116
+ parts = sel.split("/"); ident = parts[4]
117
  item = get_item(ident)
118
+ raw = {"metadata":item.metadata, "files": [
119
+ {"name":f["name"], "format":f["format"], "size":f.get("size")}
120
+ for f in item.files
121
+ ]}
122
+ # 2) FFprobe
123
+ ffm = extract_ffprobe_metadata(sel) if ff_on else {}
124
+ # 3) trace origins
125
+ desc = raw["metadata"].get("description","")
126
+ urls = re.findall(r"https?://[^\s\"']+", desc)
127
+ chain = []
128
+ for u in urls:
129
+ m = fetch_page_metadata(u)
130
+ chain.append({"url":u, "metadata":m})
131
+ # finally add IA itself as last hop
132
+ chain.append({"url":sel, "metadata": {"title": raw["metadata"].get("title"), "favicon": ""}})
133
+ # 4) graph
134
+ gfile = build_graph(chain)
135
+ # 5) default show first origin metadata
136
+ om = chain[0]["metadata"] if chain else {}
137
+ # embed graph HTML
138
+ graph_data = open(gfile,"r",encoding="utf8").read()
139
+ os.unlink(gfile)
140
+ return sel, raw, ffm, graph_data, om
141
 
142
+ run_btn.click(search_and_populate, [kw, vt, scan_toggle], [url_dd])
143
+ url_dd.change(update_all, [url_dd, ff_toggle, vt],
144
+ [vid_player, ia_json, ff_json, graph_html, origin_meta])
145
 
146
+ demo.launch()