wuhp committed on
Commit
4e52cce
·
verified ·
1 Parent(s): d4356c2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -29
app.py CHANGED
@@ -11,13 +11,14 @@ from functools import lru_cache
11
  import networkx as nx
12
  from pyvis.network import Network
13
  from urllib.parse import urlparse
14
- # --- Shared HTTP session for speed & headers ---
 
15
  session = requests.Session()
16
  session.headers.update({
17
  "User-Agent": "Mozilla/5.0 (compatible; IA-Video-Meta-Explorer/1.0)"
18
  })
19
 
20
- # --- VirusTotal helper (optional) ---
21
  def scan_url_vt(url, api_key):
22
  headers = {"x-apikey": api_key}
23
  resp = session.post(
@@ -28,13 +29,15 @@ def scan_url_vt(url, api_key):
28
  # Poll until complete
29
  while True:
30
  time.sleep(5)
31
- st = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
 
 
32
  st.raise_for_status()
33
  attr = st.json()["data"]["attributes"]
34
  if attr.get("status") == "completed":
35
  return attr.get("stats", {}).get("malicious", 0) == 0
36
 
37
- # --- FFprobe metadata extraction ---
38
  def extract_ffprobe_metadata(url_or_path):
39
  cmd = [
40
  "ffprobe", "-v", "error", "-print_format", "json",
@@ -44,7 +47,7 @@ def extract_ffprobe_metadata(url_or_path):
44
  out = subprocess.check_output(cmd)
45
  return json.loads(out)
46
 
47
- # --- Caching page metadata ---
48
  @lru_cache(maxsize=256)
49
  def fetch_page_metadata(url):
50
  try:
@@ -52,7 +55,7 @@ def fetch_page_metadata(url):
52
  resp.raise_for_status()
53
  soup = BeautifulSoup(resp.text, "html.parser")
54
  meta = {"url": url, "title": soup.title.string if soup.title else None}
55
- # OpenGraph & twitter
56
  for tag in soup.find_all("meta"):
57
  prop = tag.get("property") or tag.get("name")
58
  if prop and prop.startswith(("og:", "twitter:")):
@@ -61,28 +64,28 @@ def fetch_page_metadata(url):
61
  except Exception as e:
62
  return {"url": url, "error": str(e)}
63
 
64
- # --- Fetch favicon for clickable graph nodes ---
65
  @lru_cache(maxsize=256)
66
  def fetch_favicon(url):
67
  try:
68
- domain = urlparse(url).scheme + "://" + urlparse(url).netloc
69
- ico_url = domain + "/favicon.ico"
 
70
  resp = session.get(ico_url, timeout=3)
71
  resp.raise_for_status()
72
  return ico_url
73
- except:
74
  return None
75
 
76
- # --- Trace origins recursively up to a max depth ---
77
- def trace_origins(description, max_depth=2, executor=None):
78
  graph = nx.DiGraph()
79
- def _recurse(url, depth):
80
  if depth > max_depth or url in graph:
81
  return
82
  info = fetch_page_metadata(url)
83
  favicon = fetch_favicon(url)
84
  graph.add_node(url, title=info.get("title"), favicon=favicon)
85
- # find OG:url or linked URLs on page as potential origins
86
  links = []
87
  if "og:url" in info:
88
  links.append(info["og:url"])
@@ -96,34 +99,36 @@ def trace_origins(description, max_depth=2, executor=None):
96
  pass
97
  for link in set(links):
98
  graph.add_edge(link, url)
99
- _recurse(link, depth + 1)
100
- # initial URLs from IA description
101
- seeds = re.findall(r'https?://[^\s"<]+', description)
102
  for seed in seeds:
103
- _recurse(seed, 1)
104
  return graph
105
 
106
- # --- Build PyVis network HTML ---
107
  def build_graph_html(graph):
108
  net = Network(height="500px", width="100%", directed=True)
109
  for url, data in graph.nodes(data=True):
110
- net.add_node(url, label=data.get("title") or url, title=url, shape="image" if data.get("favicon") else "ellipse", image=data.get("favicon"))
 
 
 
111
  for src, dst in graph.edges():
112
  net.add_edge(src, dst)
113
  return net.generate_html()
114
 
115
- # --- Fetch IA items (movies) ---
116
  def fetch_clean_videos(keywords, api_key, scan_enabled):
117
- query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
118
  ia_query = f"mediatype:(movies) AND ({query})"
119
  results = list(search_items(ia_query))[:20]
120
  clean_urls = []
121
  for res in results:
122
- identifier = res['identifier']
123
  item = get_item(identifier)
124
  for f in item.files:
125
- fmt = f.get('format', '').lower()
126
- if fmt.startswith(('mpeg','mp4','avi','mov','webm','m4v')):
127
  url = f"https://archive.org/download/{identifier}/{f['name']}"
128
  if scan_enabled and api_key:
129
  try:
@@ -134,9 +139,9 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
134
  clean_urls.append(url)
135
  return clean_urls
136
 
137
- # --- Gradio UI ---
138
  with gr.Blocks() as demo:
139
- gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origin Tracing")
140
  with gr.Row():
141
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
142
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -161,7 +166,6 @@ with gr.Blocks() as demo:
161
  return None, {}, {}, ""
162
  identifier = selected_url.split("/")[4]
163
  # 1) IA metadata
164
- raw_ia = {}
165
  try:
166
  item = get_item(identifier)
167
  raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
@@ -176,7 +180,7 @@ with gr.Blocks() as demo:
176
  ff_md = {"error": str(e)}
177
  # 3) Origins
178
  desc = raw_ia.get("metadata", {}).get("description", "")
179
- graph = trace_origins(desc, max_depth=2, executor=executor)
180
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
181
  return selected_url, raw_ia, ff_md, graph_html
182
 
 
11
  import networkx as nx
12
  from pyvis.network import Network
13
  from urllib.parse import urlparse
14
+
15
+ # --- Shared HTTP session for speed & headers ---
16
  session = requests.Session()
17
  session.headers.update({
18
  "User-Agent": "Mozilla/5.0 (compatible; IA-Video-Meta-Explorer/1.0)"
19
  })
20
 
21
+ # --- VirusTotal helper (optional) ---
22
  def scan_url_vt(url, api_key):
23
  headers = {"x-apikey": api_key}
24
  resp = session.post(
 
29
  # Poll until complete
30
  while True:
31
  time.sleep(5)
32
+ st = session.get(
33
+ f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
34
+ )
35
  st.raise_for_status()
36
  attr = st.json()["data"]["attributes"]
37
  if attr.get("status") == "completed":
38
  return attr.get("stats", {}).get("malicious", 0) == 0
39
 
40
+ # --- FFprobe metadata extraction ---
41
  def extract_ffprobe_metadata(url_or_path):
42
  cmd = [
43
  "ffprobe", "-v", "error", "-print_format", "json",
 
47
  out = subprocess.check_output(cmd)
48
  return json.loads(out)
49
 
50
+ # --- Caching page metadata ---
51
  @lru_cache(maxsize=256)
52
  def fetch_page_metadata(url):
53
  try:
 
55
  resp.raise_for_status()
56
  soup = BeautifulSoup(resp.text, "html.parser")
57
  meta = {"url": url, "title": soup.title.string if soup.title else None}
58
+ # OpenGraph & twitter tags
59
  for tag in soup.find_all("meta"):
60
  prop = tag.get("property") or tag.get("name")
61
  if prop and prop.startswith(("og:", "twitter:")):
 
64
  except Exception as e:
65
  return {"url": url, "error": str(e)}
66
 
67
+ # --- Fetch favicon for clickable graph nodes ---
68
  @lru_cache(maxsize=256)
69
  def fetch_favicon(url):
70
  try:
71
+ parsed = urlparse(url)
72
+ domain = f"{parsed.scheme}://{parsed.netloc}"
73
+ ico_url = f"{domain}/favicon.ico"
74
  resp = session.get(ico_url, timeout=3)
75
  resp.raise_for_status()
76
  return ico_url
77
+ except Exception:
78
  return None
79
 
80
+ # --- Trace origins recursively up to max depth ---
81
+ def trace_origins(description, max_depth=2):
82
  graph = nx.DiGraph()
83
+ def recurse(url, depth):
84
  if depth > max_depth or url in graph:
85
  return
86
  info = fetch_page_metadata(url)
87
  favicon = fetch_favicon(url)
88
  graph.add_node(url, title=info.get("title"), favicon=favicon)
 
89
  links = []
90
  if "og:url" in info:
91
  links.append(info["og:url"])
 
99
  pass
100
  for link in set(links):
101
  graph.add_edge(link, url)
102
+ recurse(link, depth + 1)
103
+ seeds = re.findall(r"https?://[^\s\"<]+", description)
 
104
  for seed in seeds:
105
+ recurse(seed, 1)
106
  return graph
107
 
108
+ # --- Build PyVis network HTML ---
109
  def build_graph_html(graph):
110
  net = Network(height="500px", width="100%", directed=True)
111
  for url, data in graph.nodes(data=True):
112
+ if data.get("favicon"):
113
+ net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
114
+ else:
115
+ net.add_node(url, label=data.get("title") or url, title=url)
116
  for src, dst in graph.edges():
117
  net.add_edge(src, dst)
118
  return net.generate_html()
119
 
120
+ # --- Fetch IA items (movies) ---
121
  def fetch_clean_videos(keywords, api_key, scan_enabled):
122
+ query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
123
  ia_query = f"mediatype:(movies) AND ({query})"
124
  results = list(search_items(ia_query))[:20]
125
  clean_urls = []
126
  for res in results:
127
+ identifier = res["identifier"]
128
  item = get_item(identifier)
129
  for f in item.files:
130
+ fmt = f.get("format", "").lower()
131
+ if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
132
  url = f"https://archive.org/download/{identifier}/{f['name']}"
133
  if scan_enabled and api_key:
134
  try:
 
139
  clean_urls.append(url)
140
  return clean_urls
141
 
142
+ # --- Gradio UI ---
143
  with gr.Blocks() as demo:
144
+ gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origin Tracing")
145
  with gr.Row():
146
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
147
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
 
166
  return None, {}, {}, ""
167
  identifier = selected_url.split("/")[4]
168
  # 1) IA metadata
 
169
  try:
170
  item = get_item(identifier)
171
  raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
 
180
  ff_md = {"error": str(e)}
181
  # 3) Origins
182
  desc = raw_ia.get("metadata", {}).get("description", "")
183
+ graph = trace_origins(desc, max_depth=2)
184
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
185
  return selected_url, raw_ia, ff_md, graph_html
186