wuhp committed on
Commit
12dbf40
·
verified ·
1 Parent(s): 9ab61fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -14
app.py CHANGED
@@ -46,8 +46,9 @@ def fetch_page_metadata(url):
46
  resp = session.get(url, timeout=5)
47
  resp.raise_for_status()
48
  soup = BeautifulSoup(resp.text, "html.parser")
49
- meta = {"url": url, "title": soup.title.string.strip() if soup.title and soup.title.string else url}
50
- for tag in soup.find_all("meta"): # OpenGraph & Twitter tags
 
51
  prop = tag.get("property") or tag.get("name")
52
  if prop and prop.startswith(("og:", "twitter:")):
53
  meta[prop] = tag.get("content")
@@ -55,7 +56,7 @@ def fetch_page_metadata(url):
55
  except Exception as e:
56
  return {"url": url, "error": str(e)}
57
 
58
- # --- Fetch favicon for graph nodes ---
59
  @lru_cache(maxsize=256)
60
  def fetch_favicon(url):
61
  try:
@@ -65,22 +66,21 @@ def fetch_favicon(url):
65
  resp = session.get(ico_url, timeout=3)
66
  resp.raise_for_status()
67
  return ico_url
68
- except Exception:
69
  return None
70
 
71
  # --- Recursive origin tracing ---
72
  def trace_origins(description_html, max_depth=2):
73
  graph = nx.DiGraph()
74
- # clean HTML to plain text for better URL extraction
75
- desc_text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
76
- seeds = re.findall(r"https?://[^\s]+", desc_text)
77
  def recurse(url, depth):
78
  if depth > max_depth or url in graph:
79
  return
80
  info = fetch_page_metadata(url)
81
  favicon = fetch_favicon(url)
82
  graph.add_node(url, title=info.get("title"), favicon=favicon)
83
- # find further links via OG:url or anchor tags
84
  next_links = []
85
  if info.get("og:url"):
86
  next_links.append(info["og:url"])
@@ -101,7 +101,7 @@ def trace_origins(description_html, max_depth=2):
101
  recurse(seed, 1)
102
  return graph
103
 
104
- # --- Generate PyVis graph HTML ---
105
  def build_graph_html(graph):
106
  net = Network(height="500px", width="100%", directed=True, notebook=False)
107
  for url, data in graph.nodes(data=True):
@@ -113,7 +113,7 @@ def build_graph_html(graph):
113
  net.add_edge(src, dst)
114
  return net.generate_html()
115
 
116
- # --- Search and filter IA videos ---
117
  def fetch_clean_videos(keywords, api_key, scan_enabled):
118
  terms = [kw.strip() for kw in keywords.split(",")]
119
  query = " OR ".join(term.replace(" ", "+") for term in terms)
@@ -131,7 +131,7 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
131
  for f in item.files:
132
  fmt = f.get("format", "").lower()
133
  if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
134
- video_url = f"https://archive.org/download/{identifier}/{f['name']}"
135
  if scan_enabled and api_key:
136
  try:
137
  if not scan_url_vt(video_url, api_key):
@@ -143,7 +143,7 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
143
 
144
  # --- Gradio UI ---
145
  with gr.Blocks() as demo:
146
- gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origin Tracing")
147
  with gr.Row():
148
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
149
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -166,7 +166,6 @@ with gr.Blocks() as demo:
166
  def update_all(selected_url, ff_on, api_key):
167
  if not selected_url:
168
  return None, {}, {}, "<p>No data.</p>"
169
- # extract identifier robustly
170
  parsed = urlparse(selected_url)
171
  parts = parsed.path.strip("/").split("/")
172
  identifier = parts[1] if len(parts) > 1 else None
@@ -175,7 +174,10 @@ with gr.Blocks() as demo:
175
  item = get_item(identifier)
176
  raw_ia = {
177
  "metadata": item.metadata,
178
- "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]
 
 
 
179
  }
180
  except Exception as e:
181
  raw_ia = {"error": f"Could not fetch IA metadata: {e}"}
 
46
  resp = session.get(url, timeout=5)
47
  resp.raise_for_status()
48
  soup = BeautifulSoup(resp.text, "html.parser")
49
+ title = soup.title.string.strip() if soup.title and soup.title.string else url
50
+ meta = {"url": url, "title": title}
51
+ for tag in soup.find_all("meta"): # OpenGraph & Twitter
52
  prop = tag.get("property") or tag.get("name")
53
  if prop and prop.startswith(("og:", "twitter:")):
54
  meta[prop] = tag.get("content")
 
56
  except Exception as e:
57
  return {"url": url, "error": str(e)}
58
 
59
+ # --- Fetch favicon ---
60
  @lru_cache(maxsize=256)
61
  def fetch_favicon(url):
62
  try:
 
66
  resp = session.get(ico_url, timeout=3)
67
  resp.raise_for_status()
68
  return ico_url
69
+ except:
70
  return None
71
 
72
  # --- Recursive origin tracing ---
73
  def trace_origins(description_html, max_depth=2):
74
  graph = nx.DiGraph()
75
+ text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
76
+ seeds = re.findall(r"https?://[^\s]+", text)
 
77
  def recurse(url, depth):
78
  if depth > max_depth or url in graph:
79
  return
80
  info = fetch_page_metadata(url)
81
  favicon = fetch_favicon(url)
82
  graph.add_node(url, title=info.get("title"), favicon=favicon)
83
+ # find next links
84
  next_links = []
85
  if info.get("og:url"):
86
  next_links.append(info["og:url"])
 
101
  recurse(seed, 1)
102
  return graph
103
 
104
+ # --- Build PyVis graph HTML ---
105
  def build_graph_html(graph):
106
  net = Network(height="500px", width="100%", directed=True, notebook=False)
107
  for url, data in graph.nodes(data=True):
 
113
  net.add_edge(src, dst)
114
  return net.generate_html()
115
 
116
+ # --- Search IA videos ---
117
  def fetch_clean_videos(keywords, api_key, scan_enabled):
118
  terms = [kw.strip() for kw in keywords.split(",")]
119
  query = " OR ".join(term.replace(" ", "+") for term in terms)
 
131
  for f in item.files:
132
  fmt = f.get("format", "").lower()
133
  if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
134
+ video_url = f"https://archive.org/download/{identifier}/{f.get('name')}"
135
  if scan_enabled and api_key:
136
  try:
137
  if not scan_url_vt(video_url, api_key):
 
143
 
144
  # --- Gradio UI ---
145
  with gr.Blocks() as demo:
146
+ gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origins")
147
  with gr.Row():
148
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
149
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
 
166
  def update_all(selected_url, ff_on, api_key):
167
  if not selected_url:
168
  return None, {}, {}, "<p>No data.</p>"
 
169
  parsed = urlparse(selected_url)
170
  parts = parsed.path.strip("/").split("/")
171
  identifier = parts[1] if len(parts) > 1 else None
 
174
  item = get_item(identifier)
175
  raw_ia = {
176
  "metadata": item.metadata,
177
+ "files": [
178
+ dict(name=f.get("name"), format=f.get("format"), size=f.get("size"))
179
+ for f in item.files
180
+ ]
181
  }
182
  except Exception as e:
183
  raw_ia = {"error": f"Could not fetch IA metadata: {e}"}