wuhp committed on
Commit
9ab61fb
·
verified ·
1 Parent(s): 4e52cce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -47
app.py CHANGED
@@ -6,7 +6,7 @@ import subprocess
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
- from concurrent.futures import ThreadPoolExecutor, as_completed
10
  from functools import lru_cache
11
  import networkx as nx
12
  from pyvis.network import Network
@@ -21,17 +21,13 @@ session.headers.update({
21
  # --- VirusTotal helper (optional) ---
22
  def scan_url_vt(url, api_key):
23
  headers = {"x-apikey": api_key}
24
- resp = session.post(
25
- "https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url}
26
- )
27
  resp.raise_for_status()
28
  analysis_id = resp.json()["data"]["id"]
29
  # Poll until complete
30
  while True:
31
  time.sleep(5)
32
- st = session.get(
33
- f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
34
- )
35
  st.raise_for_status()
36
  attr = st.json()["data"]["attributes"]
37
  if attr.get("status") == "completed":
@@ -39,11 +35,7 @@ def scan_url_vt(url, api_key):
39
 
40
  # --- FFprobe metadata extraction ---
41
  def extract_ffprobe_metadata(url_or_path):
42
- cmd = [
43
- "ffprobe", "-v", "error", "-print_format", "json",
44
- "-show_format", "-show_streams",
45
- url_or_path
46
- ]
47
  out = subprocess.check_output(cmd)
48
  return json.loads(out)
49
 
@@ -54,9 +46,8 @@ def fetch_page_metadata(url):
54
  resp = session.get(url, timeout=5)
55
  resp.raise_for_status()
56
  soup = BeautifulSoup(resp.text, "html.parser")
57
- meta = {"url": url, "title": soup.title.string if soup.title else None}
58
- # OpenGraph & twitter tags
59
- for tag in soup.find_all("meta"):
60
  prop = tag.get("property") or tag.get("name")
61
  if prop and prop.startswith(("og:", "twitter:")):
62
  meta[prop] = tag.get("content")
@@ -64,7 +55,7 @@ def fetch_page_metadata(url):
64
  except Exception as e:
65
  return {"url": url, "error": str(e)}
66
 
67
- # --- Fetch favicon for clickable graph nodes ---
68
  @lru_cache(maxsize=256)
69
  def fetch_favicon(url):
70
  try:
@@ -77,37 +68,42 @@ def fetch_favicon(url):
77
  except Exception:
78
  return None
79
 
80
- # --- Trace origins recursively up to max depth ---
81
- def trace_origins(description, max_depth=2):
82
  graph = nx.DiGraph()
 
 
 
83
  def recurse(url, depth):
84
  if depth > max_depth or url in graph:
85
  return
86
  info = fetch_page_metadata(url)
87
  favicon = fetch_favicon(url)
88
  graph.add_node(url, title=info.get("title"), favicon=favicon)
89
- links = []
90
- if "og:url" in info:
91
- links.append(info["og:url"])
 
92
  else:
93
  try:
94
- soup = BeautifulSoup(session.get(url, timeout=5).text, "html.parser")
 
95
  for a in soup.find_all("a", href=True):
96
- if a["href"].startswith("http"):
97
- links.append(a["href"])
 
98
  except:
99
  pass
100
- for link in set(links):
101
  graph.add_edge(link, url)
102
  recurse(link, depth + 1)
103
- seeds = re.findall(r"https?://[^\s\"<]+", description)
104
  for seed in seeds:
105
  recurse(seed, 1)
106
  return graph
107
 
108
- # --- Build PyVis network HTML ---
109
  def build_graph_html(graph):
110
- net = Network(height="500px", width="100%", directed=True)
111
  for url, data in graph.nodes(data=True):
112
  if data.get("favicon"):
113
  net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
@@ -117,27 +113,33 @@ def build_graph_html(graph):
117
  net.add_edge(src, dst)
118
  return net.generate_html()
119
 
120
- # --- Fetch IA items (movies) ---
121
  def fetch_clean_videos(keywords, api_key, scan_enabled):
122
- query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
 
123
  ia_query = f"mediatype:(movies) AND ({query})"
124
- results = list(search_items(ia_query))[:20]
125
- clean_urls = []
126
- for res in results:
127
- identifier = res["identifier"]
128
- item = get_item(identifier)
 
 
 
 
 
129
  for f in item.files:
130
  fmt = f.get("format", "").lower()
131
  if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
132
- url = f"https://archive.org/download/{identifier}/{f['name']}"
133
  if scan_enabled and api_key:
134
  try:
135
- if not scan_url_vt(url, api_key):
136
  continue
137
  except:
138
  continue
139
- clean_urls.append(url)
140
- return clean_urls
141
 
142
  # --- Gradio UI ---
143
  with gr.Blocks() as demo:
@@ -155,7 +157,7 @@ with gr.Blocks() as demo:
155
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
156
  origins_graph = gr.HTML(label="► Source‑Origin Graph")
157
 
158
- executor = ThreadPoolExecutor(max_workers=10)
159
 
160
  def search_and_populate(keywords, api_key, scan_enabled):
161
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
@@ -163,14 +165,20 @@ with gr.Blocks() as demo:
163
 
164
  def update_all(selected_url, ff_on, api_key):
165
  if not selected_url:
166
- return None, {}, {}, ""
167
- identifier = selected_url.split("/")[4]
 
 
 
168
  # 1) IA metadata
169
  try:
170
  item = get_item(identifier)
171
- raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
172
- except:
173
- raw_ia = {"error": "Could not fetch IA metadata"}
 
 
 
174
  # 2) FFprobe
175
  ff_md = {}
176
  if ff_on:
@@ -179,8 +187,8 @@ with gr.Blocks() as demo:
179
  except Exception as e:
180
  ff_md = {"error": str(e)}
181
  # 3) Origins
182
- desc = raw_ia.get("metadata", {}).get("description", "")
183
- graph = trace_origins(desc, max_depth=2)
184
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
185
  return selected_url, raw_ia, ff_md, graph_html
186
 
 
6
  import json
7
  import re
8
  from bs4 import BeautifulSoup
9
+ from concurrent.futures import ThreadPoolExecutor
10
  from functools import lru_cache
11
  import networkx as nx
12
  from pyvis.network import Network
 
21
  # --- VirusTotal helper (optional) ---
22
  def scan_url_vt(url, api_key):
23
  headers = {"x-apikey": api_key}
24
+ resp = session.post("https://www.virustotal.com/api/v3/urls", headers=headers, data={"url": url})
 
 
25
  resp.raise_for_status()
26
  analysis_id = resp.json()["data"]["id"]
27
  # Poll until complete
28
  while True:
29
  time.sleep(5)
30
+ st = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
 
 
31
  st.raise_for_status()
32
  attr = st.json()["data"]["attributes"]
33
  if attr.get("status") == "completed":
 
35
 
36
# --- FFprobe metadata extraction ---
def extract_ffprobe_metadata(url_or_path):
    """Run ffprobe against a local path or URL and return its JSON metadata.

    Invokes the ``ffprobe`` binary with JSON output enabled and returns the
    parsed format/stream information as a dict.  Raises
    ``subprocess.CalledProcessError`` if ffprobe exits non-zero and
    ``FileNotFoundError`` if the binary is not installed.
    """
    probe_cmd = [
        "ffprobe",
        "-v", "error",
        "-print_format", "json",
        "-show_format", "-show_streams",
        url_or_path,
    ]
    raw = subprocess.check_output(probe_cmd)
    return json.loads(raw)
41
 
 
46
  resp = session.get(url, timeout=5)
47
  resp.raise_for_status()
48
  soup = BeautifulSoup(resp.text, "html.parser")
49
+ meta = {"url": url, "title": soup.title.string.strip() if soup.title and soup.title.string else url}
50
+ for tag in soup.find_all("meta"): # OpenGraph & Twitter tags
 
51
  prop = tag.get("property") or tag.get("name")
52
  if prop and prop.startswith(("og:", "twitter:")):
53
  meta[prop] = tag.get("content")
 
55
  except Exception as e:
56
  return {"url": url, "error": str(e)}
57
 
58
+ # --- Fetch favicon for graph nodes ---
59
  @lru_cache(maxsize=256)
60
  def fetch_favicon(url):
61
  try:
 
68
  except Exception:
69
  return None
70
 
71
# --- Recursive origin tracing ---
def trace_origins(description_html, max_depth=2):
    """Build a directed graph of source origins for URLs found in a description.

    The description (HTML allowed) is flattened to plain text, seed URLs are
    extracted with a regex, and each seed is crawled recursively up to
    ``max_depth`` levels.  Edges point from a discovered link to the page it
    was found on.

    Args:
        description_html: HTML (or plain text) that may contain http(s) URLs.
        max_depth: maximum recursion depth per seed URL.

    Returns:
        ``networkx.DiGraph`` with one node per visited URL (annotated with
        ``title`` and ``favicon``) and edges link -> referring page.
    """
    graph = nx.DiGraph()
    # Strip markup so URL extraction is not confused by HTML attributes.
    desc_text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
    seeds = re.findall(r"https?://[^\s]+", desc_text)

    def recurse(url, depth):
        # Stop at the depth limit; skipping already-seen nodes also breaks
        # cycles between mutually-linking pages.
        if depth > max_depth or url in graph:
            return
        info = fetch_page_metadata(url)
        favicon = fetch_favicon(url)
        graph.add_node(url, title=info.get("title"), favicon=favicon)
        # Prefer the canonical og:url; otherwise fall back to scraping anchors.
        next_links = []
        if info.get("og:url"):
            next_links.append(info["og:url"])
        else:
            try:
                page = session.get(url, timeout=5).text
                soup = BeautifulSoup(page, "html.parser")
                for a in soup.find_all("a", href=True):
                    href = a["href"].strip()
                    if href.startswith("http"):
                        next_links.append(href)
            except Exception:
                # Best-effort crawl: a page that cannot be fetched or parsed
                # simply contributes no outgoing links.  (Was a bare ``except:``,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                pass
        for link in set(next_links):
            graph.add_edge(link, url)
            recurse(link, depth + 1)

    for seed in seeds:
        recurse(seed, 1)
    return graph
103
 
104
+ # --- Generate PyVis graph HTML ---
105
  def build_graph_html(graph):
106
+ net = Network(height="500px", width="100%", directed=True, notebook=False)
107
  for url, data in graph.nodes(data=True):
108
  if data.get("favicon"):
109
  net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
 
113
  net.add_edge(src, dst)
114
  return net.generate_html()
115
 
116
# --- Search and filter IA videos ---
def fetch_clean_videos(keywords, api_key, scan_enabled):
    """Search the Internet Archive for video files matching the keywords.

    Args:
        keywords: comma-separated search terms.
        api_key: VirusTotal API key (only used when ``scan_enabled``).
        scan_enabled: when truthy (and an api_key is given), each candidate
            download URL is scanned with VirusTotal and skipped if the scan
            fails or reports it unclean.

    Returns:
        List of direct archive.org download URLs for video files.
    """
    terms = [kw.strip() for kw in keywords.split(",")]
    query = " OR ".join(term.replace(" ", "+") for term in terms)
    ia_query = f"mediatype:(movies) AND ({query})"
    items = list(search_items(ia_query))[:20]  # cap the result set at 20 items
    urls = []
    for res in items:
        identifier = res.get("identifier")
        if not identifier:
            continue
        try:
            item = get_item(identifier)
        except Exception:
            continue  # unreachable/invalid item: skip it rather than abort
        for f in item.files:
            fmt = f.get("format", "").lower()
            if fmt.startswith(("mpeg", "mp4", "avi", "mov", "webm", "m4v")):
                video_url = f"https://archive.org/download/{identifier}/{f['name']}"
                if scan_enabled and api_key:
                    try:
                        if not scan_url_vt(video_url, api_key):
                            continue
                    except Exception:
                        # Was a bare ``except:``; treat any scan error as
                        # "unsafe" and skip the file (fail-closed).
                        continue
                urls.append(video_url)
    return urls
143
 
144
  # --- Gradio UI ---
145
  with gr.Blocks() as demo:
 
157
  ffprobe_json = gr.JSON(label="► FFprobe Metadata")
158
  origins_graph = gr.HTML(label="► Source‑Origin Graph")
159
 
160
+ executor = ThreadPoolExecutor(max_workers=5)
161
 
162
  def search_and_populate(keywords, api_key, scan_enabled):
163
  urls = fetch_clean_videos(keywords, api_key, scan_enabled)
 
165
 
166
  def update_all(selected_url, ff_on, api_key):
167
  if not selected_url:
168
+ return None, {}, {}, "<p>No data.</p>"
169
+ # extract identifier robustly
170
+ parsed = urlparse(selected_url)
171
+ parts = parsed.path.strip("/").split("/")
172
+ identifier = parts[1] if len(parts) > 1 else None
173
  # 1) IA metadata
174
  try:
175
  item = get_item(identifier)
176
+ raw_ia = {
177
+ "metadata": item.metadata,
178
+ "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]
179
+ }
180
+ except Exception as e:
181
+ raw_ia = {"error": f"Could not fetch IA metadata: {e}"}
182
  # 2) FFprobe
183
  ff_md = {}
184
  if ff_on:
 
187
  except Exception as e:
188
  ff_md = {"error": str(e)}
189
  # 3) Origins
190
+ desc_html = raw_ia.get("metadata", {}).get("description", "")
191
+ graph = trace_origins(desc_html, max_depth=2)
192
  graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
193
  return selected_url, raw_ia, ff_md, graph_html
194