wuhp committed on
Commit
a832716
·
verified ·
1 Parent(s): 9053271

Update app.py

Browse files
Files changed (1)
  1. app.py +58 -24
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from internetarchive import search_items, get_item
3
  import requests
@@ -34,17 +35,25 @@ def extract_ffprobe_metadata(url_or_path):
34
  url_or_path
35
  ]
36
  out = subprocess.check_output(cmd)
37
- return json.loads(out)
 
 
 
 
 
 
 
 
 
 
38
 
39
  # --- Scrape basic page metadata (title + og: tags) ---
40
  def fetch_page_metadata(url):
41
  try:
42
  resp = requests.get(url, timeout=5)
43
  resp.raise_for_status()
44
- html = resp.text
45
- soup = BeautifulSoup(html, "html.parser")
46
  meta = {"url": url, "title": soup.title.string if soup.title else None}
47
- # grab OpenGraph tags
48
  for tag in soup.find_all("meta"):
49
  prop = tag.get("property") or tag.get("name")
50
  if prop and prop.startswith(("og:", "twitter:")):
@@ -55,7 +64,8 @@ def fetch_page_metadata(url):
55
 
56
  # --- Core search & scan logic ---
57
  def fetch_clean_videos(keywords, api_key, scan_enabled):
58
- query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
 
59
  ia_query = f"mediatype:(movies) AND ({query})"
60
  results = list(search_items(ia_query))[:50]
61
 
@@ -64,23 +74,22 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
64
  identifier = res["identifier"]
65
  item = get_item(identifier)
66
  for f in item.files:
67
- fmt = f.get("format", "").lower()
68
- if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
 
69
  url = f"https://archive.org/download/{identifier}/{f['name']}"
70
  if scan_enabled and api_key:
71
  try:
72
- is_clean = scan_url_vt(url, api_key)
 
73
  except Exception:
74
  continue
75
- else:
76
- is_clean = True
77
- if is_clean:
78
- clean_urls.append(url)
79
  return clean_urls
80
 
81
  # --- Gradio UI setup ---
82
  with gr.Blocks() as demo:
83
- gr.Markdown("# 📼 IA Drone‑Strike Explorer \nEnable VT scan, FFprobe & Origin Tracing")
84
  with gr.Row():
85
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
86
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -99,11 +108,10 @@ with gr.Blocks() as demo:
99
  return gr.update(choices=urls, value=urls[0] if urls else None)
100
 
101
  def update_all(selected_url, ff_on, api_key):
102
- # no selection guard
103
  if not selected_url:
104
  return None, {}, {}, []
105
 
106
- # 1) IA metadata + file list
107
  parts = selected_url.split("/")
108
  identifier = parts[4] if len(parts) > 4 else None
109
  raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
@@ -117,7 +125,7 @@ with gr.Blocks() as demo:
117
  "format": f.get("format"),
118
  "size": f.get("size"),
119
  "md5": f.get("md5"),
120
- **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
121
  }
122
  for f in item.files
123
  ]
@@ -132,15 +140,40 @@ with gr.Blocks() as demo:
132
  except Exception as e:
133
  ff_md = {"error": str(e)}
134
 
135
- # 3) Origin tracing: scrape each URL in description
136
  origins = []
137
- desc = raw_ia["metadata"].get("description", "")
138
- urls_found = re.findall(r'https?://[^\s"<]+', desc)
139
- for url in urls_found:
140
- meta = fetch_page_metadata(url)
141
- origins.append(meta)
142
- # stop at first “real” origin (you can remove this break to collect all)
143
- break
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
  return selected_url, raw_ia, ff_md, origins
146
 
@@ -157,3 +190,4 @@ with gr.Blocks() as demo:
157
 
158
  if __name__ == "__main__":
159
  demo.launch()
 
 
1
+ ```python
2
  import gradio as gr
3
  from internetarchive import search_items, get_item
4
  import requests
 
35
  url_or_path
36
  ]
37
  out = subprocess.check_output(cmd)
38
+ md = json.loads(out)
39
+ # compute a human-readable FPS for the first video stream
40
+ for stream in md.get("streams", []):
41
+ if stream.get("codec_type") == "video":
42
+ avg_fr = stream.get("avg_frame_rate", "")
43
+ if avg_fr and "/" in avg_fr:
44
+ num, den = avg_fr.split("/")
45
+ if den != "0":
46
+ stream["computed_fps"] = round(int(num) / int(den), 2)
47
+ break
48
+ return md
49
 
50
  # --- Scrape basic page metadata (title + og: tags) ---
51
  def fetch_page_metadata(url):
52
  try:
53
  resp = requests.get(url, timeout=5)
54
  resp.raise_for_status()
55
+ soup = BeautifulSoup(resp.text, "html.parser")
 
56
  meta = {"url": url, "title": soup.title.string if soup.title else None}
 
57
  for tag in soup.find_all("meta"):
58
  prop = tag.get("property") or tag.get("name")
59
  if prop and prop.startswith(("og:", "twitter:")):
 
64
 
65
  # --- Core search & scan logic ---
66
  def fetch_clean_videos(keywords, api_key, scan_enabled):
67
+ # build IA query
68
+ query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
69
  ia_query = f"mediatype:(movies) AND ({query})"
70
  results = list(search_items(ia_query))[:50]
71
 
 
74
  identifier = res["identifier"]
75
  item = get_item(identifier)
76
  for f in item.files:
77
+ name = f.get("name", "").lower()
78
+ # include common video file extensions
79
+ if name.endswith((".mp4", ".m4v", ".mov", ".avi", ".mpg", ".mpeg", ".mkv", ".webm")):
80
  url = f"https://archive.org/download/{identifier}/{f['name']}"
81
  if scan_enabled and api_key:
82
  try:
83
+ if not scan_url_vt(url, api_key):
84
+ continue
85
  except Exception:
86
  continue
87
+ clean_urls.append(url)
 
 
 
88
  return clean_urls
89
 
90
  # --- Gradio UI setup ---
91
  with gr.Blocks() as demo:
92
+ gr.Markdown("# 📼 IA Scrape Enhanced Archive Video Explorer")
93
  with gr.Row():
94
  kw_input = gr.Textbox(label="Search keywords", value="drone strike, military uav")
95
  vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
 
108
  return gr.update(choices=urls, value=urls[0] if urls else None)
109
 
110
  def update_all(selected_url, ff_on, api_key):
 
111
  if not selected_url:
112
  return None, {}, {}, []
113
 
114
+ # 1) IA metadata + files
115
  parts = selected_url.split("/")
116
  identifier = parts[4] if len(parts) > 4 else None
117
  raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
 
125
  "format": f.get("format"),
126
  "size": f.get("size"),
127
  "md5": f.get("md5"),
128
+ **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
129
  }
130
  for f in item.files
131
  ]
 
140
  except Exception as e:
141
  ff_md = {"error": str(e)}
142
 
143
+ # 3) Source‑origin tracing
144
  origins = []
145
+ source_url = None
146
+ meta = raw_ia.get("metadata", {})
147
+
148
+ # check explicit metadata fields
149
+ for key, val in meta.items():
150
+ if key.lower() in ("source", "originalurl"):
151
+ source_url = val[0] if isinstance(val, list) else val
152
+ break
153
+
154
+ # fallback: external-identifier
155
+ if not source_url:
156
+ for key, val in meta.items():
157
+ if key.lower().startswith("external-identifier"):
158
+ ext = val[0] if isinstance(val, list) else val
159
+ if "youtube" in ext:
160
+ vid = ext.split(":")[-1]
161
+ source_url = f"https://www.youtube.com/watch?v={vid}"
162
+ elif "vimeo" in ext:
163
+ vid = ext.split(":")[-1]
164
+ source_url = f"https://vimeo.com/{vid}"
165
+ break
166
+
167
+ # last resort: first URL in description
168
+ if not source_url:
169
+ desc = meta.get("description", "")
170
+ found = re.findall(r"https?://[^\s\"<]+", desc)
171
+ if found:
172
+ source_url = found[0]
173
+
174
+ # fetch page metadata for the source
175
+ if source_url:
176
+ origins.append(fetch_page_metadata(source_url))
177
 
178
  return selected_url, raw_ia, ff_md, origins
179
 
 
190
 
191
  if __name__ == "__main__":
192
  demo.launch()
193
+ ```