Spaces:

wuhp
/

internetscrape

Sleeping

App Files Community

wuhp committed on Jul 18, 2025

Commit

a832716

verified ·

1 Parent(s): 9053271

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -24

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 from internetarchive import search_items, get_item
 import requests
@@ -34,17 +35,25 @@ def extract_ffprobe_metadata(url_or_path):
         url_or_path
     ]
     out = subprocess.check_output(cmd)
-    return json.loads(out)
 # --- Scrape basic page metadata (title + og: tags) ---
 def fetch_page_metadata(url):
     try:
         resp = requests.get(url, timeout=5)
         resp.raise_for_status()
-        html = resp.text
-        soup = BeautifulSoup(html, "html.parser")
         meta = {"url": url, "title": soup.title.string if soup.title else None}
-        # grab OpenGraph tags
         for tag in soup.find_all("meta"):
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
@@ -55,7 +64,8 @@ def fetch_page_metadata(url):
 # --- Core search & scan logic ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
-    query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
     ia_query = f"mediatype:(movies) AND ({query})"
     results = list(search_items(ia_query))[:50]
@@ -64,23 +74,22 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
         identifier = res["identifier"]
         item = get_item(identifier)
         for f in item.files:
-            fmt = f.get("format", "").lower()
-            if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
                 url = f"https://archive.org/download/{identifier}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
-                        is_clean = scan_url_vt(url, api_key)
                     except Exception:
                         continue
-                else:
-                    is_clean = True
-                if is_clean:
-                    clean_urls.append(url)
     return clean_urls
 # --- Gradio UI setup ---
 with gr.Blocks() as demo:
-    gr.Markdown("# 📼 IA Drone‑Strike Explorer  \nEnable VT scan, FFprobe & Origin Tracing")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -99,11 +108,10 @@ with gr.Blocks() as demo:
         return gr.update(choices=urls, value=urls[0] if urls else None)
     def update_all(selected_url, ff_on, api_key):
-        # no selection guard
         if not selected_url:
             return None, {}, {}, []
-        # 1) IA metadata + file list
         parts = selected_url.split("/")
         identifier = parts[4] if len(parts) > 4 else None
         raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
@@ -117,7 +125,7 @@ with gr.Blocks() as demo:
                         "format": f.get("format"),
                         "size": f.get("size"),
                         "md5": f.get("md5"),
-                        **{k: v for k,v in f.items() if k not in ("name","format","size","md5")}
                     }
                     for f in item.files
                 ]
@@ -132,15 +140,40 @@ with gr.Blocks() as demo:
             except Exception as e:
                 ff_md = {"error": str(e)}
-        # 3) Origin tracing: scrape each URL in description
         origins = []
-        desc = raw_ia["metadata"].get("description", "")
-        urls_found = re.findall(r'https?://[^\s"<]+', desc)
-        for url in urls_found:
-            meta = fetch_page_metadata(url)
-            origins.append(meta)
-            # stop at first “real” origin (you can remove this break to collect all)
-            break
         return selected_url, raw_ia, ff_md, origins
@@ -157,3 +190,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     demo.launch()

+```python
 import gradio as gr
 from internetarchive import search_items, get_item
 import requests
         url_or_path
     ]
     out = subprocess.check_output(cmd)
+    md = json.loads(out)
+    # compute a human-readable FPS for the first video stream
+    for stream in md.get("streams", []):
+        if stream.get("codec_type") == "video":
+            avg_fr = stream.get("avg_frame_rate", "")
+            if avg_fr and "/" in avg_fr:
+                num, den = avg_fr.split("/")
+                if den != "0":
+                    stream["computed_fps"] = round(int(num) / int(den), 2)
+            break
+    return md
 # --- Scrape basic page metadata (title + og: tags) ---
 def fetch_page_metadata(url):
     try:
         resp = requests.get(url, timeout=5)
         resp.raise_for_status()
+        soup = BeautifulSoup(resp.text, "html.parser")
         meta = {"url": url, "title": soup.title.string if soup.title else None}
         for tag in soup.find_all("meta"):
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
 # --- Core search & scan logic ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
+    # build IA query
+    query = " OR ".join(kw.strip().replace(" ", "+") for kw in keywords.split(","))
     ia_query = f"mediatype:(movies) AND ({query})"
     results = list(search_items(ia_query))[:50]
         identifier = res["identifier"]
         item = get_item(identifier)
         for f in item.files:
+            name = f.get("name", "").lower()
+            # include common video file extensions
+            if name.endswith((".mp4", ".m4v", ".mov", ".avi", ".mpg", ".mpeg", ".mkv", ".webm")):
                 url = f"https://archive.org/download/{identifier}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
+                        if not scan_url_vt(url, api_key):
+                            continue
                     except Exception:
                         continue
+                clean_urls.append(url)
     return clean_urls
 # --- Gradio UI setup ---
 with gr.Blocks() as demo:
+    gr.Markdown("# 📼 IA Scrape – Enhanced Archive Video Explorer")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
         return gr.update(choices=urls, value=urls[0] if urls else None)
     def update_all(selected_url, ff_on, api_key):
         if not selected_url:
             return None, {}, {}, []
+        # 1) IA metadata + files
         parts = selected_url.split("/")
         identifier = parts[4] if len(parts) > 4 else None
         raw_ia = {"identifier": identifier, "metadata": {}, "files": []}
                         "format": f.get("format"),
                         "size": f.get("size"),
                         "md5": f.get("md5"),
+                        **{k: v for k, v in f.items() if k not in ("name", "format", "size", "md5")}
                     }
                     for f in item.files
                 ]
             except Exception as e:
                 ff_md = {"error": str(e)}
+        # 3) Source‑origin tracing
         origins = []
+        source_url = None
+        meta = raw_ia.get("metadata", {})
+        # check explicit metadata fields
+        for key, val in meta.items():
+            if key.lower() in ("source", "originalurl"):
+                source_url = val[0] if isinstance(val, list) else val
+                break
+        # fallback: external-identifier
+        if not source_url:
+            for key, val in meta.items():
+                if key.lower().startswith("external-identifier"):
+                    ext = val[0] if isinstance(val, list) else val
+                    if "youtube" in ext:
+                        vid = ext.split(":")[-1]
+                        source_url = f"https://www.youtube.com/watch?v={vid}"
+                    elif "vimeo" in ext:
+                        vid = ext.split(":")[-1]
+                        source_url = f"https://vimeo.com/{vid}"
+                    break
+        # last resort: first URL in description
+        if not source_url:
+            desc = meta.get("description", "")
+            found = re.findall(r"https?://[^\s\"<]+", desc)
+            if found:
+                source_url = found[0]
+        # fetch page metadata for the source
+        if source_url:
+            origins.append(fetch_page_metadata(source_url))
         return selected_url, raw_ia, ff_md, origins
 if __name__ == "__main__":
     demo.launch()
+```