Spaces:

wuhp
/

internetscrape

Sleeping

App Files Community

wuhp committed on Jul 18, 2025

Commit

4e52cce

verified ·

1 Parent(s): d4356c2

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -29

app.py CHANGED Viewed

@@ -11,13 +11,14 @@ from functools import lru_cache
 import networkx as nx
 from pyvis.network import Network
 from urllib.parse import urlparse
-# --- Shared HTTP session for speed & headers ---
 session = requests.Session()
 session.headers.update({
     "User-Agent": "Mozilla/5.0 (compatible; IA-Video-Meta-Explorer/1.0)"
 })
-# --- VirusTotal helper (optional) ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
     resp = session.post(
@@ -28,13 +29,15 @@ def scan_url_vt(url, api_key):
     # Poll until complete
     while True:
         time.sleep(5)
-        st = session.get(f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers)
         st.raise_for_status()
         attr = st.json()["data"]["attributes"]
         if attr.get("status") == "completed":
             return attr.get("stats", {}).get("malicious", 0) == 0
-# --- FFprobe metadata extraction ---
 def extract_ffprobe_metadata(url_or_path):
     cmd = [
         "ffprobe", "-v", "error", "-print_format", "json",
@@ -44,7 +47,7 @@ def extract_ffprobe_metadata(url_or_path):
     out = subprocess.check_output(cmd)
     return json.loads(out)
-# --- Caching page metadata ---
 @lru_cache(maxsize=256)
 def fetch_page_metadata(url):
     try:
@@ -52,7 +55,7 @@ def fetch_page_metadata(url):
         resp.raise_for_status()
         soup = BeautifulSoup(resp.text, "html.parser")
         meta = {"url": url, "title": soup.title.string if soup.title else None}
-        # OpenGraph & twitter
         for tag in soup.find_all("meta"):
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
@@ -61,28 +64,28 @@ def fetch_page_metadata(url):
     except Exception as e:
         return {"url": url, "error": str(e)}
-# --- Fetch favicon for clickable graph nodes ---
 @lru_cache(maxsize=256)
 def fetch_favicon(url):
     try:
-        domain = urlparse(url).scheme + "://" + urlparse(url).netloc
-        ico_url = domain + "/favicon.ico"
         resp = session.get(ico_url, timeout=3)
         resp.raise_for_status()
         return ico_url
-    except:
         return None
-# --- Trace origins recursively up to a max depth ---
-def trace_origins(description, max_depth=2, executor=None):
     graph = nx.DiGraph()
-    def _recurse(url, depth):
         if depth > max_depth or url in graph:
             return
         info = fetch_page_metadata(url)
         favicon = fetch_favicon(url)
         graph.add_node(url, title=info.get("title"), favicon=favicon)
-        # find OG:url or linked URLs on page as potential origins
         links = []
         if "og:url" in info:
             links.append(info["og:url"])
@@ -96,34 +99,36 @@ def trace_origins(description, max_depth=2, executor=None):
                 pass
         for link in set(links):
             graph.add_edge(link, url)
-            _recurse(link, depth + 1)
-    # initial URLs from IA description
-    seeds = re.findall(r'https?://[^\s"<]+', description)
     for seed in seeds:
-        _recurse(seed, 1)
     return graph
-# --- Build PyVis network HTML ---
 def build_graph_html(graph):
     net = Network(height="500px", width="100%", directed=True)
     for url, data in graph.nodes(data=True):
-        net.add_node(url, label=data.get("title") or url, title=url, shape="image" if data.get("favicon") else "ellipse", image=data.get("favicon"))
     for src, dst in graph.edges():
         net.add_edge(src, dst)
     return net.generate_html()
-# --- Fetch IA items (movies) ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
-    query = " OR ".join([f"{kw.strip().replace(' ', '+')}" for kw in keywords.split(",")])
     ia_query = f"mediatype:(movies) AND ({query})"
     results = list(search_items(ia_query))[:20]
     clean_urls = []
     for res in results:
-        identifier = res['identifier']
         item = get_item(identifier)
         for f in item.files:
-            fmt = f.get('format', '').lower()
-            if fmt.startswith(('mpeg','mp4','avi','mov','webm','m4v')):
                 url = f"https://archive.org/download/{identifier}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
@@ -134,9 +139,9 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
                 clean_urls.append(url)
     return clean_urls
-# --- Gradio UI ---
 with gr.Blocks() as demo:
-    gr.Markdown("# 📼 IA Drone‑Strike Explorer  — Enhanced Metadata & Origin Tracing")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -161,7 +166,6 @@ with gr.Blocks() as demo:
             return None, {}, {}, ""
         identifier = selected_url.split("/")[4]
         # 1) IA metadata
-        raw_ia = {}
         try:
             item = get_item(identifier)
             raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
@@ -176,7 +180,7 @@ with gr.Blocks() as demo:
                 ff_md = {"error": str(e)}
         # 3) Origins
         desc = raw_ia.get("metadata", {}).get("description", "")
-        graph = trace_origins(desc, max_depth=2, executor=executor)
         graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
         return selected_url, raw_ia, ff_md, graph_html

 import networkx as nx
 from pyvis.network import Network
 from urllib.parse import urlparse
+# --- Shared HTTP session for speed & headers ---
 session = requests.Session()
 session.headers.update({
     "User-Agent": "Mozilla/5.0 (compatible; IA-Video-Meta-Explorer/1.0)"
 })
+# --- VirusTotal helper (optional) ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
     resp = session.post(
     # Poll until complete
     while True:
         time.sleep(5)
+        st = session.get(
+            f"https://www.virustotal.com/api/v3/analyses/{analysis_id}", headers=headers
+        )
         st.raise_for_status()
         attr = st.json()["data"]["attributes"]
         if attr.get("status") == "completed":
             return attr.get("stats", {}).get("malicious", 0) == 0
+# --- FFprobe metadata extraction ---
 def extract_ffprobe_metadata(url_or_path):
     cmd = [
         "ffprobe", "-v", "error", "-print_format", "json",
     out = subprocess.check_output(cmd)
     return json.loads(out)
+# --- Caching page metadata ---
 @lru_cache(maxsize=256)
 def fetch_page_metadata(url):
     try:
         resp.raise_for_status()
         soup = BeautifulSoup(resp.text, "html.parser")
         meta = {"url": url, "title": soup.title.string if soup.title else None}
+        # OpenGraph & twitter tags
         for tag in soup.find_all("meta"):
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
     except Exception as e:
         return {"url": url, "error": str(e)}
+# --- Fetch favicon for clickable graph nodes ---
 @lru_cache(maxsize=256)
 def fetch_favicon(url):
     try:
+        parsed = urlparse(url)
+        domain = f"{parsed.scheme}://{parsed.netloc}"
+        ico_url = f"{domain}/favicon.ico"
         resp = session.get(ico_url, timeout=3)
         resp.raise_for_status()
         return ico_url
+    except Exception:
         return None
+# --- Trace origins recursively up to max depth ---
+def trace_origins(description, max_depth=2):
     graph = nx.DiGraph()
+    def recurse(url, depth):
         if depth > max_depth or url in graph:
             return
         info = fetch_page_metadata(url)
         favicon = fetch_favicon(url)
         graph.add_node(url, title=info.get("title"), favicon=favicon)
         links = []
         if "og:url" in info:
             links.append(info["og:url"])
                 pass
         for link in set(links):
             graph.add_edge(link, url)
+            recurse(link, depth + 1)
+    seeds = re.findall(r"https?://[^\s\"<]+", description)
     for seed in seeds:
+        recurse(seed, 1)
     return graph
+# --- Build PyVis network HTML ---
 def build_graph_html(graph):
     net = Network(height="500px", width="100%", directed=True)
     for url, data in graph.nodes(data=True):
+        if data.get("favicon"):
+            net.add_node(url, label=data.get("title") or url, title=url, shape="image", image=data["favicon"])
+        else:
+            net.add_node(url, label=data.get("title") or url, title=url)
     for src, dst in graph.edges():
         net.add_edge(src, dst)
     return net.generate_html()
+# --- Fetch IA items (movies) ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
+    query = " OR ".join([kw.strip().replace(" ", "+") for kw in keywords.split(",")])
     ia_query = f"mediatype:(movies) AND ({query})"
     results = list(search_items(ia_query))[:20]
     clean_urls = []
     for res in results:
+        identifier = res["identifier"]
         item = get_item(identifier)
         for f in item.files:
+            fmt = f.get("format", "").lower()
+            if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
                 url = f"https://archive.org/download/{identifier}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
                 clean_urls.append(url)
     return clean_urls
+# --- Gradio UI ---
 with gr.Blocks() as demo:
+    gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origin Tracing")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
             return None, {}, {}, ""
         identifier = selected_url.split("/")[4]
         # 1) IA metadata
         try:
             item = get_item(identifier)
             raw_ia = {"metadata": item.metadata, "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]}
                 ff_md = {"error": str(e)}
         # 3) Origins
         desc = raw_ia.get("metadata", {}).get("description", "")
+        graph = trace_origins(desc, max_depth=2)
         graph_html = build_graph_html(graph) if graph.nodes else "<p>No origins found.</p>"
         return selected_url, raw_ia, ff_md, graph_html