Spaces:

wuhp
/

internetscrape

Sleeping

App Files Files Community

wuhp committed on Jul 18, 2025

Commit

12dbf40

verified ·

1 Parent(s): 9ab61fb

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -14

app.py CHANGED Viewed

@@ -46,8 +46,9 @@ def fetch_page_metadata(url):
         resp = session.get(url, timeout=5)
         resp.raise_for_status()
         soup = BeautifulSoup(resp.text, "html.parser")
-        meta = {"url": url, "title": soup.title.string.strip() if soup.title and soup.title.string else url}
-        for tag in soup.find_all("meta"):  # OpenGraph & Twitter tags
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
                 meta[prop] = tag.get("content")
@@ -55,7 +56,7 @@ def fetch_page_metadata(url):
     except Exception as e:
         return {"url": url, "error": str(e)}
-# --- Fetch favicon for graph nodes ---
 @lru_cache(maxsize=256)
 def fetch_favicon(url):
     try:
@@ -65,22 +66,21 @@ def fetch_favicon(url):
         resp = session.get(ico_url, timeout=3)
         resp.raise_for_status()
         return ico_url
-    except Exception:
         return None
 # --- Recursive origin tracing ---
 def trace_origins(description_html, max_depth=2):
     graph = nx.DiGraph()
-    # clean HTML to plain text for better URL extraction
-    desc_text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
-    seeds = re.findall(r"https?://[^\s]+", desc_text)
     def recurse(url, depth):
         if depth > max_depth or url in graph:
             return
         info = fetch_page_metadata(url)
         favicon = fetch_favicon(url)
         graph.add_node(url, title=info.get("title"), favicon=favicon)
-        # find further links via OG:url or anchor tags
         next_links = []
         if info.get("og:url"):
             next_links.append(info["og:url"])
@@ -101,7 +101,7 @@ def trace_origins(description_html, max_depth=2):
         recurse(seed, 1)
     return graph
-# --- Generate PyVis graph HTML ---
 def build_graph_html(graph):
     net = Network(height="500px", width="100%", directed=True, notebook=False)
     for url, data in graph.nodes(data=True):
@@ -113,7 +113,7 @@ def build_graph_html(graph):
         net.add_edge(src, dst)
     return net.generate_html()
-# --- Search and filter IA videos ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
     terms = [kw.strip() for kw in keywords.split(",")]
     query = " OR ".join(term.replace(" ", "+") for term in terms)
@@ -131,7 +131,7 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
         for f in item.files:
             fmt = f.get("format", "").lower()
             if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
-                video_url = f"https://archive.org/download/{identifier}/{f['name']}"
                 if scan_enabled and api_key:
                     try:
                         if not scan_url_vt(video_url, api_key):
@@ -143,7 +143,7 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
 # --- Gradio UI ---
 with gr.Blocks() as demo:
-    gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origin Tracing")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
@@ -166,7 +166,6 @@ with gr.Blocks() as demo:
     def update_all(selected_url, ff_on, api_key):
         if not selected_url:
             return None, {}, {}, "<p>No data.</p>"
-        # extract identifier robustly
         parsed = urlparse(selected_url)
         parts = parsed.path.strip("/").split("/")
         identifier = parts[1] if len(parts) > 1 else None
@@ -175,7 +174,10 @@ with gr.Blocks() as demo:
             item = get_item(identifier)
             raw_ia = {
                 "metadata": item.metadata,
-                "files": [dict(name=f.name, format=f.format, size=f.size) for f in item.files]
             }
         except Exception as e:
             raw_ia = {"error": f"Could not fetch IA metadata: {e}"}

         resp = session.get(url, timeout=5)
         resp.raise_for_status()
         soup = BeautifulSoup(resp.text, "html.parser")
+        title = soup.title.string.strip() if soup.title and soup.title.string else url
+        meta = {"url": url, "title": title}
+        for tag in soup.find_all("meta"):  # OpenGraph & Twitter
             prop = tag.get("property") or tag.get("name")
             if prop and prop.startswith(("og:", "twitter:")):
                 meta[prop] = tag.get("content")
     except Exception as e:
         return {"url": url, "error": str(e)}
+# --- Fetch favicon ---
 @lru_cache(maxsize=256)
 def fetch_favicon(url):
     try:
         resp = session.get(ico_url, timeout=3)
         resp.raise_for_status()
         return ico_url
+    except:
         return None
 # --- Recursive origin tracing ---
 def trace_origins(description_html, max_depth=2):
     graph = nx.DiGraph()
+    text = BeautifulSoup(description_html or "", "html.parser").get_text(separator=' ')
+    seeds = re.findall(r"https?://[^\s]+", text)
     def recurse(url, depth):
         if depth > max_depth or url in graph:
             return
         info = fetch_page_metadata(url)
         favicon = fetch_favicon(url)
         graph.add_node(url, title=info.get("title"), favicon=favicon)
+        # find next links
         next_links = []
         if info.get("og:url"):
             next_links.append(info["og:url"])
         recurse(seed, 1)
     return graph
+# --- Build PyVis graph HTML ---
 def build_graph_html(graph):
     net = Network(height="500px", width="100%", directed=True, notebook=False)
     for url, data in graph.nodes(data=True):
         net.add_edge(src, dst)
     return net.generate_html()
+# --- Search IA videos ---
 def fetch_clean_videos(keywords, api_key, scan_enabled):
     terms = [kw.strip() for kw in keywords.split(",")]
     query = " OR ".join(term.replace(" ", "+") for term in terms)
         for f in item.files:
             fmt = f.get("format", "").lower()
             if fmt.startswith(("mpeg","mp4","avi","mov","webm","m4v")):
+                video_url = f"https://archive.org/download/{identifier}/{f.get('name')}"
                 if scan_enabled and api_key:
                     try:
                         if not scan_url_vt(video_url, api_key):
 # --- Gradio UI ---
 with gr.Blocks() as demo:
+    gr.Markdown("# 📼 IA Drone‑Strike Explorer — Enhanced Metadata & Origins")
     with gr.Row():
         kw_input     = gr.Textbox(label="Search keywords", value="drone strike, military uav")
         vt_key_input = gr.Textbox(label="VirusTotal API Key", type="password")
     def update_all(selected_url, ff_on, api_key):
         if not selected_url:
             return None, {}, {}, "<p>No data.</p>"
         parsed = urlparse(selected_url)
         parts = parsed.path.strip("/").split("/")
         identifier = parts[1] if len(parts) > 1 else None
             item = get_item(identifier)
             raw_ia = {
                 "metadata": item.metadata,
+                "files": [
+                    dict(name=f.get("name"), format=f.get("format"), size=f.get("size"))
+                    for f in item.files
+                ]
             }
         except Exception as e:
             raw_ia = {"error": f"Could not fetch IA metadata: {e}"}