Spaces:

wuhp
/

internetscrape

Sleeping

wuhp committed on Jul 18, 2025

Commit

8f43a39

verified ·

1 Parent(s): bf065ff

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,6 +7,16 @@ import json
 import re
 from bs4 import BeautifulSoup
 # --- VirusTotal helper functions ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
@@ -70,6 +80,11 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
     clean_urls = []
     for res in results:
         identifier = res["identifier"]
         item = get_item(identifier)
         for f in item.files:

 import re
 from bs4 import BeautifulSoup
+# --- News-station filter ---
+NEWS_FILTER = [
+    r"\bcnn\b", r"\bfox\b", r"\bmsnbc\b", r"\bbbc\b", r"\breuters\b",
+    r"\bal jazeera\b", r"\bbloomberg\b", r"\bsky news\b", r"\bcnbc\b",
+    r"\babc news\b", r"\bnbc\b", r"\bcbs\b", r"\bpbs\b", r"\bnewsweek\b",
+    r"\bthe guardian\b", r"\bvice news\b", r"\bpolitico\b", r"\bwashington post\b",
+    r"\bnew york times\b", r"\bforbes\b", r"\btime\b", r"\busa today\b"
+]
+FILTER_REGEX = re.compile("|".join(f"({pat})" for pat in NEWS_FILTER), re.IGNORECASE)
 # --- VirusTotal helper functions ---
 def scan_url_vt(url, api_key):
     headers = {"x-apikey": api_key}
     clean_urls = []
     for res in results:
+        title = res.get("title", "").lower()
+        # skip known news sources
+        if FILTER_REGEX.search(title):
+            continue
         identifier = res["identifier"]
         item = get_item(identifier)
         for f in item.files: