wuhp committed on
Commit
8f43a39
·
verified ·
1 Parent(s): bf065ff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -0
app.py CHANGED
@@ -7,6 +7,16 @@ import json
7
  import re
8
  from bs4 import BeautifulSoup
9
 
 
 
 
 
 
 
 
 
 
 
10
  # --- VirusTotal helper functions ---
11
  def scan_url_vt(url, api_key):
12
  headers = {"x-apikey": api_key}
@@ -70,6 +80,11 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
70
 
71
  clean_urls = []
72
  for res in results:
 
 
 
 
 
73
  identifier = res["identifier"]
74
  item = get_item(identifier)
75
  for f in item.files:
 
7
  import re
8
  from bs4 import BeautifulSoup
9
 
10
+ # --- News-station filter ---
11
+ NEWS_FILTER = [
12
+ r"\bcnn\b", r"\bfox\b", r"\bmsnbc\b", r"\bbbc\b", r"\breuters\b",
13
+ r"\bal jazeera\b", r"\bbloomberg\b", r"\bsky news\b", r"\bcnbc\b",
14
+ r"\babc news\b", r"\bnbc\b", r"\bcbs\b", r"\bpbs\b", r"\bnewsweek\b",
15
+ r"\bthe guardian\b", r"\bvice news\b", r"\bpolitico\b", r"\bwashington post\b",
16
+ r"\bnew york times\b", r"\bforbes\b", r"\btime\b", r"\busa today\b"
17
+ ]
18
+ FILTER_REGEX = re.compile("|".join(f"({pat})" for pat in NEWS_FILTER), re.IGNORECASE)
19
+
20
  # --- VirusTotal helper functions ---
21
  def scan_url_vt(url, api_key):
22
  headers = {"x-apikey": api_key}
 
80
 
81
  clean_urls = []
82
  for res in results:
83
+ title = res.get("title", "").lower()
84
+ # skip known news sources
85
+ if FILTER_REGEX.search(title):
86
+ continue
87
+
88
  identifier = res["identifier"]
89
  item = get_item(identifier)
90
  for f in item.files: