Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,16 @@ import json
|
|
| 7 |
import re
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# --- VirusTotal helper functions ---
|
| 11 |
def scan_url_vt(url, api_key):
|
| 12 |
headers = {"x-apikey": api_key}
|
|
@@ -70,6 +80,11 @@ def fetch_clean_videos(keywords, api_key, scan_enabled):
|
|
| 70 |
|
| 71 |
clean_urls = []
|
| 72 |
for res in results:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
identifier = res["identifier"]
|
| 74 |
item = get_item(identifier)
|
| 75 |
for f in item.files:
|
|
|
|
| 7 |
import re
|
| 8 |
from bs4 import BeautifulSoup
|
| 9 |
|
| 10 |
+
# --- News-station filter ---
|
| 11 |
+
# Regex fragments naming news outlets. Each fragment is anchored with \b so it
# only matches as a whole word/phrase (e.g. "cnn" does not match "cnnx").
# NOTE(review): r"\btime\b" and r"\bfox\b" also match the ordinary words
# "time" and "fox" (e.g. "Time lapse ...", "Red fox cubs"), so unrelated
# titles will be filtered out too -- confirm this is intended or narrow them.
NEWS_FILTER = [
    r"\bcnn\b", r"\bfox\b", r"\bmsnbc\b", r"\bbbc\b", r"\breuters\b",
    r"\bal jazeera\b", r"\bbloomberg\b", r"\bsky news\b", r"\bcnbc\b",
    r"\babc news\b", r"\bnbc\b", r"\bcbs\b", r"\bpbs\b", r"\bnewsweek\b",
    r"\bthe guardian\b", r"\bvice news\b", r"\bpolitico\b", r"\bwashington post\b",
    r"\bnew york times\b", r"\bforbes\b", r"\btime\b", r"\busa today\b",
]

# Single case-insensitive alternation over every fragment above, compiled once
# at import time. Non-capturing groups keep each fragment isolated inside the
# alternation without allocating one unused capture group per outlet.
FILTER_REGEX = re.compile(
    "|".join(f"(?:{pat})" for pat in NEWS_FILTER),
    re.IGNORECASE,
)
|
| 19 |
+
|
| 20 |
# --- VirusTotal helper functions ---
|
| 21 |
def scan_url_vt(url, api_key):
|
| 22 |
headers = {"x-apikey": api_key}
|
|
|
|
| 80 |
|
| 81 |
clean_urls = []
|
| 82 |
for res in results:
|
| 83 |
+
title = res.get("title", "").lower()
|
| 84 |
+
# skip known news sources
|
| 85 |
+
if FILTER_REGEX.search(title):
|
| 86 |
+
continue
|
| 87 |
+
|
| 88 |
identifier = res["identifier"]
|
| 89 |
item = get_item(identifier)
|
| 90 |
for f in item.files:
|