Deploy from GitHub Actions
Browse files
src/utils/trending_detector.py
CHANGED
|
@@ -26,6 +26,58 @@ DEFAULT_DB_PATH = os.path.join(
|
|
| 26 |
os.path.dirname(__file__), "..", "..", "data", "trending.db"
|
| 27 |
)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
class TrendingDetector:
|
| 31 |
"""
|
|
@@ -129,6 +181,11 @@ class TrendingDetector:
|
|
| 129 |
domain: Domain (e.g., 'political', 'economical')
|
| 130 |
timestamp: When the mention occurred (default: now)
|
| 131 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
topic_hash = self._topic_hash(topic)
|
| 133 |
ts = timestamp or datetime.now(timezone.utc)
|
| 134 |
hour_bucket = self._get_hour_bucket(ts)
|
|
|
|
| 26 |
os.path.dirname(__file__), "..", "..", "data", "trending.db"
|
| 27 |
)
|
| 28 |
|
| 29 |
+
# Stopwords - common terms that should NOT trigger trending alerts
|
| 30 |
+
# These are generic Sri Lankan context words that appear in almost every news item
|
| 31 |
+
TRENDING_STOPWORDS = {
|
| 32 |
+
# Country/location
|
| 33 |
+
"sri",
|
| 34 |
+
"lanka",
|
| 35 |
+
"srilanka",
|
| 36 |
+
"sri lanka",
|
| 37 |
+
"colombo",
|
| 38 |
+
"lka",
|
| 39 |
+
# Government/political generic terms
|
| 40 |
+
"government",
|
| 41 |
+
"gov",
|
| 42 |
+
"political",
|
| 43 |
+
"politics",
|
| 44 |
+
"minister",
|
| 45 |
+
"ministry",
|
| 46 |
+
"parliament",
|
| 47 |
+
"president",
|
| 48 |
+
"presidential",
|
| 49 |
+
"cabinet",
|
| 50 |
+
# Economy generic terms
|
| 51 |
+
"economy",
|
| 52 |
+
"economic",
|
| 53 |
+
"economical",
|
| 54 |
+
"finance",
|
| 55 |
+
"financial",
|
| 56 |
+
# Common news words
|
| 57 |
+
"news",
|
| 58 |
+
"report",
|
| 59 |
+
"update",
|
| 60 |
+
"latest",
|
| 61 |
+
"breaking",
|
| 62 |
+
"today",
|
| 63 |
+
"announced",
|
| 64 |
+
"statement",
|
| 65 |
+
"official",
|
| 66 |
+
"officials",
|
| 67 |
+
# Time words
|
| 68 |
+
"yesterday",
|
| 69 |
+
"tomorrow",
|
| 70 |
+
"week",
|
| 71 |
+
"month",
|
| 72 |
+
"year",
|
| 73 |
+
# Generic actions
|
| 74 |
+
"said",
|
| 75 |
+
"says",
|
| 76 |
+
"told",
|
| 77 |
+
"according",
|
| 78 |
+
"sources",
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
|
| 82 |
class TrendingDetector:
|
| 83 |
"""
|
|
|
|
| 181 |
domain: Domain (e.g., 'political', 'economical')
|
| 182 |
timestamp: When the mention occurred (default: now)
|
| 183 |
"""
|
| 184 |
+
# Skip stopwords - common generic terms that shouldn't trigger trending
|
| 185 |
+
normalized_topic = topic.lower().strip()
|
| 186 |
+
if normalized_topic in TRENDING_STOPWORDS:
|
| 187 |
+
return # Silently skip stopwords
|
| 188 |
+
|
| 189 |
topic_hash = self._topic_hash(topic)
|
| 190 |
ts = timestamp or datetime.now(timezone.utc)
|
| 191 |
hour_bucket = self._get_hour_bucket(ts)
|