nivakaran committed on
Commit
2a94fae
·
verified ·
1 Parent(s): 206c10b

Deploy from GitHub Actions

Browse files
Files changed (1) hide show
  1. src/utils/trending_detector.py +57 -0
src/utils/trending_detector.py CHANGED
@@ -26,6 +26,58 @@ DEFAULT_DB_PATH = os.path.join(
26
  os.path.dirname(__file__), "..", "..", "data", "trending.db"
27
  )
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  class TrendingDetector:
31
  """
@@ -129,6 +181,11 @@ class TrendingDetector:
129
  domain: Domain (e.g., 'political', 'economical')
130
  timestamp: When the mention occurred (default: now)
131
  """
 
 
 
 
 
132
  topic_hash = self._topic_hash(topic)
133
  ts = timestamp or datetime.now(timezone.utc)
134
  hour_bucket = self._get_hour_bucket(ts)
 
26
  os.path.dirname(__file__), "..", "..", "data", "trending.db"
27
  )
28
 
29
+ # Stopwords - common terms that should NOT trigger trending alerts
30
+ # These are generic Sri Lankan context words that appear in almost every news item
31
+ TRENDING_STOPWORDS = {
32
+ # Country/location
33
+ "sri",
34
+ "lanka",
35
+ "srilanka",
36
+ "sri lanka",
37
+ "colombo",
38
+ "lka",
39
+ # Government/political generic terms
40
+ "government",
41
+ "gov",
42
+ "political",
43
+ "politics",
44
+ "minister",
45
+ "ministry",
46
+ "parliament",
47
+ "president",
48
+ "presidential",
49
+ "cabinet",
50
+ # Economy generic terms
51
+ "economy",
52
+ "economic",
53
+ "economical",
54
+ "finance",
55
+ "financial",
56
+ # Common news words
57
+ "news",
58
+ "report",
59
+ "update",
60
+ "latest",
61
+ "breaking",
62
+ "today",
63
+ "announced",
64
+ "statement",
65
+ "official",
66
+ "officials",
67
+ # Time words
68
+ "yesterday",
69
+ "tomorrow",
70
+ "week",
71
+ "month",
72
+ "year",
73
+ # Generic actions
74
+ "said",
75
+ "says",
76
+ "told",
77
+ "according",
78
+ "sources",
79
+ }
80
+
81
 
82
  class TrendingDetector:
83
  """
 
181
  domain: Domain (e.g., 'political', 'economical')
182
  timestamp: When the mention occurred (default: now)
183
  """
184
+ # Skip stopwords - common generic terms that shouldn't trigger trending
185
+ normalized_topic = topic.lower().strip()
186
+ if normalized_topic in TRENDING_STOPWORDS:
187
+ return # Silently skip stopwords
188
+
189
  topic_hash = self._topic_hash(topic)
190
  ts = timestamp or datetime.now(timezone.utc)
191
  hour_bucket = self._get_hour_bucket(ts)