DivYonko committed on
Commit
5a13d2c
·
1 Parent(s): 6b26039

Improve keyword accuracy from CSV analysis + gate action_type on topic

Browse files
app.py CHANGED
@@ -292,7 +292,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
292
  try:
293
  sentiment, s_conf = _safe_sentiment(text)
294
  topic, t_conf = _safe_topic(text)
295
- action_type, at_conf = _safe_action_type(text)
 
 
 
 
296
  except Exception as exc:
297
  logger.error("ML inference failed for text=%r: %s", text[:50], exc)
298
  sentiment, s_conf = "Neutral", 0.5
 
292
  try:
293
  sentiment, s_conf = _safe_sentiment(text)
294
  topic, t_conf = _safe_topic(text)
295
+ # Only classify action type for Question/Request topics
296
+ if topic in ("Question", "Request/Feedback"):
297
+ action_type, at_conf = _safe_action_type(text)
298
+ else:
299
+ action_type, at_conf = "N/A", 0.50
300
  except Exception as exc:
301
  logger.error("ML inference failed for text=%r: %s", text[:50], exc)
302
  sentiment, s_conf = "Neutral", 0.5
backend/scraper.py CHANGED
@@ -27,6 +27,7 @@ from backend.config import (
27
  )
28
  from ml.sentiment_model import predict_sentiment
29
  from ml.topic_model import predict_topic, VALID_TOPICS
 
30
 
31
  logging.basicConfig(
32
  level=logging.INFO,
@@ -35,7 +36,7 @@ logging.basicConfig(
35
  )
36
  logger = logging.getLogger("scraper")
37
 
38
- MAX_REDIS_MESSAGES = 10000
39
 
40
 
41
  def _safe_sentiment(text: str) -> tuple[str, float]:
@@ -57,6 +58,17 @@ def _safe_topic(text: str) -> tuple[str, float]:
57
  return "General", 0.50
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  def run(video_id: str, redis_key: str) -> None:
61
  r = redis.Redis(
62
  host=REDIS_HOST,
@@ -84,13 +96,21 @@ def run(video_id: str, redis_key: str) -> None:
84
  while chat.is_alive():
85
  try:
86
  for c in chat.get().sync_items():
87
- text = c.message.strip()
 
 
 
88
  author = c.author.name
89
  if not text:
90
  continue
91
 
92
  sentiment, s_conf = _safe_sentiment(text)
93
  topic, t_conf = _safe_topic(text)
 
 
 
 
 
94
 
95
  message_data = {
96
  "author": author,
@@ -99,6 +119,8 @@ def run(video_id: str, redis_key: str) -> None:
99
  "confidence": round(s_conf, 3),
100
  "topic": topic,
101
  "topic_conf": round(t_conf, 3),
 
 
102
  "time": datetime.now().isoformat(),
103
  }
104
 
@@ -108,11 +130,12 @@ def run(video_id: str, redis_key: str) -> None:
108
  pipe.execute()
109
 
110
  logger.info(
111
- "[%s] %s | %s(%.2f) %s(%.2f) | %r",
112
  message_data["time"][11:19],
113
  author[:20],
114
  sentiment, s_conf,
115
  topic, t_conf,
 
116
  text[:60],
117
  )
118
 
 
27
  )
28
  from ml.sentiment_model import predict_sentiment
29
  from ml.topic_model import predict_topic, VALID_TOPICS
30
+ from ml.action_type_model import predict_action_type, VALID_ACTION_TYPES
31
 
32
  logging.basicConfig(
33
  level=logging.INFO,
 
36
  )
37
  logger = logging.getLogger("scraper")
38
 
39
+ MAX_REDIS_MESSAGES = 40000
40
 
41
 
42
  def _safe_sentiment(text: str) -> tuple[str, float]:
 
58
  return "General", 0.50
59
 
60
 
61
+ def _safe_action_type(text: str) -> tuple[str, float]:
62
+ try:
63
+ action_type, conf = predict_action_type(text)
64
+ if action_type not in VALID_ACTION_TYPES:
65
+ return "N/A", 0.50
66
+ return action_type, conf
67
+ except Exception as exc:
68
+ logger.error("predict_action_type failed for %r: %s", text[:60], exc)
69
+ return "N/A", 0.50
70
+
71
+
72
  def run(video_id: str, redis_key: str) -> None:
73
  r = redis.Redis(
74
  host=REDIS_HOST,
 
96
  while chat.is_alive():
97
  try:
98
  for c in chat.get().sync_items():
99
+ # pytchat converts emoji to :name: codes — convert back to actual characters
100
+ import emoji as _emoji
101
+ raw_text = c.message.strip()
102
+ text = _emoji.emojize(raw_text, language="alias")
103
  author = c.author.name
104
  if not text:
105
  continue
106
 
107
  sentiment, s_conf = _safe_sentiment(text)
108
  topic, t_conf = _safe_topic(text)
109
+ # Only classify action type for Question/Request topics
110
+ if topic in ("Question", "Request/Feedback"):
111
+ action_type, at_conf = _safe_action_type(text)
112
+ else:
113
+ action_type, at_conf = "N/A", 0.50
114
 
115
  message_data = {
116
  "author": author,
 
119
  "confidence": round(s_conf, 3),
120
  "topic": topic,
121
  "topic_conf": round(t_conf, 3),
122
+ "action_type": action_type,
123
+ "action_type_conf": round(at_conf, 3),
124
  "time": datetime.now().isoformat(),
125
  }
126
 
 
130
  pipe.execute()
131
 
132
  logger.info(
133
+ "[%s] %s | %s(%.2f) %s(%.2f) %s(%.2f) | %r",
134
  message_data["time"][11:19],
135
  author[:20],
136
  sentiment, s_conf,
137
  topic, t_conf,
138
+ action_type, at_conf,
139
  text[:60],
140
  )
141
 
ml/action_type_model.py CHANGED
@@ -271,18 +271,16 @@ _PRICING_KW: set[str] = {
271
 
272
  # Fees + Financial Queries — how to purchase, payment, stipend
273
  _FEES_KW: set[str] = {
274
- # Purchase / payment
275
  "purchase", "buy", "kharidna", "kharide", "kharido",
276
  "payment", "pay", "paid",
277
- "kaise", "kaha", "kahan", "milega", "milegi",
278
  # Financial
279
  "stipend", "salary", "income", "earn", "earning",
280
  "emi", "installment", "loan",
281
  # Batch purchase
282
- "batch", "course", "enroll", "enrollment", "admission",
283
  "register", "registration",
284
- # Hinglish
285
- "lena", "lena hai", "chahiye", "chahta", "chahti",
286
  "pw", "physics wallah", "umeed",
287
  }
288
 
@@ -322,41 +320,38 @@ _BATCH_KW: set[str] = {
322
  "worth", "value",
323
  # Faculty in batch
324
  "faculty", "teacher", "sir", "mam",
325
- "included", "include", "hai", "hain",
326
  }
327
 
328
  # ── Keyword sets: remaining categories ───────────────────────────────────────
329
 
330
  # Information- Exam — exam rules, cutoffs, forms, reservations, percentages
331
  _EXAM_INFO_KW: set[str] = {
332
- # Exam process
333
  "form", "bhara", "bharana", "apply", "application",
334
- "notification", "vacancy", "post",
335
  # Exam results / cutoffs
336
  "cutoff", "cut off", "marks", "percentage", "percent",
337
  "prelims", "mains", "interview", "daf",
338
  "clear", "cleared", "qualify", "qualified",
339
  # Reservation / rules
340
- "reservation", "sc", "st", "obc", "ews", "general",
341
  "rule", "rules", "regulation", "norms",
342
  "upsc", "ssc", "ias", "ips", "ifs",
343
  # Exam statistics
344
  "attempt", "attempts", "age", "limit",
345
- "seats", "vacancy", "post",
346
- # Hinglish
347
- "ata", "aata", "kitna", "kitne", "kya", "hai",
348
  "dono", "both",
349
  }
350
 
351
  # Guidance — study strategy, what to study, life advice
352
  _GUIDANCE_KW: set[str] = {
353
- # Study strategy
354
- "kahan", "kaha", "kaise", "konsa", "konsi",
355
- "where", "how", "which", "what",
356
- "se", "kre", "karo", "karein", "start", "shuru",
357
  "strategy", "plan", "approach",
358
  # Subject selection
359
- "optional", "subject", "choose", "select", "lena",
360
  "economy", "geography", "history", "polity",
361
  "physics", "chemistry", "biology", "maths",
362
  # Life / personal advice
@@ -364,9 +359,9 @@ _GUIDANCE_KW: set[str] = {
364
  "control", "manage", "balance",
365
  "motivation", "motivate", "inspired",
366
  "chhod", "chhodna", "drop", "leave",
367
- # Hinglish
368
- "kya", "kya kre", "kya karu", "kya karun",
369
- "sir", "bata", "batao", "suggest",
370
  "ioc", "baaki", "rest",
371
  }
372
 
@@ -721,7 +716,6 @@ def _fast_path(t: str, words: set[str], has_q: bool) -> tuple[str, float] | None
721
  "link", "telegram", "channel", "group", "invite",
722
  "help", "support", "contact",
723
  "refund", "cancel",
724
- "chal", "chalta", "kaam", "karta",
725
  "khul", "khulta",
726
  }
727
  if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
@@ -778,7 +772,6 @@ def _rule_chain(t: str, words: set[str], has_q: bool) -> tuple[str, float] | Non
778
  "kab", "when", "bje", "baje", "time", "timing", "schedule",
779
  "aayega", "aayegi", "aata", "aati", "ata", "ati",
780
  "next", "agla", "agli",
781
- "end", "khatam", "finish",
782
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
783
  "somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
784
  "kb",
 
271
 
272
  # Fees + Financial Queries — how to purchase, payment, stipend
273
  _FEES_KW: set[str] = {
274
+ # Purchase / payment — specific financial terms only
275
  "purchase", "buy", "kharidna", "kharide", "kharido",
276
  "payment", "pay", "paid",
 
277
  # Financial
278
  "stipend", "salary", "income", "earn", "earning",
279
  "emi", "installment", "loan",
280
  # Batch purchase
281
+ "enroll", "enrollment", "admission",
282
  "register", "registration",
283
+ # Hinglish — only specific ones
 
284
  "pw", "physics wallah", "umeed",
285
  }
286
 
 
320
  "worth", "value",
321
  # Faculty in batch
322
  "faculty", "teacher", "sir", "mam",
323
+ "included", "include",
324
  }
325
 
326
  # ── Keyword sets: remaining categories ───────────────────────────────────────
327
 
328
  # Information- Exam — exam rules, cutoffs, forms, reservations, percentages
329
  _EXAM_INFO_KW: set[str] = {
330
+ # Exam process — specific exam terms only
331
  "form", "bhara", "bharana", "apply", "application",
332
+ "notification", "vacancy",
333
  # Exam results / cutoffs
334
  "cutoff", "cut off", "marks", "percentage", "percent",
335
  "prelims", "mains", "interview", "daf",
336
  "clear", "cleared", "qualify", "qualified",
337
  # Reservation / rules
338
+ "reservation", "sc", "st", "obc", "ews",
339
  "rule", "rules", "regulation", "norms",
340
  "upsc", "ssc", "ias", "ips", "ifs",
341
  # Exam statistics
342
  "attempt", "attempts", "age", "limit",
343
+ "seats",
344
+ # Hinglish — only specific exam-related ones
 
345
  "dono", "both",
346
  }
347
 
348
  # Guidance — study strategy, what to study, life advice
349
  _GUIDANCE_KW: set[str] = {
350
+ # Study strategy — specific guidance words only
351
+ "konsa", "konsi",
 
 
352
  "strategy", "plan", "approach",
353
  # Subject selection
354
+ "optional", "subject", "choose", "select",
355
  "economy", "geography", "history", "polity",
356
  "physics", "chemistry", "biology", "maths",
357
  # Life / personal advice
 
359
  "control", "manage", "balance",
360
  "motivation", "motivate", "inspired",
361
  "chhod", "chhodna", "drop", "leave",
362
+ # Hinglish — specific guidance phrases
363
+ "kya kre", "kya karu", "kya karun",
364
+ "suggest",
365
  "ioc", "baaki", "rest",
366
  }
367
 
 
716
  "link", "telegram", "channel", "group", "invite",
717
  "help", "support", "contact",
718
  "refund", "cancel",
 
719
  "khul", "khulta",
720
  }
721
  if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
 
772
  "kab", "when", "bje", "baje", "time", "timing", "schedule",
773
  "aayega", "aayegi", "aata", "aati", "ata", "ati",
774
  "next", "agla", "agli",
 
775
  "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
776
  "somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
777
  "kb",
ml/sentiment_model.py CHANGED
@@ -203,9 +203,28 @@ _POS_WORDS: set[str] = {
203
  # ── Common live chat positives ──
204
  "woww", "wowww", "woah", "whoa", "yay", "yayy",
205
  "haha", "hahaha", "lol", "lmao", # laughter = positive
206
- "clap", "claps", "bravo", "chappal", # chappal = clap in some contexts
207
  "heart", "hearts",
208
- "100", "1000", # "100%" positive
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  }
210
 
211
  # ── Negative keyword set ───────────────────────────────────────────────────────
 
203
  # ── Common live chat positives ──
204
  "woww", "wowww", "woah", "whoa", "yay", "yayy",
205
  "haha", "hahaha", "lol", "lmao", # laughter = positive
206
+ "clap", "claps", "bravo", "chappal",
207
  "heart", "hearts",
208
+ "100", "1000",
209
+
210
+ # ── Greetings / blessings (common in Indian live chats) ──
211
+ "pranam", "pranaam", "namaskar", "namaste", "namasthe",
212
+ "assalamualaikum", "walaikum", "walekum", "waalaikum",
213
+ "jai hind", "jai ho",
214
+ "gm", "gn", "ge",
215
+ "mubarak", "mubarakho",
216
+ "atb",
217
+ "god bless", "stay blessed", "stay safe",
218
+ "welcome", "wlcm", "wlc",
219
+ "congratulations", "congrats",
220
+ "well done", "keep it up", "keep going",
221
+ "proud", "proudly",
222
+ "maza aa gaya", "maza aaya", "maja aa gaya",
223
+ "khyal rakhna",
224
+ "take care",
225
+ "luck", # "good luck", "best of luck" — "luck" alone is positive context
226
+ "morning", # "good morning" — "morning" alone in greeting context
227
+ "evening", # "good evening"
228
  }
229
 
230
  # ── Negative keyword set ───────────────────────────────────────────────────────
ml/topic_model.py CHANGED
@@ -33,12 +33,12 @@ _APPRECIATION_KW = {
33
  "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
34
  "tysm", "tqsm", "thx",
35
  "informative", "fruitful", "motivating", "lovely",
36
- "bestest", "loved", "great", "good", "nice", "helpful",
37
  "semma", "mass", "solid", "fire", "goated",
38
  }
39
 
40
  _QUESTION_KW = {
41
- "kya", "kab", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
42
  "kaise", "kyun", "kyunki",
43
  "what", "when", "where", "who", "which", "how", "why",
44
  "bata", "batao", "bataye", "tell", "explain",
@@ -52,7 +52,7 @@ _RF_CONTENT_REQUEST_KW = {
52
  "karo", "kariye", "karaiye", "kardo",
53
  "lao", "laiye", "layiye",
54
  "start", "shuru", "launch", "resume",
55
- "video", "lecture", "session", "class", "series",
56
  "separate", "alag", "akele", "single",
57
  "cover", "include", "add", "topic",
58
  "chahiye", "chahte", "chahta", "chahti",
@@ -66,6 +66,7 @@ _RF_ACADEMIC_KW = {
66
  "timeline", "schedule", "timetable", "syllabus",
67
  "infographic", "slides", "ppt", "handout",
68
  "provide", "share", "send", "dedo", "dedijiye",
 
69
  }
70
 
71
  # Language requests
@@ -121,7 +122,7 @@ _SPAM_PATTERNS = [
121
  r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
122
  r"https?://\S+",
123
  r"_{4,}",
124
- r"(?:\b[a-z0-9]{6,}\b\s*){3,}",
125
  ]
126
 
127
  _SPAM_KW_SUBSTRINGS = {
@@ -209,7 +210,8 @@ def predict_topic(text: str) -> tuple[str, float]:
209
  rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
210
 
211
  # ── Appreciation ──
212
- min_appr_hits = 1 if len(t_clean) >= 15 else 2
 
213
  if (appreciation_hits >= min_appr_hits
214
  and question_hits == 0
215
  and not has_question_mark
 
33
  "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
34
  "tysm", "tqsm", "thx",
35
  "informative", "fruitful", "motivating", "lovely",
36
+ "bestest", "loved", "nice", "helpful",
37
  "semma", "mass", "solid", "fire", "goated",
38
  }
39
 
40
  _QUESTION_KW = {
41
+ "kya", "kab", "kb", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
42
  "kaise", "kyun", "kyunki",
43
  "what", "when", "where", "who", "which", "how", "why",
44
  "bata", "batao", "bataye", "tell", "explain",
 
52
  "karo", "kariye", "karaiye", "kardo",
53
  "lao", "laiye", "layiye",
54
  "start", "shuru", "launch", "resume",
55
+ "video", "class", "series", # removed "session" and "lecture" — too generic
56
  "separate", "alag", "akele", "single",
57
  "cover", "include", "add", "topic",
58
  "chahiye", "chahte", "chahta", "chahti",
 
66
  "timeline", "schedule", "timetable", "syllabus",
67
  "infographic", "slides", "ppt", "handout",
68
  "provide", "share", "send", "dedo", "dedijiye",
69
+ "milega", "milegi", "milenge", # "where to find" — specific to resource queries
70
  }
71
 
72
  # Language requests
 
122
  r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
123
  r"https?://\S+",
124
  r"_{4,}",
125
+ r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 — avoids catching real sentences
126
  ]
127
 
128
  _SPAM_KW_SUBSTRINGS = {
 
210
  rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
211
 
212
  # ── Appreciation ──
213
+ # Single strong appreciation word is enough regardless of length
214
+ min_appr_hits = 1
215
  if (appreciation_hits >= min_appr_hits
216
  and question_hits == 0
217
  and not has_question_mark
shared.py CHANGED
@@ -268,7 +268,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
268
  try:
269
  sentiment, s_conf = _safe_sentiment(text)
270
  topic, t_conf = _safe_topic(text)
271
- action_type, at_conf = _safe_action_type(text)
 
 
 
 
272
  except Exception as exc:
273
  logger.error("ML inference failed: %s", exc)
274
  sentiment, s_conf = "Neutral", 0.5
 
268
  try:
269
  sentiment, s_conf = _safe_sentiment(text)
270
  topic, t_conf = _safe_topic(text)
271
+ # Only classify action type for Question/Request topics
272
+ if topic in ("Question", "Request/Feedback"):
273
+ action_type, at_conf = _safe_action_type(text)
274
+ else:
275
+ action_type, at_conf = "N/A", 0.50
276
  except Exception as exc:
277
  logger.error("ML inference failed: %s", exc)
278
  sentiment, s_conf = "Neutral", 0.5