Spaces:

Divyonko
/

LivePulse

Sleeping

App Files Community

DivYonko committed about 1 month ago

Commit

5a13d2c

1 Parent(s): 6b26039

Improve keyword accuracy from CSV analysis + gate action_type on topic

Browse files

Files changed (6) hide show

app.py +5 -1
backend/scraper.py +26 -3
ml/action_type_model.py +15 -22
ml/sentiment_model.py +21 -2
ml/topic_model.py +7 -5
shared.py +5 -1

app.py CHANGED Viewed

@@ -292,7 +292,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
                 try:
                     sentiment, s_conf = _safe_sentiment(text)
                     topic,     t_conf = _safe_topic(text)
-                    action_type, at_conf = _safe_action_type(text)
                 except Exception as exc:
                     logger.error("ML inference failed for text=%r: %s", text[:50], exc)
                     sentiment, s_conf = "Neutral", 0.5

                 try:
                     sentiment, s_conf = _safe_sentiment(text)
                     topic,     t_conf = _safe_topic(text)
+                    # Only classify action type for Question/Request topics
+                    if topic in ("Question", "Request/Feedback"):
+                        action_type, at_conf = _safe_action_type(text)
+                    else:
+                        action_type, at_conf = "N/A", 0.50
                 except Exception as exc:
                     logger.error("ML inference failed for text=%r: %s", text[:50], exc)
                     sentiment, s_conf = "Neutral", 0.5

backend/scraper.py CHANGED Viewed

@@ -27,6 +27,7 @@ from backend.config import (
 )
 from ml.sentiment_model import predict_sentiment
 from ml.topic_model import predict_topic, VALID_TOPICS
 logging.basicConfig(
     level=logging.INFO,
@@ -35,7 +36,7 @@ logging.basicConfig(
 )
 logger = logging.getLogger("scraper")
-MAX_REDIS_MESSAGES = 10000
 def _safe_sentiment(text: str) -> tuple[str, float]:
@@ -57,6 +58,17 @@ def _safe_topic(text: str) -> tuple[str, float]:
         return "General", 0.50
 def run(video_id: str, redis_key: str) -> None:
     r = redis.Redis(
         host=REDIS_HOST,
@@ -84,13 +96,21 @@ def run(video_id: str, redis_key: str) -> None:
     while chat.is_alive():
         try:
             for c in chat.get().sync_items():
-                text   = c.message.strip()
                 author = c.author.name
                 if not text:
                     continue
                 sentiment, s_conf = _safe_sentiment(text)
                 topic,     t_conf = _safe_topic(text)
                 message_data = {
                     "author":     author,
@@ -99,6 +119,8 @@ def run(video_id: str, redis_key: str) -> None:
                     "confidence": round(s_conf, 3),
                     "topic":      topic,
                     "topic_conf": round(t_conf, 3),
                     "time":       datetime.now().isoformat(),
                 }
@@ -108,11 +130,12 @@ def run(video_id: str, redis_key: str) -> None:
                 pipe.execute()
                 logger.info(
-                    "[%s] %s | %s(%.2f) %s(%.2f) | %r",
                     message_data["time"][11:19],
                     author[:20],
                     sentiment, s_conf,
                     topic, t_conf,
                     text[:60],
                 )

 )
 from ml.sentiment_model import predict_sentiment
 from ml.topic_model import predict_topic, VALID_TOPICS
+from ml.action_type_model import predict_action_type, VALID_ACTION_TYPES
 logging.basicConfig(
     level=logging.INFO,
 )
 logger = logging.getLogger("scraper")
+MAX_REDIS_MESSAGES = 40000
 def _safe_sentiment(text: str) -> tuple[str, float]:
         return "General", 0.50
+def _safe_action_type(text: str) -> tuple[str, float]:
+    try:
+        action_type, conf = predict_action_type(text)
+        if action_type not in VALID_ACTION_TYPES:
+            return "N/A", 0.50
+        return action_type, conf
+    except Exception as exc:
+        logger.error("predict_action_type failed for %r: %s", text[:60], exc)
+        return "N/A", 0.50
 def run(video_id: str, redis_key: str) -> None:
     r = redis.Redis(
         host=REDIS_HOST,
     while chat.is_alive():
         try:
             for c in chat.get().sync_items():
+                # pytchat converts emoji to :name: codes — convert back to actual characters
+                import emoji as _emoji
+                raw_text = c.message.strip()
+                text = _emoji.emojize(raw_text, language="alias")
                 author = c.author.name
                 if not text:
                     continue
                 sentiment, s_conf = _safe_sentiment(text)
                 topic,     t_conf = _safe_topic(text)
+                # Only classify action type for Question/Request topics
+                if topic in ("Question", "Request/Feedback"):
+                    action_type, at_conf = _safe_action_type(text)
+                else:
+                    action_type, at_conf = "N/A", 0.50
                 message_data = {
                     "author":     author,
                     "confidence": round(s_conf, 3),
                     "topic":      topic,
                     "topic_conf": round(t_conf, 3),
+                    "action_type":      action_type,
+                    "action_type_conf": round(at_conf, 3),
                     "time":       datetime.now().isoformat(),
                 }
                 pipe.execute()
                 logger.info(
+                    "[%s] %s | %s(%.2f) %s(%.2f) %s(%.2f) | %r",
                     message_data["time"][11:19],
                     author[:20],
                     sentiment, s_conf,
                     topic, t_conf,
+                    action_type, at_conf,
                     text[:60],
                 )

ml/action_type_model.py CHANGED Viewed

@@ -271,18 +271,16 @@ _PRICING_KW: set[str] = {
 # Fees + Financial Queries — how to purchase, payment, stipend
 _FEES_KW: set[str] = {
-    # Purchase / payment
     "purchase", "buy", "kharidna", "kharide", "kharido",
     "payment", "pay", "paid",
-    "kaise", "kaha", "kahan", "milega", "milegi",
     # Financial
     "stipend", "salary", "income", "earn", "earning",
     "emi", "installment", "loan",
     # Batch purchase
-    "batch", "course", "enroll", "enrollment", "admission",
     "register", "registration",
-    # Hinglish
-    "lena", "lena hai", "chahiye", "chahta", "chahti",
     "pw", "physics wallah", "umeed",
 }
@@ -322,41 +320,38 @@ _BATCH_KW: set[str] = {
     "worth", "value",
     # Faculty in batch
     "faculty", "teacher", "sir", "mam",
-    "included", "include", "hai", "hain",
 }
 # ── Keyword sets: remaining categories ───────────────────────────────────────
 # Information- Exam — exam rules, cutoffs, forms, reservations, percentages
 _EXAM_INFO_KW: set[str] = {
-    # Exam process
     "form", "bhara", "bharana", "apply", "application",
-    "notification", "vacancy", "post",
     # Exam results / cutoffs
     "cutoff", "cut off", "marks", "percentage", "percent",
     "prelims", "mains", "interview", "daf",
     "clear", "cleared", "qualify", "qualified",
     # Reservation / rules
-    "reservation", "sc", "st", "obc", "ews", "general",
     "rule", "rules", "regulation", "norms",
     "upsc", "ssc", "ias", "ips", "ifs",
     # Exam statistics
     "attempt", "attempts", "age", "limit",
-    "seats", "vacancy", "post",
-    # Hinglish
-    "ata", "aata", "kitna", "kitne", "kya", "hai",
     "dono", "both",
 }
 # Guidance — study strategy, what to study, life advice
 _GUIDANCE_KW: set[str] = {
-    # Study strategy
-    "kahan", "kaha", "kaise", "konsa", "konsi",
-    "where", "how", "which", "what",
-    "se", "kre", "karo", "karein", "start", "shuru",
     "strategy", "plan", "approach",
     # Subject selection
-    "optional", "subject", "choose", "select", "lena",
     "economy", "geography", "history", "polity",
     "physics", "chemistry", "biology", "maths",
     # Life / personal advice
@@ -364,9 +359,9 @@ _GUIDANCE_KW: set[str] = {
     "control", "manage", "balance",
     "motivation", "motivate", "inspired",
     "chhod", "chhodna", "drop", "leave",
-    # Hinglish
-    "kya", "kya kre", "kya karu", "kya karun",
-    "sir", "bata", "batao", "suggest",
     "ioc", "baaki", "rest",
 }
@@ -721,7 +716,6 @@ def _fast_path(t: str, words: set[str], has_q: bool) -> tuple[str, float] | None
         "link", "telegram", "channel", "group", "invite",
         "help", "support", "contact",
         "refund", "cancel",
-        "chal", "chalta", "kaam", "karta",
         "khul", "khulta",
     }
     if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
@@ -778,7 +772,6 @@ def _rule_chain(t: str, words: set[str], has_q: bool) -> tuple[str, float] | Non
         "kab", "when", "bje", "baje", "time", "timing", "schedule",
         "aayega", "aayegi", "aata", "aati", "ata", "ati",
         "next", "agla", "agli",
-        "end", "khatam", "finish",
         "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
         "somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
         "kb",

 # Fees + Financial Queries — how to purchase, payment, stipend
 _FEES_KW: set[str] = {
+    # Purchase / payment — specific financial terms only
     "purchase", "buy", "kharidna", "kharide", "kharido",
     "payment", "pay", "paid",
     # Financial
     "stipend", "salary", "income", "earn", "earning",
     "emi", "installment", "loan",
     # Batch purchase
+    "enroll", "enrollment", "admission",
     "register", "registration",
+    # Hinglish — only specific ones
     "pw", "physics wallah", "umeed",
 }
     "worth", "value",
     # Faculty in batch
     "faculty", "teacher", "sir", "mam",
+    "included", "include",
 }
 # ── Keyword sets: remaining categories ───────────────────────────────────────
 # Information- Exam — exam rules, cutoffs, forms, reservations, percentages
 _EXAM_INFO_KW: set[str] = {
+    # Exam process — specific exam terms only
     "form", "bhara", "bharana", "apply", "application",
+    "notification", "vacancy",
     # Exam results / cutoffs
     "cutoff", "cut off", "marks", "percentage", "percent",
     "prelims", "mains", "interview", "daf",
     "clear", "cleared", "qualify", "qualified",
     # Reservation / rules
+    "reservation", "sc", "st", "obc", "ews",
     "rule", "rules", "regulation", "norms",
     "upsc", "ssc", "ias", "ips", "ifs",
     # Exam statistics
     "attempt", "attempts", "age", "limit",
+    "seats",
+    # Hinglish — only specific exam-related ones
     "dono", "both",
 }
 # Guidance — study strategy, what to study, life advice
 _GUIDANCE_KW: set[str] = {
+    # Study strategy — specific guidance words only
+    "konsa", "konsi",
     "strategy", "plan", "approach",
     # Subject selection
+    "optional", "subject", "choose", "select",
     "economy", "geography", "history", "polity",
     "physics", "chemistry", "biology", "maths",
     # Life / personal advice
     "control", "manage", "balance",
     "motivation", "motivate", "inspired",
     "chhod", "chhodna", "drop", "leave",
+    # Hinglish — specific guidance phrases
+    "kya kre", "kya karu", "kya karun",
+    "suggest",
     "ioc", "baaki", "rest",
 }
         "link", "telegram", "channel", "group", "invite",
         "help", "support", "contact",
         "refund", "cancel",
         "khul", "khulta",
     }
     if len(words & _ACCESS_SUPPORT_CORE_KW) >= 1:
         "kab", "when", "bje", "baje", "time", "timing", "schedule",
         "aayega", "aayegi", "aata", "aati", "ata", "ati",
         "next", "agla", "agli",
         "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
         "somvar", "mangalvar", "budhvar", "guruvar", "shukravar", "shanivar", "ravivar",
         "kb",

ml/sentiment_model.py CHANGED Viewed

@@ -203,9 +203,28 @@ _POS_WORDS: set[str] = {
     # ── Common live chat positives ──
     "woww", "wowww", "woah", "whoa", "yay", "yayy",
     "haha", "hahaha", "lol", "lmao",   # laughter = positive
-    "clap", "claps", "bravo", "chappal",  # chappal = clap in some contexts
     "heart", "hearts",
-    "100", "1000",   # "100%" positive
 }
 # ── Negative keyword set ───────────────────────────────────────────────────────

     # ── Common live chat positives ──
     "woww", "wowww", "woah", "whoa", "yay", "yayy",
     "haha", "hahaha", "lol", "lmao",   # laughter = positive
+    "clap", "claps", "bravo", "chappal",
     "heart", "hearts",
+    "100", "1000",
+    # ── Greetings / blessings (common in Indian live chats) ──
+    "pranam", "pranaam", "namaskar", "namaste", "namasthe",
+    "assalamualaikum", "walaikum", "walekum", "waalaikum",
+    "jai hind", "jai ho",
+    "gm", "gn", "ge",
+    "mubarak", "mubarakho",
+    "atb",
+    "god bless", "stay blessed", "stay safe",
+    "welcome", "wlcm", "wlc",
+    "congratulations", "congrats",
+    "well done", "keep it up", "keep going",
+    "proud", "proudly",
+    "maza aa gaya", "maza aaya", "maja aa gaya",
+    "khyal rakhna",
+    "take care",
+    "luck",          # "good luck", "best of luck" — "luck" alone is positive context
+    "morning",       # "good morning" — "morning" alone in greeting context
+    "evening",       # "good evening"
 }
 # ── Negative keyword set ───────────────────────────────────────────────────────

ml/topic_model.py CHANGED Viewed

@@ -33,12 +33,12 @@ _APPRECIATION_KW = {
     "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
     "tysm", "tqsm", "thx",
     "informative", "fruitful", "motivating", "lovely",
-    "bestest", "loved", "great", "good", "nice", "helpful",
     "semma", "mass", "solid", "fire", "goated",
 }
 _QUESTION_KW = {
-    "kya", "kab", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
     "kaise", "kyun", "kyunki",
     "what", "when", "where", "who", "which", "how", "why",
     "bata", "batao", "bataye", "tell", "explain",
@@ -52,7 +52,7 @@ _RF_CONTENT_REQUEST_KW = {
     "karo", "kariye", "karaiye", "kardo",
     "lao", "laiye", "layiye",
     "start", "shuru", "launch", "resume",
-    "video", "lecture", "session", "class", "series",
     "separate", "alag", "akele", "single",
     "cover", "include", "add", "topic",
     "chahiye", "chahte", "chahta", "chahti",
@@ -66,6 +66,7 @@ _RF_ACADEMIC_KW = {
     "timeline", "schedule", "timetable", "syllabus",
     "infographic", "slides", "ppt", "handout",
     "provide", "share", "send", "dedo", "dedijiye",
 }
 # Language requests
@@ -121,7 +122,7 @@ _SPAM_PATTERNS = [
     r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
     r"https?://\S+",
     r"_{4,}",
-    r"(?:\b[a-z0-9]{6,}\b\s*){3,}",
 ]
 _SPAM_KW_SUBSTRINGS = {
@@ -209,7 +210,8 @@ def predict_topic(text: str) -> tuple[str, float]:
     rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
     # ── Appreciation ──
-    min_appr_hits = 1 if len(t_clean) >= 15 else 2
     if (appreciation_hits >= min_appr_hits
             and question_hits == 0
             and not has_question_mark

     "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
     "tysm", "tqsm", "thx",
     "informative", "fruitful", "motivating", "lovely",
+    "bestest", "loved", "nice", "helpful",
     "semma", "mass", "solid", "fire", "goated",
 }
 _QUESTION_KW = {
+    "kya", "kab", "kb", "kahan", "kaun", "kitna", "kitne", "konsa", "konsi",
     "kaise", "kyun", "kyunki",
     "what", "when", "where", "who", "which", "how", "why",
     "bata", "batao", "bataye", "tell", "explain",
     "karo", "kariye", "karaiye", "kardo",
     "lao", "laiye", "layiye",
     "start", "shuru", "launch", "resume",
+    "video", "class", "series",   # removed "session" and "lecture" — too generic
     "separate", "alag", "akele", "single",
     "cover", "include", "add", "topic",
     "chahiye", "chahte", "chahta", "chahti",
     "timeline", "schedule", "timetable", "syllabus",
     "infographic", "slides", "ppt", "handout",
     "provide", "share", "send", "dedo", "dedijiye",
+    "milega", "milegi", "milenge",   # "where to find" — specific to resource queries
 }
 # Language requests
     r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
     r"https?://\S+",
     r"_{4,}",
+    r"(?:\b[a-z0-9]{6,}\b\s*){6,}",   # raised from 3 to 6 — avoids catching real sentences
 ]
 _SPAM_KW_SUBSTRINGS = {
     rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)
     # ── Appreciation ──
+    # Single strong appreciation word is enough regardless of length
+    min_appr_hits = 1
     if (appreciation_hits >= min_appr_hits
             and question_hits == 0
             and not has_question_mark

shared.py CHANGED Viewed

@@ -268,7 +268,11 @@ def _scraper_thread_fn(video_id: str, redis_key: str, stop_event: threading.Even
                 try:
                     sentiment, s_conf = _safe_sentiment(text)
                     topic,     t_conf = _safe_topic(text)
-                    action_type, at_conf = _safe_action_type(text)
                 except Exception as exc:
                     logger.error("ML inference failed: %s", exc)
                     sentiment, s_conf = "Neutral", 0.5

                 try:
                     sentiment, s_conf = _safe_sentiment(text)
                     topic,     t_conf = _safe_topic(text)
+                    # Only classify action type for Question/Request topics
+                    if topic in ("Question", "Request/Feedback"):
+                        action_type, at_conf = _safe_action_type(text)
+                    else:
+                        action_type, at_conf = "N/A", 0.50
                 except Exception as exc:
                     logger.error("ML inference failed: %s", exc)
                     sentiment, s_conf = "Neutral", 0.5