| |
| """ |
| ml/topic_model.py |
| ================= |
| Pure keyword/rule-based topic classifier for YouTube live-chat comments. |
| No ML models are loaded β classification is entirely keyword/regex-based. |
| |
| Topics |
| ------ |
| Appreciation β praise, thanks, love, encouragement |
| Question β direct questions and doubts/confusion |
| Request/Feedback β content requests, faculty requests, feedback, suggestions |
| Promo β self-promotion, links, "check my channel" |
| Spam β repeated noise, irrelevant flood, gibberish |
| MCQ Answer β single letter answers (a/b/c/d/e) |
| General β anything that doesn't fit the above (fallback) |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
| |
| VALID_TOPICS = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"} |
|
|
| |
| _APPRECIATION_KW = { |
| "love", "thanks", "thank", "superb", "amazing", "excellent", |
| "awesome", "wonderful", "brilliant", "fantastic", "best", "perfect", |
| "mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab", |
| "waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm", |
| "dhansu", "pyaar", "bindaas", "khush", "happy", |
| "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku", |
| "tysm", "tqsm", "thx", |
| "informative", "fruitful", "motivating", "lovely", |
| "bestest", "loved", "nice", "helpful", |
| "semma", "mass", "solid", "fire", "goated", |
| } |
|
|
| _QUESTION_KW = { |
| "kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi", |
| "kaise", "kyun", "kyunki", |
| "what", "when", "where", "who", "which", "how", "why", |
| "bata", "batao", "bataye", "tell", "explain", |
| "samajh", "confused", "confusion", "doubt", "unclear", |
| "matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha", |
| } |
|
|
| |
| _RF_CONTENT_REQUEST_KW = { |
| "banao", "banana", "banaye", "banaiye", "banado", |
| "karo", "kariye", "karaiye", "kardo", |
| "lao", "laiye", "layiye", |
| "start", "shuru", "launch", "resume", |
| "video", "series", |
| "separate", "alag", "akele", "single", |
| "cover", "include", "add", "topic", |
| "chahiye", "chahte", "chahta", "chahti", |
| "request", "requesting", |
| } |
|
|
| |
| _RF_ACADEMIC_KW = { |
| "pdf", "notes", "note", "download", "upload", |
| "drive", "google", "link", "material", "resource", |
| "timeline", "schedule", "timetable", "syllabus", |
| "infographic", "slides", "ppt", "handout", |
| "provide", "share", "send", "dedo", "dedijiye", |
| "milega", "milegi", "milenge", |
| } |
|
|
| |
| _RF_LANGUAGE_KW = { |
| "hindi", "english", "medium", "language", |
| "translate", "translation", |
| } |
|
|
| |
| _RF_FEEDBACK_KW = { |
| "side", "screen", "dikhta", "dikhai", |
| "correction", "correct", "galat", "wrong", "mistake", |
| "suggestion", "suggest", "improve", "better", |
| "feedback", "review", "opinion", |
| "sorry", "maafi", "apology", |
| "please", "plz", "pls", "plss", "plzz", |
| "dijiye", "dijie", "dena", "dedo", |
| "chahiye", "zaroorat", "need", |
| } |
|
|
| |
| _RF_PRODUCT_KW = { |
| "app", "feature", "option", "button", "setting", |
| "notification", "reminder", "alert", |
| "website", "portal", "platform", |
| } |
|
|
| |
| _RF_ALL_KW = ( |
| _RF_CONTENT_REQUEST_KW |
| | _RF_ACADEMIC_KW |
| | _RF_LANGUAGE_KW |
| | _RF_FEEDBACK_KW |
| | _RF_PRODUCT_KW |
| ) |
|
|
| |
# Regex phrases that strongly signal a request or piece of feedback; matched
# with re.search() against the cleaned, lower-cased comment.  A single hit
# routes the comment to "Request/Feedback" ahead of the keyword voting.
_RF_PHRASES = [
    # Polite imperative: "please ... karo/kijiye/banao/upload/..."
    r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    # "sir please/pls/plz"
    r"\bsir\s+(please|pls|plz)\b",
    # Material requests: "pdf upload", "notes share", ...
    r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
    # Dedicated-content requests: "separate video", "alag session", ...
    r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
    # Language/medium requests: "hindi medium", "english pdf", ...
    r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
    # On-screen feedback: "side ho jaiye", "screen ho", ...
    r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
    # Corrections: "correction ...", "galat ...", "wrong ..."
    r"\b(correction|galat|wrong)\s+\w+\b",
    # Explicit requests: "request hai", "request karna", ...
    r"\brequest\s+(hai|he|h|kar|karna)\b",
    # Want/need phrasing: "chahiye ...", "chahte ...", ...
    r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]
|
|
# Regexes that mark a comment as spam.  ORDER MATTERS: predict_topic() runs
# every pattern except the last unconditionally, and applies the last
# (token-flood) pattern only to comments longer than 20 characters.
_SPAM_PATTERNS = [
    r"^(.)\1{3,}$",                     # one character repeated 4+ times
    r"^[^a-zA-Z\u0900-\u097F]{0,3}$",   # <=3 chars, no Latin/Devanagari letters
    r"https?://\S+",                    # contains a URL
    r"_{4,}",                           # runs of underscores
    r"(?:\b[a-z0-9]{6,}\b\s*){6,}",     # flood of 6+ long gibberish tokens
]


# Fragments of known spam handles / tracking links, matched as substrings
# (not whole words) of the cleaned comment.
_SPAM_KW_SUBSTRINGS = {
    "onelink", "zazb", "gatewallah_official", "pwappweb",
    "kuldeepsir_pw",
}


# Promotion cues, matched as substrings of the cleaned comment.
# NOTE: "http"/"https"/"www"/".com" are substrings of any URL, which is why
# predict_topic() must scan for these only after removing URLs.
_PROMO_KW = {
    "subscribe", "channel", "link", "instagram",
    "check", "visit", "click", "http", "www", ".com", "telegram",
    "https",
}


# Cleaned comments shorter than this are classified "General" before any
# keyword voting happens.
_MIN_FASTPATH_LEN = 4
|
|
|
|
| |
|
|
def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a comment into a topic category.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Anything that doesn't match a keyword falls back to "General".
    """
    # Empty / whitespace-only comments carry no signal.
    if not text or not text.strip():
        return "General", 0.50

    t = text.strip().lower()
    # Drop :emoji_shortcode: tokens and collapse whitespace runs.
    t_clean = re.sub(r":[a-z_]+:", " ", t).strip()
    t_clean = re.sub(r"\s+", " ", t_clean).strip()

    # --- MCQ answers -------------------------------------------------------
    # "a", "bbb", "a,b", "a / c": letters a-e, optionally repeated, optionally
    # separated by commas/slashes.  This single pattern subsumes the bare- and
    # repeated-letter cases, so no separate checks are needed.
    if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean):
        return "MCQ Answer", 0.95

    # --- Spam / Promo ------------------------------------------------------
    # Known spam handles / tracking-link fragments (substring match).
    if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS):
        return "Spam", 0.90

    # URLs: promotional when accompanied by promo wording, otherwise spam.
    # The URL itself is removed before the keyword scan — the "http"/"www"
    # entries in _PROMO_KW match every URL, which previously made the Spam
    # branch unreachable.
    if re.search(r"https?://\S+", t_clean):
        without_urls = re.sub(r"https?://\S+", " ", t_clean)
        if any(kw in without_urls for kw in _PROMO_KW):
            return "Promo", 0.85
        return "Spam", 0.85

    # Generic spam shapes.  The last pattern (token flood) is gated on length
    # so short comments aren't flagged.
    for pat in _SPAM_PATTERNS[:-1]:
        if re.search(pat, t_clean):
            return "Spam", 0.85
    if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean):
        return "Spam", 0.82

    # Promo wording without a full URL ("subscribe", "check my channel", ...).
    if any(kw in t_clean for kw in _PROMO_KW):
        return "Promo", 0.80

    # Too short to classify reliably.
    if len(t_clean) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    # --- Keyword voting ----------------------------------------------------
    words = set(t_clean.split())
    has_question_mark = "?" in text  # raw text: "?" may sit next to an emoji

    question_hits = len(words & _QUESTION_KW)
    appreciation_hits = len(words & _APPRECIATION_KW)
    rf_hits = len(words & _RF_ALL_KW)
    rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)

    # Appreciation only when nothing hints at a question or request —
    # "thanks sir please upload pdf" should land in Request/Feedback.
    if (appreciation_hits >= 1
            and question_hits == 0
            and not has_question_mark
            and rf_hits == 0
            and not rf_phrase_match):
        return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92)

    # Question beats weak request signals; a strong request phrase or two
    # or more request keywords wins over a question cue.
    if (has_question_mark or question_hits >= 1) and rf_hits < 2 and not rf_phrase_match:
        return "Question", min(0.75 + 0.04 * question_hits, 0.92)

    # A phrase-level match is a high-confidence request on its own.
    if rf_phrase_match:
        return "Request/Feedback", 0.85

    # Longer comments need fewer request-keyword hits to qualify.
    min_rf_hits = 1 if len(t_clean) >= 20 else 2
    if rf_hits >= min_rf_hits and question_hits == 0 and not has_question_mark:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # Fallback.
    return "General", 0.55
|
|