# -*- coding: utf-8 -*-
"""
ml/topic_model.py
=================
Pure keyword/rule-based topic classifier for YouTube live-chat comments.
No ML models are loaded β classification is entirely keyword/regex-based.
Topics
------
Appreciation β praise, thanks, love, encouragement
Question β direct questions and doubts/confusion
Request/Feedback β content requests, faculty requests, feedback, suggestions
Promo β self-promotion, links, "check my channel"
Spam β repeated noise, irrelevant flood, gibberish
MCQ Answer β single letter answers (a/b/c/d/e)
General β anything that doesn't fit the above (fallback)
"""
from __future__ import annotations
import re
# ── Valid topics ───────────────────────────────────────────────────────────────
# Complete set of topic labels; "General" is the fallback label.
VALID_TOPICS = {
    "Appreciation",
    "Question",
    "Request/Feedback",
    "Promo",
    "Spam",
    "MCQ Answer",
    "General",
}
# ── Keyword fast-path ──────────────────────────────────────────────────────────
_APPRECIATION_KW = {
"love", "thanks", "thank", "superb", "amazing", "excellent",
"awesome", "wonderful", "brilliant", "fantastic", "best", "perfect",
"mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab",
"waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm",
"dhansu", "pyaar", "bindaas", "khush", "happy",
"thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku",
"tysm", "tqsm", "thx",
"informative", "fruitful", "motivating", "lovely",
"bestest", "loved", "nice", "helpful",
"semma", "mass", "solid", "fire", "goated",
}
_QUESTION_KW = {
"kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi",
"kaise", "kyun", "kyunki",
"what", "when", "where", "who", "which", "how", "why",
"bata", "batao", "bataye", "tell", "explain",
"samajh", "confused", "confusion", "doubt", "unclear",
"matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha",
}
# Content requests β asking for new videos, topics, sessions
_RF_CONTENT_REQUEST_KW = {
"banao", "banana", "banaye", "banaiye", "banado",
"karo", "kariye", "karaiye", "kardo",
"lao", "laiye", "layiye",
"start", "shuru", "launch", "resume",
"video", "series", # removed "class" and "session" β too generic
"separate", "alag", "akele", "single",
"cover", "include", "add", "topic",
"chahiye", "chahte", "chahta", "chahti",
"request", "requesting",
}
# Academic/resource requests β PDFs, notes, downloads
_RF_ACADEMIC_KW = {
"pdf", "notes", "note", "download", "upload",
"drive", "google", "link", "material", "resource",
"timeline", "schedule", "timetable", "syllabus",
"infographic", "slides", "ppt", "handout",
"provide", "share", "send", "dedo", "dedijiye",
"milega", "milegi", "milenge", # "where to find" β specific to resource queries
}
# Language requests
_RF_LANGUAGE_KW = {
"hindi", "english", "medium", "language",
"translate", "translation",
}
# Feedback/suggestion keywords
_RF_FEEDBACK_KW = {
"side", "screen", "dikhta", "dikhai",
"correction", "correct", "galat", "wrong", "mistake",
"suggestion", "suggest", "improve", "better",
"feedback", "review", "opinion",
"sorry", "maafi", "apology",
"please", "plz", "pls", "plss", "plzz",
"dijiye", "dijie", "dena", "dedo",
"chahiye", "zaroorat", "need",
}
# Product/app feature requests
_RF_PRODUCT_KW = {
"app", "feature", "option", "button", "setting",
"notification", "reminder", "alert",
"website", "portal", "platform",
}
# Combined RF keyword set
_RF_ALL_KW = (
_RF_CONTENT_REQUEST_KW
| _RF_ACADEMIC_KW
| _RF_LANGUAGE_KW
| _RF_FEEDBACK_KW
| _RF_PRODUCT_KW
)
# Multi-word regex patterns that strongly indicate Request/Feedback.
# predict_topic() runs re.search() with each pattern on the cleaned,
# lower-cased comment; one hit is enough to veto Appreciation and Question
# and label the comment Request/Feedback.
_RF_PHRASES = [
    # "please <any word> <action verb>", e.g. "please pdf upload"
    r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    # Same shape with the abbreviation "pls"
    r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b",
    # "sir please" / "sir pls" / "sir plz"
    r"\bsir\s+(please|pls|plz)\b",
    # Resource noun directly followed by a sharing verb, e.g. "notes share"
    r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b",
    # Request for a dedicated video/session/class/lecture, e.g. "alag video"
    r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b",
    # Language followed by medium/material, e.g. "hindi medium", "english pdf"
    r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b",
    # "side"/"screen" followed by ho/hojao/hojaye/jaiye
    # NOTE(review): presumably "please move aside / off the screen" — confirm.
    r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b",
    # "correction"/"galat"/"wrong" followed by any word, e.g. "wrong answer"
    r"\b(correction|galat|wrong)\s+\w+\b",
    # "request hai/he/h/kar/karna"
    r"\brequest\s+(hai|he|h|kar|karna)\b",
    # "chahiye"/"chahte"/"chahta"/"chahti" followed by any word
    r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b",
]
# Regex patterns that mark a comment as Spam.
# ORDER MATTERS: predict_topic() applies every entry except the last to any
# comment, and applies the last entry separately, only to comments longer
# than 20 characters.  New patterns must therefore be inserted before the
# final "gibberish" entry.
_SPAM_PATTERNS = [
    # Whole comment is a single character repeated 4 or more times ("aaaa")
    r"^(.)\1{3,}$",
    # Whole comment is 0-3 characters with no Latin letter and no character
    # from U+0900-U+097F (the Devanagari block), e.g. "??", "12", or empty
    r"^[^a-zA-Z\u0900-\u097F]{0,3}$",
    # Any http:// or https:// URL.  NOTE: predict_topic() already tests for
    # URLs in an earlier branch before it loops over this list.
    r"https?://\S+",
    # A run of 4 or more underscores
    r"_{4,}",
    # Six or more consecutive tokens, each made only of 6+ lowercase
    # letters/digits (long gibberish runs)
    r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 — avoids catching real sentences
]
_SPAM_KW_SUBSTRINGS = {
"onelink", "zazb", "gatewallah_official", "pwappweb",
"kuldeepsir_pw",
}
_PROMO_KW = {
"subscribe", "channel", "link", "instagram",
"check", "visit", "click", "http", "www", ".com", "telegram",
"https",
}
# Minimum length, in characters, of the cleaned comment for the keyword rules
# (Appreciation / Question / Request/Feedback) to run.  Shorter comments that
# were not already caught by the MCQ, Spam or Promo checks are returned as
# "General".
_MIN_FASTPATH_LEN = 4
# ── Classification ─────────────────────────────────────────────────────────────
def predict_topic(text: str) -> tuple[str, float]:
    """
    Classify a comment into a topic category.

    Rules are tried in a fixed order and the first one that fires wins:
    MCQ Answer, Spam (known substrings), URL (Promo or Spam), Spam
    (patterns), Promo (keywords), Appreciation, Question, Request/Feedback,
    and finally General.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    topic : str
        One of VALID_TOPICS.
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Fully keyword/regex-based, no ML models.
    - Keyword sets are matched against whole words with surrounding
      punctuation stripped, so "thanks!" and "pdf," count as "thanks" and
      "pdf".
    - Anything that doesn't match a rule falls back to "General".
    """
    if not text or not text.strip():
        return "General", 0.50

    t = text.strip().lower()
    # Replace ":emoji_name:" shortcodes with a space, then collapse whitespace.
    t_clean = re.sub(r":[a-z_]+:", " ", t).strip()
    t_clean = re.sub(r"\s+", " ", t_clean).strip()

    # ── MCQ Answer ──
    # One option letter, optionally repeated ("a", "bbb"), or several such
    # answers separated by "," or "/" ("a, c", "b/d").  With zero separators
    # the pattern also covers the plain single-letter case, so no separate
    # single-letter check is needed.
    if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean):
        return "MCQ Answer", 0.95

    # ── Spam: known spam substrings ──
    if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS):
        return "Spam", 0.90

    # ── Spam/Promo: URL present ──
    # A URL accompanied by a promo keyword is Promo; a bare URL is Spam.
    if re.search(r"https?://\S+", t_clean):
        if any(kw in t_clean for kw in _PROMO_KW):
            return "Promo", 0.85
        return "Spam", 0.85

    # ── Spam: repeated chars / very short non-text ──
    # NOTE(review): a comment made only of emoji shortcodes leaves t_clean
    # empty and is matched by the "0-3 non-letter characters" pattern, so it
    # is labelled Spam — confirm this is intended.
    for pat in _SPAM_PATTERNS[:-1]:
        if re.search(pat, t_clean):
            return "Spam", 0.85

    # ── Spam: gibberish ──
    # The last pattern is only trusted on longer comments.
    if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean):
        return "Spam", 0.82

    # ── Promo ──
    # Substring match on purpose: entries such as ".com" are not whole words.
    if any(kw in t_clean for kw in _PROMO_KW):
        return "Promo", 0.80

    # Too short for the keyword rules to be meaningful.
    if len(t_clean) < _MIN_FASTPATH_LEN:
        return "General", 0.55

    # Tokenise on whitespace and trim punctuation/underscores from both ends
    # of every token.  Without this, "amazing!!" or "pdf," would never equal
    # a keyword and the comment would fall through to "General".
    words = {
        re.sub(r"^[\W_]+|[\W_]+$", "", token) for token in t_clean.split()
    }
    has_question_mark = "?" in text
    question_hits = len(words & _QUESTION_KW)
    appreciation_hits = len(words & _APPRECIATION_KW)
    rf_hits = len(words & _RF_ALL_KW)
    # Request/Feedback phrase patterns are a strong signal and veto both
    # Appreciation and Question below.
    rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES)

    # ── Appreciation ──
    # One appreciation word is enough, provided there is no question or
    # request signal at all.
    if (
        appreciation_hits >= 1
        and question_hits == 0
        and not has_question_mark
        and rf_hits == 0
        and not rf_phrase_match
    ):
        return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92)

    # ── Question ──
    # A question may mention at most one request keyword; two or more means
    # it is treated as a request rather than a question.
    if (
        (has_question_mark or question_hits >= 1)
        and rf_hits < 2
        and not rf_phrase_match
    ):
        return "Question", min(0.75 + 0.04 * question_hits, 0.92)

    # ── Request/Feedback: phrase match ──
    if rf_phrase_match:
        return "Request/Feedback", 0.85

    # ── Request/Feedback: keyword hits ──
    # Short comments need two keywords; longer ones need only one.
    min_rf_hits = 1 if len(t_clean) >= 20 else 2
    if rf_hits >= min_rf_hits and question_hits == 0 and not has_question_mark:
        return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90)

    # ── Fallback ──
    return "General", 0.55
|