# -*- coding: utf-8 -*- """ ml/topic_model.py ================= Pure keyword/rule-based topic classifier for YouTube live-chat comments. No ML models are loaded — classification is entirely keyword/regex-based. Topics ------ Appreciation — praise, thanks, love, encouragement Question — direct questions and doubts/confusion Request/Feedback — content requests, faculty requests, feedback, suggestions Promo — self-promotion, links, "check my channel" Spam — repeated noise, irrelevant flood, gibberish MCQ Answer — single letter answers (a/b/c/d/e) General — anything that doesn't fit the above (fallback) """ from __future__ import annotations import re # ── Valid topics ─────────────────────────────────────────────────────────────── VALID_TOPICS = {"Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"} # ── Keyword fast-path ────────────────────────────────────────────────────────── _APPRECIATION_KW = { "love", "thanks", "thank", "superb", "amazing", "excellent", "awesome", "wonderful", "brilliant", "fantastic", "best", "perfect", "mast", "zabardast", "kamaal", "jhakaas", "shandar", "lajawaab", "lajawab", "waah", "wah", "badhiya", "shukriya", "dhanyawad", "osm", "awsm", "dhansu", "pyaar", "bindaas", "khush", "happy", "thankyou", "thanku", "thnk", "thnq", "thnks", "thnx", "thnku", "tysm", "tqsm", "thx", "informative", "fruitful", "motivating", "lovely", "bestest", "loved", "nice", "helpful", "semma", "mass", "solid", "fire", "goated", } _QUESTION_KW = { "kya", "kab", "kb", "kahan", "kaun", "kon", "kitna", "kitne", "konsa", "konsi", "kaise", "kyun", "kyunki", "what", "when", "where", "who", "which", "how", "why", "bata", "batao", "bataye", "tell", "explain", "samajh", "confused", "confusion", "doubt", "unclear", "matlab", "matalab", "samjha", "samjhe", "samjhi", "smjh", "smjha", } # Content requests — asking for new videos, topics, sessions _RF_CONTENT_REQUEST_KW = { "banao", "banana", "banaye", "banaiye", "banado", "karo", "kariye", "karaiye", "kardo", "lao", "laiye", "layiye", "start", "shuru", "launch", "resume", "video", "series", # removed "class" and "session" — too generic "separate", "alag", "akele", "single", "cover", "include", "add", "topic", "chahiye", "chahte", "chahta", "chahti", "request", "requesting", } # Academic/resource requests — PDFs, notes, downloads _RF_ACADEMIC_KW = { "pdf", "notes", "note", "download", "upload", "drive", "google", "link", "material", "resource", "timeline", "schedule", "timetable", "syllabus", "infographic", "slides", "ppt", "handout", "provide", "share", "send", "dedo", "dedijiye", "milega", "milegi", "milenge", # "where to find" — specific to resource queries } # Language requests _RF_LANGUAGE_KW = { "hindi", "english", "medium", "language", "translate", "translation", } # Feedback/suggestion keywords _RF_FEEDBACK_KW = { "side", "screen", "dikhta", "dikhai", "correction", "correct", "galat", "wrong", "mistake", "suggestion", "suggest", "improve", "better", "feedback", "review", "opinion", "sorry", "maafi", "apology", "please", "plz", "pls", "plss", "plzz", "dijiye", "dijie", "dena", "dedo", "chahiye", "zaroorat", "need", } # Product/app feature requests _RF_PRODUCT_KW = { "app", "feature", "option", "button", "setting", "notification", "reminder", "alert", "website", "portal", "platform", } # Combined RF keyword set _RF_ALL_KW = ( _RF_CONTENT_REQUEST_KW | _RF_ACADEMIC_KW | _RF_LANGUAGE_KW | _RF_FEEDBACK_KW | _RF_PRODUCT_KW ) # Phrases that strongly indicate Request/Feedback (multi-word) _RF_PHRASES = [ r"\bplease\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b", r"\bpls\s+\w+\s+(karo|kijiye|dijiye|banao|lao|upload|provide|start)\b", r"\bsir\s+(please|pls|plz)\b", r"\b(pdf|notes?|material)\s+(upload|provide|share|send|dedo|dijiye)\b", r"\b(separate|alag|akele)\s+(video|session|class|lecture)\b", r"\b(hindi|english)\s+(medium|mein|me|pdf|notes?)\b", r"\b(side|screen)\s+(ho|hojao|hojaye|jaiye)\b", r"\b(correction|galat|wrong)\s+\w+\b", r"\brequest\s+(hai|he|h|kar|karna)\b", r"\b(chahiye|chahte|chahta|chahti)\s+\w+\b", ] _SPAM_PATTERNS = [ r"^(.)\1{3,}$", r"^[^a-zA-Z\u0900-\u097F]{0,3}$", r"https?://\S+", r"_{4,}", r"(?:\b[a-z0-9]{6,}\b\s*){6,}", # raised from 3 to 6 — avoids catching real sentences ] _SPAM_KW_SUBSTRINGS = { "onelink", "zazb", "gatewallah_official", "pwappweb", "kuldeepsir_pw", } _PROMO_KW = { "subscribe", "channel", "link", "instagram", "check", "visit", "click", "http", "www", ".com", "telegram", "https", } _MIN_FASTPATH_LEN = 4 # ── Classification ───────────────────────────────────────────────────────────── def predict_topic(text: str) -> tuple[str, float]: """ Classify a comment into a topic category. Parameters ---------- text : str Raw comment text. Returns ------- topic : str One of VALID_TOPICS. confidence : float Rule-based confidence in [0.50, 0.95]. Notes ----- - Fully keyword/regex-based, no ML models. - Anything that doesn't match a keyword falls back to "General". """ if not text or not text.strip(): return "General", 0.50 t = text.strip().lower() t_clean = re.sub(r":[a-z_]+:", " ", t).strip() t_clean = re.sub(r"\s+", " ", t_clean).strip() # ── MCQ Answer: single letter or repeated letter(s) ── if re.fullmatch(r"[a-e]", t_clean) or re.fullmatch(r"([a-e])\1*", t_clean): return "MCQ Answer", 0.95 if re.fullmatch(r"([a-e])\1*(\s*[,/]\s*([a-e])\3*)*", t_clean): return "MCQ Answer", 0.95 # ── Spam: known spam substrings ── if any(kw in t_clean for kw in _SPAM_KW_SUBSTRINGS): return "Spam", 0.90 # ── Spam/Promo: URL present ── if re.search(r"https?://\S+", t_clean): if any(kw in t_clean for kw in _PROMO_KW): return "Promo", 0.85 return "Spam", 0.85 # ── Spam: repeated chars / gibberish ── for pat in _SPAM_PATTERNS[:-1]: if re.search(pat, t_clean): return "Spam", 0.85 if len(t_clean) > 20 and re.search(_SPAM_PATTERNS[-1], t_clean): return "Spam", 0.82 # ── Promo ── if any(kw in t_clean for kw in _PROMO_KW): return "Promo", 0.80 if len(t_clean) < _MIN_FASTPATH_LEN: return "General", 0.55 words = set(t_clean.split()) has_question_mark = "?" in text question_hits = len(words & _QUESTION_KW) appreciation_hits = len(words & _APPRECIATION_KW) rf_hits = len(words & _RF_ALL_KW) # Check Request/Feedback phrase patterns (strong signal) rf_phrase_match = any(re.search(p, t_clean) for p in _RF_PHRASES) # ── Appreciation ── # Single strong appreciation word is enough regardless of length min_appr_hits = 1 if (appreciation_hits >= min_appr_hits and question_hits == 0 and not has_question_mark and rf_hits == 0 and not rf_phrase_match): return "Appreciation", min(0.72 + 0.05 * appreciation_hits, 0.92) # ── Question ── if (has_question_mark or question_hits >= 1) and rf_hits < 2 and not rf_phrase_match: return "Question", min(0.75 + 0.04 * question_hits, 0.92) # ── Request/Feedback: phrase match ── if rf_phrase_match: return "Request/Feedback", 0.85 # ── Request/Feedback: keyword hits ── min_rf_hits = 1 if len(t_clean) >= 20 else 2 if rf_hits >= min_rf_hits and question_hits == 0 and not has_question_mark: return "Request/Feedback", min(0.72 + 0.04 * rf_hits, 0.90) # ── Fallback ── return "General", 0.55