| |
| """ |
| ml/sentiment_model.py |
| ===================== |
| Pure keyword/rule-based sentiment classifier for YouTube live-chat comments. |
| No ML models are loaded β classification is entirely keyword/regex-based. |
| |
| Approach |
| -------- |
| 1. Emoji scoring β positive/negative emoji characters boost confidence |
| 2. Negation check β "nahi accha" flips Positive β Negative |
| 3. Intensifier boost β "bahut accha" raises confidence |
| 4. Keyword matching β expanded Hinglish + English + regional + typo variants |
| 5. Fallback β Neutral at 0.55 if nothing fires |
| |
| Public API |
| ---------- |
| predict_sentiment(text: str) -> tuple[str, float] |
| Returns (label, confidence) where label β {"Positive", "Neutral", "Negative"} |
| and confidence β [0.50, 0.95]. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import re |
|
|
| import emoji |
|
|
|
|
| |
| |
# Keyword fragments matched as substrings against emoji.demojize() names
# (e.g. ":thumbs_up:") in _emoji_score(); each hit adds +0.15.
_EMOJI_POS_KW = {
    "love", "fire", "happy", "laugh", "win", "cool", "best", "heart",
    "smile", "star", "clap", "pray", "sparkle", "sun", "rainbow",
    "thumbs_up", "raised_hands", "partying", "grinning", "beaming",
    "smiling", "joy", "hundred", "muscle", "trophy", "crown",
}
# Negative counterparts (-0.15 each); _emoji_score checks the positive set
# first, so a name matching both sets counts as positive.
_EMOJI_NEG_KW = {
    "angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit",
    "rage", "broken", "disappointed", "thumbs_down", "weary", "tired",
    "loudly_crying", "fearful", "anguished", "confounded", "persevere",
    "unamused", "expressionless", "nauseated", "sneezing",
}
|
|
|
|
def _emoji_score(text: str) -> float:
    """Score the emoji sentiment of *text*, clamped to [-0.4, 0.4].

    Each emoji character whose demojized name contains a fragment from
    _EMOJI_POS_KW adds 0.15; a fragment from _EMOJI_NEG_KW subtracts
    0.15. The positive set is checked first, so it wins on a double
    match. Non-emoji characters are skipped entirely.
    """
    total = 0.0
    for char in text:
        if not emoji.is_emoji(char):
            continue
        demojized = emoji.demojize(char)
        if any(kw in demojized for kw in _EMOJI_POS_KW):
            total += 0.15
        elif any(kw in demojized for kw in _EMOJI_NEG_KW):
            total -= 0.15
    # Clamp so a wall of emojis cannot dominate the keyword signal.
    return max(-0.4, min(total, 0.4))
|
|
|
|
| |
| |
| _NEGATION_WORDS: set[str] = { |
| |
| "nahi", "nhi", "nahin", "na", "mat", "naa", "nope", |
| "bilkul nahi", "kabhi nahi", "kabhi nhi", |
| |
| "not", "no", "never", "neither", "nor", "without", |
| "don't", "dont", "doesn't", "doesnt", "didn't", "didnt", |
| "can't", "cant", "won't", "wont", "isn't", "isnt", |
| "wasn't", "wasnt", "aren't", "arent", "weren't", "werent", |
| "hardly", "barely", "scarcely", |
| } |
|
|
| |
| _NEGATION_WINDOW = 3 |
|
|
|
|
| def _is_negated(word_list: list[str], sentiment_idx: int) -> bool: |
| """Return True if a negation word appears within _NEGATION_WINDOW words before OR after sentiment_idx. |
| |
| Handles both: |
| - pre-negation: "nahi accha tha" (negation before sentiment word) |
| - post-negation: "boring nahi tha" (negation after sentiment word) |
| """ |
| |
| start = max(0, sentiment_idx - _NEGATION_WINDOW) |
| before = word_list[start:sentiment_idx] |
| if any(w in _NEGATION_WORDS for w in before): |
| return True |
| |
| after = word_list[sentiment_idx + 1: sentiment_idx + 3] |
| return any(w in _NEGATION_WORDS for w in after) |
|
|
|
|
| |
| |
| _INTENSIFIERS: dict[str, float] = { |
| |
| "bahut": 0.10, |
| "bohot": 0.10, |
| "bht": 0.08, |
| "ekdum": 0.12, |
| "bilkul": 0.10, |
| "itna": 0.08, |
| "kitna": 0.06, |
| "zyada": 0.08, |
| "bohat": 0.10, |
| "atyant": 0.10, |
| "sampurn": 0.08, |
| |
| "very": 0.08, |
| "too": 0.08, |
| "so": 0.06, |
| "super": 0.10, |
| "ultra": 0.10, |
| "extremely": 0.12, |
| "absolutely": 0.12, |
| "totally": 0.10, |
| "really": 0.08, |
| "truly": 0.08, |
| "highly": 0.08, |
| "deeply": 0.08, |
| "insanely": 0.10, |
| "incredibly": 0.10, |
| "genuinely": 0.08, |
| } |
|
|
| _INTENSIFIER_WINDOW = 2 |
|
|
|
|
| def _intensifier_boost(word_list: list[str], sentiment_idx: int) -> float: |
| """Return confidence boost from intensifiers within _INTENSIFIER_WINDOW words before sentiment_idx.""" |
| start = max(0, sentiment_idx - _INTENSIFIER_WINDOW) |
| window = word_list[start:sentiment_idx] |
| boost = sum(_INTENSIFIERS.get(w, 0.0) for w in window) |
| return min(boost, 0.15) |
|
|
|
|
| |
# Positive keyword lexicon, matched token-by-token against whitespace-split
# normalised text in _classify. Duplicate literals ("acha", "ty", "josh",
# "proud") are harmless in a set literal.
# NOTE(review): multi-word entries ("vera level", "jai hind", "well done",
# "maza aa gaya", ...) can never match the per-token comparison in
# _classify — confirm whether phrase matching was intended.
# NOTE(review): entries with 3+ repeated characters ("fabbb", "superrr",
# "amazinggg", "bestttttt", ...) cannot survive _normalise's repeat
# collapse ("fabbb" -> "fabb") and so never match — verify.
_POS_WORDS: set[str] = {
    # Hinglish praise / thanks
    "mast", "jhakaas", "kadak", "zabardast", "kamaal", "bindaas",
    "shandar", "lajawaab", "lajawab", "lajaab", "waah", "wah",
    "dhansu", "badhiya", "badiya", "maja", "mazza", "maza",
    "acha", "accha", "achha", "acha", "sahi", "sach",
    "shukriya", "dhanyawad", "dhanyavaad", "meherbani", "shukran",
    "pyaar", "pyar", "khushi", "khush",
    "fatafat", "jaldi",

    # typo / abbreviation variants
    "osm", "awsm", "awsom", "awsome", "amzing", "amazng",
    "gr8", "grt", "gr9", "fab", "fabbb",
    "superrr", "amazinggg", "besttt", "niceee", "gooddd",
    "thku", "thnku", "thnkuu", "thnkyou", "thanku", "thankyou",
    "thnk", "thnq", "thnks", "thnx", "tysm", "tqsm", "thx", "ty",
    "ty", "tyvm", "tyvmm",

    # English praise
    "amazing", "awesome", "excellent", "wonderful", "fantastic",
    "brilliant", "outstanding", "exceptional", "magnificent",
    "superb", "perfect", "great", "good", "nice", "beautiful",
    "lovely", "loved", "love", "best", "better",
    "helpful", "useful", "informative", "fruitful", "motivating",
    "motivational", "inspiring", "inspired", "insightful",
    "clear", "clarity", "simple", "easy", "smooth",
    "thankful", "grateful", "blessed", "proud",
    "happy", "glad", "pleased", "satisfied", "content",
    "enjoy", "enjoyed", "enjoying", "fun", "interesting",
    "impressive", "impressed", "incredible", "unbelievable",
    "top", "topnotch", "firstclass", "worldclass",
    "recommend", "recommended", "worth", "worthy",
    "thanks", "thank", "appreciate", "appreciated",
    "respect", "salute", "legend", "goat", "king", "queen",
    "bestest", "bestttttt", "much", "op", "lit",

    # regional / slang
    "semma",
    "mass",
    "vera level",
    "sema",
    "bindass",
    "dum",
    "dhamakedaar",
    "dhamaka",
    "toofan",
    "jalwa",
    "josh",
    "full josh",
    "paisa vasool",
    "makkhan",
    "solid",
    "tight",
    "fire",
    "goated",
    "based",
    "valid",
    "clean",

    # elongated thanks variants
    "shukriyaa", "shukriyaaa", "dhanyawaad", "dhanyawaaad",
    "abhar",
    "aabhar",

    # exclamations / reactions
    "woww", "wowww", "woah", "whoa", "yay", "yayy",
    "haha", "hahaha", "lol", "lmao",
    "clap", "claps", "bravo", "chappal",
    "heart", "hearts",
    "100", "1000",

    # greetings / blessings
    "pranam", "pranaam", "namaskar", "namaste", "namasthe",
    "assalamualaikum", "walaikum", "walekum", "waalaikum",
    "jai hind", "jai ho", "jai shree ram", "jai mata di",
    "gm", "gn", "ge",
    "mubarak", "mubarakho",
    "atb",
    "god bless", "stay blessed", "stay safe",
    "welcome", "wlcm", "wlc",
    "congratulations", "congrats",
    "well done", "keep it up", "keep going",
    "proud", "proudly",
    "maza aa gaya", "maza aaya", "maja aa gaya",
    "khyal rakhna",
    "take care",
    "luck",
    "morning",
    "evening",

    # livestream-specific
    "teaching", "teacher", "best teacher", "best sir",
    "best pace", "best session", "best class", "best lecture",
    "chaliye", "chalo", "lets go", "lets gooo", "lets",
    "energy", "josh",
    "1st time", "first time",
    "aagye", "aa gye", "agye",
    "jai", "hind",
}
|
|
| |
# Negative keyword lexicon, matched token-by-token against whitespace-split
# normalised text in _classify. Duplicates ("bekar", "worst", "cheat",
# "pathetic") are harmless in a set literal.
# NOTE(review): multi-word entries ("lowkey bad", "not good", "time waste",
# "waste of time", ...) can never match the per-token comparison in
# _classify — confirm whether phrase matching was intended.
# NOTE(review): "never" is also in _NEGATION_WORDS, so it both scores
# negative and flips neighbouring sentiment words — verify that is wanted.
_NEG_WORDS: set[str] = {
    # Hinglish
    "bakwas", "bakwaas", "bakwaaas",
    "faltu", "faltuu",
    "bekar", "bekaar", "bekaaar",
    "ghatiya", "ghatiiya",
    "wahiyat", "wahiyaat",
    "bura", "buraa",
    "kharab", "kharaaab",
    "boring", "borring", "booring",
    "ullu", "pagal", "paagal",
    "besharam", "besharaam",
    "nafrat", "gussa", "naraaz",
    "dukh", "takleef", "mushkil",
    "uruttu", "battamizi", "battameezi",
    "natak", "nautanki",
    "dhoka", "dhokha", "jhooth", "jhoota",
    "dikhawa", "dikhaawa",
    "beizzati", "beizzatii", "bezaati",
    "sharam", "sharaam",
    "galat", "galt",
    "jhanjhat", "jhamela",
    "tang", "pareshan", "pareshaan",
    "nirasha", "niraash",
    "thaka", "thakaan",
    "dard", "peeda",
    "rona", "rota", "roti",
    "darr", "dar", "darna",
    "dara", "darti", "darte",
    "tension", "tensed",
    "scared", "nervous",
    "cheat", "cheating",
    "fraud", "fraudiya",
    "loot", "loota", "looting",

    # English
    "useless", "unfair", "disappointing", "disappointed",
    "foolish", "stupid", "idiot", "idiotic",
    "terrible", "horrible", "awful", "dreadful",
    "worst", "worse", "bad", "poor",
    "waste", "wasted", "pathetic",
    "annoying", "annoyed", "irritating", "irritated",
    "frustrating", "frustrated", "frustration",
    "confusing", "confused", "confusion",
    "misleading", "clickbait",
    "fake", "scam", "spam",
    "hate", "hated", "hating",
    "angry", "anger", "rage",
    "sad", "sadness", "unhappy", "upset",
    "wrong", "incorrect", "error", "mistake",
    "problem", "issue", "bug", "broken",
    "slow", "lagging", "lag", "buffering",
    "crash", "crashed", "crashing",
    "fail", "failed", "failure",
    "ignore", "ignored", "ignoring",
    "rude", "disrespect", "disrespectful",
    "unfair", "biased", "bias",
    "overpriced", "expensive", "costly",
    "wtf", "wth", "omg",
    "curse", "abusive",
    "liar", "lie", "lies",
    "cheat", "cheater",
    "regret", "regretted", "regrets",
    "never", "worst",

    # elongated typo variants
    # NOTE(review): _normalise collapses 3+ repeats, so "bekarrr",
    # "borinnng", "worstttt", "terribleee" never match post-normalisation.
    "bakwaaas", "bekarrr", "borinnng",
    "worstttt", "terribleee",

    # regional / slang
    "kabaad",
    "raddi",
    "kachra",
    "bekar",
    "nikamma",
    "nalayak",
    "kamina",
    "harami",
    "bewakoof",
    "gadha",
    "buddhu",
    "duffer",
    "flop",
    "disaster",
    "pathetic",
    "cringe",
    "cap",
    "mid",
    "trash",
    "garbage",
    "dogwater",
    "lowkey bad",
    "not good",
    "not helpful",
    "not worth",
    "time waste",
    "time wasted",
    "waste of time",
}
|
|
|
|
| |
|
|
| def _normalise(text: str) -> str: |
| """Lowercase, strip emoji codes, collapse repeated chars, collapse whitespace.""" |
| |
| t = re.sub(r":[a-z_]+:", " ", text) |
| t = t.lower() |
| |
| |
| t = re.sub(r"(.)\1{2,}", r"\1\1", t) |
| t = re.sub(r"\s+", " ", t).strip() |
| return t[:512] |
|
|
|
|
| |
|
|
def _classify(text: str) -> tuple[str, float]:
    """Classify normalised text via keyword matching with negation and
    intensifier handling.

    Returns (label, base_confidence) before emoji adjustment:
      - ("Neutral", 0.55) when no keyword fires or the text is too short;
      - ("Neutral", 0.58) on an exact positive/negative tie;
      - otherwise confidence = 0.72 + 0.05 * hit_count + intensifier
        boost, capped at 0.92 and rounded to 3 decimals.
    """
    t = _normalise(text)

    # Too short to carry sentiment (e.g. "ok", or emoji-code-only input
    # reduced to almost nothing by _normalise).
    if len(t) <= 2:
        return "Neutral", 0.55

    word_list = t.split()

    pos_score = 0.0
    neg_score = 0.0
    pos_boost = 0.0
    neg_boost = 0.0

    for idx, word in enumerate(word_list):
        is_pos = word in _POS_WORDS
        if not is_pos and word not in _NEG_WORDS:
            # Hoisted: only run the window scans for lexicon hits
            # (previously computed for every token, matched or not).
            continue
        negated = _is_negated(word_list, idx)
        int_boost = _intensifier_boost(word_list, idx)

        # Negation flips polarity: "nahi accha" is a negative hit,
        # "boring nahi tha" a positive one — hence the XOR.
        if is_pos != negated:
            pos_score += 1.0
            pos_boost = max(pos_boost, int_boost)
        else:
            neg_score += 1.0
            neg_boost = max(neg_boost, int_boost)

    # Nothing matched at all: low-confidence neutral fallback.
    if pos_score == 0 and neg_score == 0:
        return "Neutral", 0.55

    if pos_score > neg_score:
        base_conf = min(0.72 + 0.05 * pos_score + pos_boost, 0.92)
        return "Positive", round(base_conf, 3)

    if neg_score > pos_score:
        base_conf = min(0.72 + 0.05 * neg_score + neg_boost, 0.92)
        return "Negative", round(base_conf, 3)

    # Equal nonzero scores: mixed signal, slightly above the blank fallback.
    return "Neutral", 0.58
|
|
|
|
| |
|
|
def predict_sentiment(text: str) -> tuple[str, float]:
    """Classify a comment's sentiment.

    Parameters
    ----------
    text : str
        Raw comment text (may be Hinglish, emoji-containing, mixed
        script, or None).

    Returns
    -------
    label : str
        One of "Positive", "Neutral", "Negative".
    confidence : float
        Rule-based confidence in [0.50, 0.95].

    Notes
    -----
    - Deterministic: same input always produces the same output.
    - No ML models, no I/O, no side effects.
    - None and empty/whitespace-only strings return ("Neutral", 0.55).
    """
    # Guard: None, "", or whitespace-only input short-circuits.
    if not text or not text.strip():
        return "Neutral", 0.55

    label, base_conf = _classify(text)

    # Emoji signal shifts confidence only, never the label, and the
    # result stays clamped to [0.50, 0.95].
    # NOTE(review): the shift is applied regardless of label polarity —
    # positive emojis raise confidence even on a Negative label; confirm
    # that is intended.
    adjusted = base_conf + _emoji_score(text)
    return label, round(max(0.50, min(adjusted, 0.95)), 3)
|
|