Spaces:

Divyonko
/

LivePulse

Sleeping

LivePulse / ml /sentiment_model.py

DivYonko

Keyword improvements from new CSV analysis - 28/28 tests passing

67899d6 11 days ago

16.6 kB

	# -*- coding: utf-8 -*-
	"""
	ml/sentiment_model.py
	=====================
	Pure keyword/rule-based sentiment classifier for YouTube live-chat comments.
	No ML models are loaded — classification is entirely keyword/regex-based.

	Approach
	--------
	1. Emoji scoring — positive/negative emoji characters boost confidence
	2. Negation check — "nahi accha" flips Positive → Negative
	3. Intensifier boost — "bahut accha" raises confidence
	4. Keyword matching — expanded Hinglish + English + regional + typo variants
	5. Fallback — Neutral at 0.55 if nothing fires

	Public API
	----------
	predict_sentiment(text: str) -> tuple[str, float]
	Returns (label, confidence) where label ∈ {"Positive", "Neutral", "Negative"}
	and confidence ∈ [0.50, 0.95].
	"""

	from __future__ import annotations

	import re

	import emoji


	# ── Emoji scoring ──────────────────────────────────────────────────────────────
	# Positive and negative emoji keyword sets (matched against demojized names)
	# Keywords that mark an emoji as positive. `_emoji_score` tests each keyword
	# as a *substring* of the emoji's demojized name, so an entry also matches any
	# longer name that merely contains it.
	_EMOJI_POS_KW = {
	"love", "fire", "happy", "laugh", "win", "cool", "best", "heart",
	"smile", "star", "clap", "pray", "sparkle", "sun", "rainbow",
	"thumbs_up", "raised_hands", "partying", "grinning", "beaming",
	"smiling", "joy", "hundred", "muscle", "trophy", "crown",
	}
	# Keywords that mark an emoji as negative; matched the same substring way.
	_EMOJI_NEG_KW = {
	"angry", "sad", "cry", "worst", "bad", "hate", "skull", "vomit",
	"rage", "broken", "disappointed", "thumbs_down", "weary", "tired",
	"loudly_crying", "fearful", "anguished", "confounded", "persevere",
	"unamused", "expressionless", "nauseated", "sneezing",
	}


	def _emoji_score(text: str) -> float:
	    """Return an emoji-based sentiment score for *text*, clamped to [-0.4, 0.4].

	    Every character that the ``emoji`` package recognises as an emoji is
	    demojized to its name and the name is searched for the keywords in
	    ``_EMOJI_NEG_KW`` and ``_EMOJI_POS_KW`` (substring match). A negative hit
	    subtracts 0.15, a positive hit adds 0.15, and an emoji matching neither
	    set contributes nothing.

	    Negative keywords take precedence when a name matches both sets: a name
	    such as ``:broken_heart:`` contains the positive keyword "heart" but is
	    a negative emoji, and testing the positive set first scored it as +0.15.

	    Parameters
	    ----------
	    text : str
	        Raw comment text. It is scanned one code point at a time, so an emoji
	        made of several code points is evaluated per component.

	    Returns
	    -------
	    float
	        Sum of the per-emoji contributions, clamped to [-0.4, 0.4]; 0.0 when
	        no emoji matches a keyword.
	    """
	    score = 0.0
	    for ch in text:
	        if not emoji.is_emoji(ch):
	            continue
	        name = emoji.demojize(ch)
	        # Negative first so that names containing keywords from both sets
	        # (e.g. "broken" + "heart") are not counted as positive.
	        if any(kw in name for kw in _EMOJI_NEG_KW):
	            score -= 0.15
	        elif any(kw in name for kw in _EMOJI_POS_KW):
	            score += 0.15
	    return max(-0.4, min(score, 0.4))


	# ── Negation words ─────────────────────────────────────────────────────────────
	# These flip the sentiment of a keyword found shortly before or after them (see _is_negated).
	# Negation vocabulary consulted by `_is_negated`, which tests individual
	# whitespace-delimited tokens for membership. Multi-word entries such as
	# "bilkul nahi" therefore never match as a unit; their last word ("nahi",
	# "nhi") is also listed on its own and is what actually fires.
	_NEGATION_WORDS: set[str] = {
	# Hindi / Hinglish
	"nahi", "nhi", "nahin", "na", "mat", "naa", "nope",
	"bilkul nahi", "kabhi nahi", "kabhi nhi",
	# English
	"not", "no", "never", "neither", "nor", "without",
	"don't", "dont", "doesn't", "doesnt", "didn't", "didnt",
	"can't", "cant", "won't", "wont", "isn't", "isnt",
	"wasn't", "wasnt", "aren't", "arent", "weren't", "werent",
	"hardly", "barely", "scarcely",
	}

	# Window size: how many words *before* a sentiment word `_is_negated` checks
	# for negation. The look-ahead after the word is a separate, fixed 2 words.
	_NEGATION_WINDOW = 3


	def _is_negated(word_list: list[str], sentiment_idx: int) -> bool:
	    """Tell whether the word at *sentiment_idx* has a negation word close by.

	    Two neighbourhoods are inspected:

	    - up to ``_NEGATION_WINDOW`` words before the sentiment word
	      (pre-negation, "nahi accha tha");
	    - the 2 words right after it (post-negation, "boring nahi tha").

	    Returns True as soon as any neighbour is a member of ``_NEGATION_WORDS``,
	    otherwise False.
	    """
	    lookback_from = max(0, sentiment_idx - _NEGATION_WINDOW)
	    preceding = word_list[lookback_from:sentiment_idx]
	    # The look-ahead is deliberately shorter than the look-back: 2 words.
	    following = word_list[sentiment_idx + 1: sentiment_idx + 3]

	    for neighbour in (*preceding, *following):
	        if neighbour in _NEGATION_WORDS:
	            return True
	    return False


	# ── Intensifier words ──────────────────────────────────────────────────────────
	# These amplify the confidence when they appear within _INTENSIFIER_WINDOW words before a sentiment word.
	# Intensifier token -> confidence boost. `_intensifier_boost` sums the values
	# of the intensifiers found in its look-back window and caps the sum at 0.15.
	_INTENSIFIERS: dict[str, float] = {
	# Hindi / Hinglish
	"bahut": 0.10, # very
	"bohot": 0.10,
	"bht": 0.08,
	"ekdum": 0.12, # absolutely
	"bilkul": 0.10, # completely
	"itna": 0.08, # this much
	"kitna": 0.06,
	"zyada": 0.08, # more/too much
	"bohat": 0.10,
	"atyant": 0.10, # extremely (formal Hindi)
	"sampurn": 0.08, # completely
	# English
	"very": 0.08,
	"too": 0.08,
	"so": 0.06,
	"super": 0.10,
	"ultra": 0.10,
	"extremely": 0.12,
	"absolutely": 0.12,
	"totally": 0.10,
	"really": 0.08,
	"truly": 0.08,
	"highly": 0.08,
	"deeply": 0.08,
	"insanely": 0.10,
	"incredibly": 0.10,
	"genuinely": 0.08,
	}

	# Number of words before a sentiment word that are searched for intensifiers.
	_INTENSIFIER_WINDOW = 2


	def _intensifier_boost(word_list: list[str], sentiment_idx: int) -> float:
	    """Return the confidence boost earned by intensifiers preceding a word.

	    The ``_INTENSIFIER_WINDOW`` words immediately before *sentiment_idx* are
	    looked up in ``_INTENSIFIERS``; their boosts are added together and the
	    total is capped at 0.15.
	    """
	    first = max(0, sentiment_idx - _INTENSIFIER_WINDOW)
	    total = 0
	    for candidate in word_list[first:sentiment_idx]:
	        total += _INTENSIFIERS.get(candidate, 0.0)
	    # The cap applies to the summed boost of the whole window.
	    return total if total < 0.15 else 0.15


	# ── Positive keyword set ───────────────────────────────────────────────────────
	# Positive keyword vocabulary, looked up per whitespace-delimited token of the
	# normalised text.
	#
	# - `_normalise` collapses runs of 3+ identical characters to 2 before lookup,
	#   so text typed as "amazinggg" arrives as "amazingg". Elongated entries are
	#   therefore listed in that normalised (double-letter) form as well; the
	#   original triple-letter spellings are kept for backward compatibility.
	# - Multi-word entries ("jai hind", "well done", ...) are never equal to a
	#   single token; they only take effect where their component words are
	#   listed individually (e.g. "jai", "hind").
	_POS_WORDS: set[str] = {
	    # ── Core Hinglish slang ──
	    "mast", "jhakaas", "kadak", "zabardast", "kamaal", "bindaas",
	    "shandar", "lajawaab", "lajawab", "lajaab", "waah", "wah",
	    "dhansu", "badhiya", "badiya", "maja", "mazza", "maza",
	    "acha", "accha", "achha", "sahi", "sach",
	    "shukriya", "dhanyawad", "dhanyavaad", "meherbani", "shukran",
	    "pyaar", "pyar", "khushi", "khush",
	    "fatafat", "jaldi",

	    # ── Typo / abbreviation variants ──
	    "osm", "awsm", "awsom", "awsome", "amzing", "amazng",
	    "gr8", "grt", "gr9", "fab", "fabbb",
	    "superrr", "amazinggg", "besttt", "niceee", "gooddd",
	    # Normalised forms of the elongated spellings above (what the lookup
	    # actually sees after repeated characters are collapsed to two).
	    "fabb", "superr", "amazingg", "bestt", "nicee", "goodd",
	    "thku", "thnku", "thnkuu", "thnkyou", "thanku", "thankyou",
	    "thnk", "thnq", "thnks", "thnx", "tysm", "tqsm", "thx", "ty",
	    "tyvm", "tyvmm",

	    # ── English positive ──
	    "amazing", "awesome", "excellent", "wonderful", "fantastic",
	    "brilliant", "outstanding", "exceptional", "magnificent",
	    "superb", "perfect", "great", "good", "nice", "beautiful",
	    "lovely", "loved", "love", "best", "better",
	    "helpful", "useful", "informative", "fruitful", "motivating",
	    "motivational", "inspiring", "inspired", "insightful",
	    "clear", "clarity", "simple", "easy", "smooth",
	    "thankful", "grateful", "blessed", "proud",
	    "happy", "glad", "pleased", "satisfied", "content",
	    "enjoy", "enjoyed", "enjoying", "fun", "interesting",
	    "impressive", "impressed", "incredible", "unbelievable",
	    "top", "topnotch", "firstclass", "worldclass",
	    "recommend", "recommended", "worth", "worthy",
	    "thanks", "thank", "appreciate", "appreciated",
	    "respect", "salute", "legend", "goat", "king", "queen",
	    "bestest", "bestttttt", "much", "op", "lit",

	    # ── Regional / South Indian Hinglish ──
	    "semma",  # Tamil slang for awesome
	    "mass",  # Tamil/Telugu slang for impressive
	    "vera level",  # Tamil slang for next level
	    "sema",  # variant of semma
	    "bindass",  # variant of bindaas
	    "dum",  # strength/power (positive context)
	    "dhamakedaar",  # explosive/amazing
	    "dhamaka",  # blast/amazing
	    "toofan",  # storm (used positively)
	    "jalwa",  # aura/presence (positive)
	    "josh",  # enthusiasm/energy
	    "full josh",
	    "paisa vasool",  # worth the money
	    "makkhan",  # butter smooth (positive)
	    "solid",  # solid/strong (positive)
	    "tight",  # tight/solid (positive slang)
	    "fire",  # fire (positive slang)
	    "goated",  # GOAT-ed (positive slang)
	    "based",  # based (positive slang)
	    "valid",  # valid (positive slang)
	    "clean",  # clean explanation

	    # ── Gratitude phrases (single tokens after normalization) ──
	    "shukriyaa", "shukriyaaa", "dhanyawaad", "dhanyawaaad",
	    "abhar",  # gratitude (formal Hindi)
	    "aabhar",

	    # ── Common live chat positives ──
	    "woww", "wowww", "woah", "whoa", "yay", "yayy",
	    "haha", "hahaha", "lol", "lmao",  # laughter = positive
	    # NOTE(review): "chappal" (slipper) is an unusual positive — confirm.
	    "clap", "claps", "bravo", "chappal",
	    "heart", "hearts",
	    "100", "1000",

	    # ── Greetings / blessings (common in Indian live chats) ──
	    "pranam", "pranaam", "namaskar", "namaste", "namasthe",
	    "assalamualaikum", "walaikum", "walekum", "waalaikum",
	    "jai hind", "jai ho", "jai shree ram", "jai mata di",
	    "gm", "gn", "ge",
	    "mubarak", "mubarakho",
	    "atb",
	    "god bless", "stay blessed", "stay safe",
	    "welcome", "wlcm", "wlc",
	    "congratulations", "congrats",
	    "well done", "keep it up", "keep going",
	    "proudly",
	    "maza aa gaya", "maza aaya", "maja aa gaya",
	    "khyal rakhna",
	    "take care",
	    "luck",
	    "morning",
	    "evening",

	    # ── Teaching/session appreciation ──
	    "teaching", "teacher", "best teacher", "best sir",
	    "best pace", "best session", "best class", "best lecture",
	    "chaliye", "chalo", "lets go", "lets gooo", "lets",  # enthusiasm
	    "energy",
	    "1st time", "first time",
	    "aagye", "aa gye", "agye",
	    "jai", "hind",  # "jai hind" splits into two tokens
	}

	# ── Negative keyword set ───────────────────────────────────────────────────────
	# Negative keyword vocabulary, looked up per whitespace-delimited token of the
	# normalised text.
	#
	# - `_normalise` collapses runs of 3+ identical characters to 2 before lookup,
	#   so text typed as "worstttt" arrives as "worstt". Elongated entries are
	#   therefore listed in that normalised (double-letter) form as well; the
	#   original spellings are kept for backward compatibility.
	# - Multi-word entries ("waste of time", "not good", ...) are never equal to a
	#   single token; the corresponding comments are still caught through their
	#   individual words ("waste", negated "good", ...).
	_NEG_WORDS: set[str] = {
	    # ── Core Hinglish slang ──
	    "bakwas", "bakwaas", "bakwaaas",
	    "faltu", "faltuu",
	    "bekar", "bekaar", "bekaaar",
	    "ghatiya", "ghatiiya",
	    "wahiyat", "wahiyaat",
	    "bura", "buraa",
	    "kharab", "kharaab", "kharaaab",
	    "boring", "borring", "booring",
	    "ullu", "pagal", "paagal",
	    "besharam", "besharaam",
	    "nafrat", "gussa", "naraaz",
	    "dukh", "takleef", "mushkil",
	    "uruttu", "battamizi", "battameezi",
	    "natak", "nautanki",
	    "dhoka", "dhokha", "jhooth", "jhoota",
	    "dikhawa", "dikhaawa",
	    "beizzati", "beizzatii", "bezaati",
	    "sharam", "sharaam",
	    "galat", "galt",
	    "jhanjhat", "jhamela",
	    "tang", "pareshan", "pareshaan",
	    "nirasha", "niraash",  # disappointment
	    "thaka", "thakaan",  # tired/exhausted
	    "dard", "peeda",  # pain
	    "rona", "rota", "roti",  # crying
	    "darr", "dar", "darna",  # fear/anxiety
	    "dara", "darti", "darte",  # fear variants
	    "tension", "tensed",  # stress
	    "scared", "nervous",  # fear in English
	    "cheat", "cheating",
	    "fraud", "fraudiya",
	    "loot", "loota", "looting",

	    # ── English negative ──
	    "useless", "unfair", "disappointing", "disappointed",
	    "foolish", "stupid", "idiot", "idiotic",
	    "terrible", "horrible", "awful", "dreadful",
	    "worst", "worse", "bad", "poor",
	    "waste", "wasted", "pathetic",
	    "annoying", "annoyed", "irritating", "irritated",
	    "frustrating", "frustrated", "frustration",
	    "confusing", "confused", "confusion",
	    "misleading", "clickbait",
	    "fake", "scam", "spam",
	    "hate", "hated", "hating",
	    "angry", "anger", "rage",
	    "sad", "sadness", "unhappy", "upset",
	    "wrong", "incorrect", "error", "mistake",
	    "problem", "issue", "bug", "broken",
	    "slow", "lagging", "lag", "buffering",
	    "crash", "crashed", "crashing",
	    "fail", "failed", "failure",
	    "ignore", "ignored", "ignoring",
	    "rude", "disrespect", "disrespectful",
	    "biased", "bias",
	    "overpriced", "expensive", "costly",
	    "wtf", "wth", "omg",  # context-dependent but often negative in complaints
	    "curse", "abusive",
	    "liar", "lie", "lies",
	    "cheater",
	    "regret", "regretted", "regrets",
	    # NOTE(review): "never" is also a member of _NEGATION_WORDS, so it both
	    # counts as a negative hit and flips its neighbours — confirm intended.
	    "never",

	    # ── Typo / abbreviation variants ──
	    "bekarrr", "borinnng",
	    "worstttt", "terribleee",
	    # Normalised forms of the elongated spellings above (what the lookup
	    # actually sees after repeated characters are collapsed to two).
	    "bekarr", "borinng", "worstt", "terriblee",

	    # ── Regional / South Indian Hinglish ──
	    "kabaad",  # junk/trash
	    "raddi",  # waste/junk
	    "kachra",  # garbage
	    "nikamma",  # good-for-nothing
	    "nalayak",  # incompetent
	    "kamina",  # scoundrel
	    "harami",  # offensive negative
	    "bewakoof",  # fool
	    "gadha",  # donkey (fool)
	    "buddhu",  # fool
	    "duffer",  # dull/stupid
	    "flop",  # flop/failure
	    "disaster",  # disaster
	    "cringe",  # cringe
	    "cap",  # cap = lie (slang)
	    "mid",  # mid = mediocre/bad (slang)
	    "trash",  # trash
	    "garbage",  # garbage
	    "dogwater",  # very bad (gaming slang)
	    "lowkey bad",
	    "not good",
	    "not helpful",
	    "not worth",
	    "time waste",
	    "time wasted",
	    "waste of time",
	}


	# ── Text normalisation ─────────────────────────────────────────────────────────

	def _normalise(text: str) -> str:
	    """Prepare raw comment text for keyword lookup.

	    Steps, in order: replace demojized emoji codes (``:fire:``,
	    ``:thumbs_up:``) with a space, lowercase, squeeze any run of 3+ identical
	    characters down to 2 ("amazinggg" -> "amazingg", so "woww" still matches
	    the "woww" keyword), collapse whitespace runs to a single space and trim
	    the ends. The result is truncated to 512 characters.
	    """
	    without_codes = re.sub(r":[a-z_]+:", " ", text)
	    squeezed = re.sub(r"(.)\1{2,}", r"\1\1", without_codes.lower())
	    single_spaced = re.sub(r"\s+", " ", squeezed)
	    return single_spaced.strip()[:512]


	# ── Core classification ────────────────────────────────────────────────────────

	# ASCII punctuation removed from both ends of every token before lookup, so
	# "good!" or "(boring)." still hit the keyword sets. Only the edges are
	# stripped: the apostrophe inside "don't" survives and keeps matching.
	_TOKEN_EDGE_PUNCT = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"


	def _classify(text: str) -> tuple[str, float]:
	    """
	    Classify text by keyword matching with negation and intensifier handling.

	    The text is normalised, split on whitespace, and each token has its
	    leading/trailing ASCII punctuation stripped. Every token found in
	    ``_POS_WORDS`` or ``_NEG_WORDS`` counts as one hit for its polarity; a
	    nearby negation word (see ``_is_negated``) flips the hit to the opposite
	    polarity. The side with more hits wins.

	    Parameters
	    ----------
	    text : str
	        Raw comment text; it is passed through ``_normalise`` here.

	    Returns
	    -------
	    (label, base_confidence) before any emoji adjustment:

	    - ``("Neutral", 0.55)`` when the normalised text is at most 2 characters
	      long or no keyword fires;
	    - ``("Neutral", 0.58)`` when positive and negative hits tie;
	    - otherwise the winning label with confidence
	      ``0.72 + 0.05 * hits + best intensifier boost``, capped at 0.92 and
	      rounded to 3 decimals.
	    """
	    t = _normalise(text)

	    if len(t) <= 2:
	        return "Neutral", 0.55

	    # One slot per original token (a punctuation-only token becomes ""), so
	    # the negation/intensifier window distances are unaffected by stripping.
	    word_list = [tok.strip(_TOKEN_EDGE_PUNCT) for tok in t.split()]

	    pos_score = 0.0
	    neg_score = 0.0
	    pos_boost = 0.0
	    neg_boost = 0.0

	    for idx, word in enumerate(word_list):
	        is_positive = word in _POS_WORDS
	        if not is_positive and word not in _NEG_WORDS:
	            continue  # not a sentiment keyword

	        # Context is only needed for keyword hits.
	        if _is_negated(word_list, idx):
	            is_positive = not is_positive
	        int_boost = _intensifier_boost(word_list, idx)

	        if is_positive:
	            pos_score += 1.0
	            pos_boost = max(pos_boost, int_boost)
	        else:
	            neg_score += 1.0
	            neg_boost = max(neg_boost, int_boost)

	    # No keyword hits → Neutral
	    if pos_score == 0 and neg_score == 0:
	        return "Neutral", 0.55

	    # Determine winner
	    if pos_score > neg_score:
	        base_conf = min(0.72 + 0.05 * pos_score + pos_boost, 0.92)
	        return "Positive", round(base_conf, 3)

	    if neg_score > pos_score:
	        base_conf = min(0.72 + 0.05 * neg_score + neg_boost, 0.92)
	        return "Negative", round(base_conf, 3)

	    # Tie → Neutral with moderate confidence
	    return "Neutral", 0.58


	# ── Public API ─────────────────────────────────────────────────────────────────

	def predict_sentiment(text: str) -> tuple[str, float]:
	    """
	    Classify a comment's sentiment.

	    Parameters
	    ----------
	    text : str
	        Raw comment text (may be Hinglish, emoji-containing, mixed script, or
	        None). Any non-string value is treated like an empty comment.

	    Returns
	    -------
	    label : str
	        One of "Positive", "Neutral", "Negative". The label comes from the
	        keyword classifier alone; emoji never change it.
	    confidence : float
	        Rule-based confidence in [0.50, 0.95], rounded to 3 decimals.

	    Notes
	    -----
	    - Deterministic: same input always produces the same output.
	    - No ML models, no I/O, no side effects.
	    - None, non-string values and empty/whitespace-only strings return
	      ("Neutral", 0.55).
	    - Emoji raise the confidence only when they agree with the label:
	      positive emoji strengthen "Positive", negative emoji strengthen
	      "Negative", and emoji pointing the other way weaken it. For "Neutral",
	      emoji of either polarity lower the confidence.
	    """
	    # isinstance also covers None and truthy non-strings (e.g. a float NaN),
	    # which would otherwise fail on .strip().
	    if not isinstance(text, str) or not text.strip():
	        return "Neutral", 0.55

	    label, conf = _classify(text)

	    # Signed emoji score from the original text: > 0 positive, < 0 negative.
	    emoji_adj = _emoji_score(text)
	    if label == "Positive":
	        conf += emoji_adj
	    elif label == "Negative":
	        # A negative score agrees with the label, so it must add confidence.
	        conf -= emoji_adj
	    else:
	        # Any emoji sentiment is evidence against a Neutral verdict.
	        conf -= abs(emoji_adj)

	    conf = round(max(0.50, min(conf, 0.95)), 3)
	    return label, conf