| import ftfy |
| import pandas as pd |
| from tqdm.auto import tqdm |
| import torch |
| import re |
| import emoji |
| import html |
| import gradio as gr |
|
|
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| import os |
|
|
| |
| |
| |
# Mapping from model output index to human-readable emotion label.
# Must stay in sync with the label order the model was trained with.
ID2LABEL = dict(
    enumerate(("Happiness", "Fear", "Anger", "Sadness", "Neutral"))
)
|
|
| |
| |
| |
# Load the Malay SMS abbreviation lexicon (columns: Abbreviation, Original)
# and build a whole-word regex that matches any known abbreviation.
abbr_file = "malay_sms_abbreviations.csv"
abbr_df = pd.read_csv(abbr_file)

# Normalize both columns so lookups are case-insensitive and whitespace-safe.
abbr_df['Abbreviation'] = abbr_df['Abbreviation'].astype(str).str.lower().str.strip()
abbr_df['Original'] = abbr_df['Original'].astype(str).str.lower().str.strip()

# abbreviation -> full form lookup table
ABBR_MAP = dict(zip(abbr_df['Abbreviation'], abbr_df['Original']))

# FIX: use \b (regex word boundary), not the doubled r'\\b' which matches a
# literal backslash followed by 'b' and therefore never anchored on word
# edges. Keys are re.escape'd in case an abbreviation contains regex
# metacharacters (e.g. '.', '?').
ABBR_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, ABBR_MAP.keys())) + r')\b'
)
|
|
def expand_malay_abbreviations(text: str) -> str:
    """Expand known Malay SMS abbreviations to their full written forms.

    Matches whole words only; the input is expected to be lowercased
    already, since ABBR_MAP keys are normalized to lowercase.
    """
    def _expand(match):
        # match.group(0) is guaranteed to be a key of ABBR_MAP because the
        # pattern's alternatives are exactly the map's keys.
        return ABBR_MAP[match.group(0)]

    return ABBR_PATTERN.sub(_expand, text)
|
|
| |
| |
| |
|
|
def fix_mojibake(text):
    """Repair common mojibake (mis-decoded UTF-8) sequences in *text*.

    Non-string inputs are returned untouched. Known bad sequences are
    patched first, then a cp1252 -> utf-8 round-trip is attempted for the
    classic "UTF-8 bytes read as cp1252" corruption, and finally ftfy
    takes a general cleanup pass.
    """
    if not isinstance(text, str):
        return text

    # NOTE: order matters — '鈥' is a prefix of the longer sequences above
    # it, so it must be replaced last.
    known_fixes = (
        ('馃槴', '😭'),
        ('馃槶', '😫'),
        ('馃 didn\'t', '😭'),
        ('鈥檓', "'m"),
        ('鈥橲', "'s"),
        ('鈥檚', "'s"),
        ('鈥', "'"),
        ('ðÿ˜', '😭'),
        ('ðÿ', '😭'),
    )
    for broken, fixed in known_fixes:
        text = text.replace(broken, fixed)

    # Undo "UTF-8 decoded as cp1252" corruption. If the text is not
    # representable in cp1252 (e.g. it already contains real emoji) the
    # encode raises and the text is kept as-is.
    try:
        text = text.encode('cp1252').decode('utf-8')
    except Exception:
        pass

    return ftfy.fix_text(text)
|
|
| |
| |
| |
|
|
def preprocess_text(text: str) -> str:
    """Normalize raw (possibly Malay–English code-switched) text for the model.

    Pipeline: repair mojibake, unescape HTML entities, lowercase, expand
    Malay SMS abbreviations, convert emoji to ':name:' tokens, strip URLs,
    @mentions and HTML tags, keep hashtag words, squash characters repeated
    more than 3 times down to 3, and collapse whitespace.
    """
    text = str(text)

    text = fix_mojibake(text)
    text = html.unescape(text)
    text = text.lower()
    text = expand_malay_abbreviations(text)
    text = emoji.demojize(text)

    # FIX: the escapes below were doubled inside raw strings (r"\\S"
    # matches a literal backslash then 'S'), so none of these cleanup
    # patterns ever matched real input.
    text = re.sub(r"http\S+|www\S+", "", text)    # URLs
    text = re.sub(r"@[A-Za-z0-9_.]+", "", text)   # @mentions
    text = re.sub(r"<.*?>", "", text)             # HTML tags
    text = re.sub(r"#(\w+)", r"\1", text)         # '#tag' -> 'tag'
    text = re.sub(r"(.)\1{3,}", r"\1\1\1", text)  # 'soooooo' -> 'sooo'
    text = re.sub(r"\s+", " ", text).strip()      # collapse whitespace

    return text
|
|
| |
| |
| |
|
|
class EmotionClassifier:
    """Wraps a fine-tuned HF sequence-classification model for emotion prediction."""

    def __init__(self, model_name="codeswitch-emotion/emotion-v4"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Optional HF auth token for private models; when unset, the
        # Hub libraries fall back to locally cached `huggingface-cli
        # login` credentials.
        token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)

        print(f"Loading model: {model_name}")
        if token:
            print("Using token from HUGGINGFACE_HUB_TOKEN")
        else:
            print("No token in environment; using local HF CLI credentials if available")

        # FIX: `use_auth_token` is deprecated (removed in transformers v5);
        # the supported keyword is `token`.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, token=token)
        self.model.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Classify *text*.

        Returns a tuple of (emotion label, confidence rounded to 4 decimal
        places, the preprocessed text that was actually fed to the model).
        """
        processed_text = preprocess_text(text)

        inputs = self.tokenizer(
            processed_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        # Move every input tensor to the same device as the model.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Inference only — no gradient tracking needed.
        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = torch.softmax(logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        confidence = probs[0, pred_id].item()

        return (
            ID2LABEL[pred_id],
            round(confidence, 4),
            processed_text,
        )
|
|
|
|
| |
| |
| |
|
|
# Single module-level classifier instance shared by all Gradio requests.
classifier = EmotionClassifier()


def gradio_predict(text):
    """Gradio callback: classify *text* and package the result for gr.JSON."""
    label, confidence, processed = classifier.predict(text)
    result = {
        "Emotion": label,
        "Confidence": confidence,
        "Processed text": processed,
    }
    return result
|
|
# --- Gradio UI ------------------------------------------------------------
with gr.Blocks(title="Manglish Emotion Classifier") as demo:
    gr.Markdown(
        """
        # 🇲🇾 Manglish Emotion Classifier
        This demo detects **emotion** from Malay–English code-switching text.

        **Supported emotions**: Happiness, Fear, Anger, Sadness, Neutral
        """
    )

    text_box = gr.Textbox(
        label="Input text",
        placeholder="weh today damn stress la 😭",
        lines=3
    )

    prediction_json = gr.JSON(label="Prediction")

    predict_button = gr.Button("Predict emotion")
    predict_button.click(
        fn=gradio_predict,
        inputs=text_box,
        outputs=prediction_json,
    )


if __name__ == "__main__":
    demo.launch()
|
|