# Source scraped from a Hugging Face Space file viewer; original page header:
# HakimiMasstar's picture
# second commit
# f0d6340
import ftfy
import pandas as pd
from tqdm.auto import tqdm
import torch
import re
import emoji
import html
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
# ===============================
# Label mapping (MUST match training)
# ===============================
# Index -> emotion name; the order must mirror the label encoding used
# when the model was fine-tuned.
ID2LABEL = dict(enumerate(["Happiness", "Fear", "Anger", "Sadness", "Neutral"]))
# ===============================
# Load Malay SMS abbreviations
# ===============================
# CSV with columns "Abbreviation" and "Original"; each short form is
# replaced by its full word during preprocessing.
abbr_file = "malay_sms_abbreviations.csv"  # make sure this file exists
abbr_df = pd.read_csv(abbr_file)
abbr_df['Abbreviation'] = abbr_df['Abbreviation'].astype(str).str.lower().str.strip()
abbr_df['Original'] = abbr_df['Original'].astype(str).str.lower().str.strip()
ABBR_MAP = dict(zip(abbr_df['Abbreviation'], abbr_df['Original']))

# BUG FIX: the pattern previously used '\\b' inside a raw string, which
# matches a literal backslash followed by 'b' instead of a word boundary,
# so no abbreviation was ever expanded. Keys are sorted longest-first as a
# defensive measure so overlapping alternatives prefer the longer match.
ABBR_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, sorted(ABBR_MAP, key=len, reverse=True))) + r')\b'
)


def expand_malay_abbreviations(text: str) -> str:
    """Replace every whole-word Malay SMS abbreviation in *text* with its expansion."""
    return ABBR_PATTERN.sub(lambda m: ABBR_MAP[m.group(0)], text)
# ===============================
# Mojibake Fix
# ===============================
def fix_mojibake(text):
    """Best-effort repair of mis-encoded (mojibake) characters.

    Applies a table of known garbled sequences seen in the raw data,
    then attempts a cp1252 -> utf-8 round-trip, and finally lets ftfy
    clean up whatever remains. Non-string input is returned untouched.
    """
    if not isinstance(text, str):
        return text
    # Known bad -> intended substitutions, applied in this exact order
    # before the generic fixes below.
    known_fixes = (
        ('馃槴', '😭'),
        ('馃槶', '😫'),
        ('馃 didn\'t', '😭'),
        ('鈥檓', "'m"),
        ('鈥橲', "'s"),
        ('鈥檚', "'s"),
        ('鈥', "'"),
        ('ðÿ˜­', '😭'),
        ('ðÿ', '😭'),
    )
    for garbled, intended in known_fixes:
        text = text.replace(garbled, intended)
    # UTF-8 bytes that were mis-read as cp1252 round-trip back to the
    # intended characters; text that can't be re-encoded is left as-is.
    try:
        text = text.encode('cp1252').decode('utf-8')
    except Exception:
        pass
    return ftfy.fix_text(text)
# ===============================
# Preprocessing pipeline
# ===============================
def preprocess_text(text: str) -> str:
    """Normalize raw code-switched text for the emotion model.

    Steps: repair mojibake, unescape HTML entities, lowercase, expand
    Malay SMS abbreviations, turn emoji into ':name:' tokens, strip
    URLs / @mentions / HTML tags, unwrap hashtags, squash character
    floods (4+ repeats -> 3), and collapse whitespace.
    """
    text = str(text)
    text = fix_mojibake(text)
    text = html.unescape(text)
    text = text.lower()
    text = expand_malay_abbreviations(text)
    text = emoji.demojize(text)
    # BUG FIX: these patterns previously used doubled backslashes inside
    # raw strings (e.g. r"http\\S+"), which match a literal backslash
    # rather than the intended character classes, so none of these
    # clean-up rules ever fired.
    text = re.sub(r"http\S+|www\S+", "", text)    # URLs
    text = re.sub(r"@[A-Za-z0-9_.]+", "", text)   # @mentions
    text = re.sub(r"<.*?>", "", text)             # HTML tags
    text = re.sub(r"#(\w+)", r"\1", text)         # '#tag' -> 'tag'
    text = re.sub(r"(.)\1{3,}", r"\1\1\1", text)  # cap char repeats at 3
    return re.sub(r"\s+", " ", text).strip()
# ===============================
# Emotion Classifier (HF Hub)
# ===============================
class EmotionClassifier:
    """Loads a fine-tuned sequence-classification model from the HF Hub
    and maps its output indices to the five emotions in ID2LABEL."""

    def __init__(self, model_name="codeswitch-emotion/emotion-v4"):
        """Download tokenizer + model and move the model to GPU if available.

        Auth: uses the HUGGINGFACE_HUB_TOKEN env var when set (HF Space
        secret), otherwise falls back to local HF CLI credentials.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)
        print(f"Loading model: {model_name}")
        if token:
            print("Using token from HUGGINGFACE_HUB_TOKEN")
        else:
            print("No token in environment; using local HF CLI credentials if available")
        # FIX: `use_auth_token` is deprecated in transformers; the
        # replacement keyword argument is `token`.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, token=token)
        self.model.to(self.device)
        self.model.eval()  # inference only — disable dropout etc.

    def predict(self, text):
        """Classify `text`.

        Returns:
            tuple[str, float, str]: (emotion name from ID2LABEL, softmax
            probability of the predicted class rounded to 4 decimals,
            the preprocessed text actually fed to the model).
        """
        processed_text = preprocess_text(text)
        inputs = self.tokenizer(
            processed_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128,
        )
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            logits = self.model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        confidence = probs[0][pred_id].item()
        return (
            ID2LABEL[pred_id],
            round(confidence, 4),
            processed_text,
        )
# ===============================
# Gradio Interface
# ===============================
# Single shared classifier; the model is downloaded/loaded once at startup.
classifier = EmotionClassifier()


def gradio_predict(text):
    """Gradio callback: classify the input and shape it for the JSON widget."""
    emotion, score, cleaned = classifier.predict(text)
    return {
        "Emotion": emotion,
        "Confidence": score,
        "Processed text": cleaned,
    }
# Build the UI: a markdown header, one text input, a JSON output, and a
# button wired to the prediction callback.
with gr.Blocks(title="Manglish Emotion Classifier") as demo:
    gr.Markdown(
        """
# 🇲🇾 Manglish Emotion Classifier
This demo detects **emotion** from Malay–English code-switching text.
**Supported emotions**: Happiness, Fear, Anger, Sadness, Neutral
"""
    )
    user_text = gr.Textbox(
        label="Input text",
        placeholder="weh today damn stress la 😭",
        lines=3,
    )
    prediction_json = gr.JSON(label="Prediction")
    predict_button = gr.Button("Predict emotion")
    predict_button.click(fn=gradio_predict, inputs=user_text, outputs=prediction_json)

if __name__ == "__main__":
    demo.launch()