| import ftfy |
| import pandas as pd |
| from tqdm.auto import tqdm |
| import torch |
| import re |
| import emoji |
| import html |
| import gradio as gr |
|
|
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| import os |
|
|
| |
| |
| |
# Mapping from model output index to human-readable emotion label.
# Must stay in sync with the label order the model was trained with.
ID2LABEL = dict(
    enumerate(("Happiness", "Fear", "Anger", "Sadness", "Neutral"))
)
|
|
| |
| |
| |
# Load the Malay SMS abbreviation lexicon (columns: Abbreviation, Original)
# and build a whole-word regex that matches any known abbreviation.
abbr_file = "malay_sms_abbreviations.csv"
abbr_df = pd.read_csv(abbr_file)

# Normalize both columns so lookups are case-insensitive and whitespace-safe.
abbr_df['Abbreviation'] = abbr_df['Abbreviation'].astype(str).str.lower().str.strip()
abbr_df['Original'] = abbr_df['Original'].astype(str).str.lower().str.strip()

# abbreviation -> full form lookup table
ABBR_MAP = dict(zip(abbr_df['Abbreviation'], abbr_df['Original']))

# FIX: use \b (regex word boundary), not the doubled r'\\b' which matches a
# literal backslash followed by 'b' and therefore never anchored on word
# edges. Keys are re.escape'd in case an abbreviation contains regex
# metacharacters (e.g. '.', '?').
ABBR_PATTERN = re.compile(
    r'\b(' + '|'.join(map(re.escape, ABBR_MAP.keys())) + r')\b'
)
|
|
def expand_malay_abbreviations(text: str) -> str:
    """Expand known Malay SMS abbreviations to their full written forms.

    Matches whole words only; the input is expected to be lowercased
    already, since ABBR_MAP keys are normalized to lowercase.
    """
    def _expand(match):
        # match.group(0) is guaranteed to be a key of ABBR_MAP because the
        # pattern's alternatives are exactly the map's keys.
        return ABBR_MAP[match.group(0)]

    return ABBR_PATTERN.sub(_expand, text)
|
|
| |
| |
| |
|
|
def fix_mojibake(text):
    """Repair common mojibake (mis-decoded UTF-8) sequences in *text*.

    Non-string inputs are returned untouched. Known bad sequences are
    patched first, then a cp1252 -> utf-8 round-trip is attempted for the
    classic "UTF-8 bytes read as cp1252" corruption, and finally ftfy
    takes a general cleanup pass.
    """
    if not isinstance(text, str):
        return text

    # NOTE: order matters — '鈥' is a prefix of the longer sequences above
    # it, so it must be replaced last.
    known_fixes = (
        ('馃槴', '😭'),
        ('馃槶', '😫'),
        ('馃 didn\'t', '😭'),
        ('鈥檓', "'m"),
        ('鈥橲', "'s"),
        ('鈥檚', "'s"),
        ('鈥', "'"),
        ('ðÿ˜', '😭'),
        ('ðÿ', '😭'),
    )
    for broken, fixed in known_fixes:
        text = text.replace(broken, fixed)

    # Undo "UTF-8 decoded as cp1252" corruption. If the text is not
    # representable in cp1252 (e.g. it already contains real emoji) the
    # encode raises and the text is kept as-is.
    try:
        text = text.encode('cp1252').decode('utf-8')
    except Exception:
        pass

    return ftfy.fix_text(text)
|
|
| |
| |
| |
|
|
def preprocess_text(text: str) -> str:
    """Normalize raw (possibly Malay–English code-switched) text for the model.

    Pipeline: repair mojibake, unescape HTML entities, lowercase, expand
    Malay SMS abbreviations, convert emoji to ':name:' tokens, strip URLs,
    @mentions and HTML tags, keep hashtag words, squash characters repeated
    more than 3 times down to 3, and collapse whitespace.
    """
    text = str(text)

    text = fix_mojibake(text)
    text = html.unescape(text)
    text = text.lower()
    text = expand_malay_abbreviations(text)
    text = emoji.demojize(text)

    # FIX: the escapes below were doubled inside raw strings (r"\\S"
    # matches a literal backslash then 'S'), so none of these cleanup
    # patterns ever matched real input.
    text = re.sub(r"http\S+|www\S+", "", text)    # URLs
    text = re.sub(r"@[A-Za-z0-9_.]+", "", text)   # @mentions
    text = re.sub(r"<.*?>", "", text)             # HTML tags
    text = re.sub(r"#(\w+)", r"\1", text)         # '#tag' -> 'tag'
    text = re.sub(r"(.)\1{3,}", r"\1\1\1", text)  # 'soooooo' -> 'sooo'
    text = re.sub(r"\s+", " ", text).strip()      # collapse whitespace

    return text
|
|
| |
| |
| |
|
|
class EmotionClassifier:
    """Wraps a fine-tuned HF sequence-classification model for emotion prediction."""

    def __init__(self, model_name="codeswitch-emotion/emotion-v4"):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Optional HF auth token for private models; when unset, the
        # Hub libraries fall back to locally cached `huggingface-cli
        # login` credentials.
        token = os.environ.get("HUGGINGFACE_HUB_TOKEN", None)

        print(f"Loading model: {model_name}")
        if token:
            print("Using token from HUGGINGFACE_HUB_TOKEN")
        else:
            print("No token in environment; using local HF CLI credentials if available")

        # FIX: `use_auth_token` is deprecated (removed in transformers v5);
        # the supported keyword is `token`.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, token=token)
        self.model.to(self.device)
        self.model.eval()

    def predict(self, text):
        """Classify *text*.

        Returns a tuple of (emotion label, confidence rounded to 4 decimal
        places, the preprocessed text that was actually fed to the model).
        """
        processed_text = preprocess_text(text)

        inputs = self.tokenizer(
            processed_text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=128
        )
        # Move every input tensor to the same device as the model.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Inference only — no gradient tracking needed.
        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = torch.softmax(logits, dim=-1)
        pred_id = torch.argmax(probs, dim=-1).item()
        confidence = probs[0, pred_id].item()

        return (
            ID2LABEL[pred_id],
            round(confidence, 4),
            processed_text,
        )
|
|
|
|
| |
| |
| |
|
|
# Single module-level classifier instance shared by all Gradio requests.
classifier = EmotionClassifier()


def gradio_predict(text):
    """Gradio callback: classify *text* and package the result for gr.JSON."""
    label, confidence, processed = classifier.predict(text)
    result = {
        "Emotion": label,
        "Confidence": confidence,
        "Processed text": processed,
    }
    return result
|
|
# --- Gradio UI ------------------------------------------------------------
with gr.Blocks(title="Manglish Emotion Classifier") as demo:
    gr.Markdown(
        """
        # 🇲🇾 Manglish Emotion Classifier
        This demo detects **emotion** from Malay–English code-switching text.

        **Supported emotions**: Happiness, Fear, Anger, Sadness, Neutral
        """
    )

    text_box = gr.Textbox(
        label="Input text",
        placeholder="weh today damn stress la 😭",
        lines=3
    )

    prediction_json = gr.JSON(label="Prediction")

    predict_button = gr.Button("Predict emotion")
    predict_button.click(
        fn=gradio_predict,
        inputs=text_box,
        outputs=prediction_json,
    )


if __name__ == "__main__":
    demo.launch()
|
|