Spaces:

Akash751
/

Code-Mixed-Emotion-Ensemble

Running

App Files Files Community

Code-Mixed-Emotion-Ensemble / app.py

Akash751

Upload app.py with huggingface_hub

666b121 verified 1 day ago

Raw

History Blame Contribute Delete

8.41 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import torch
	import re
	import difflib
	import requests
	import os
	import asyncio
	from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
	from scipy.special import softmax

	# Best-effort: install asyncio's default event loop policy before anything
	# else creates a loop. A failure here is deliberately ignored so the app can
	# still start, but only ordinary errors are swallowed -- KeyboardInterrupt and
	# SystemExit must keep propagating.
	try:
	    asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
	except Exception:
	    pass

	# All tensors and models in this file are placed on the CPU device.
	device = torch.device("cpu")

	# One fixed label order shared by both classifiers, so that index i always
	# refers to the same emotion regardless of what a checkpoint's config says.
	universal_labels = ['fear', 'anger', 'sadness', 'disgust', 'joy', 'surprise']
	fixed_id2label = dict(enumerate(universal_labels))
	fixed_label2id = {label: index for index, label in fixed_id2label.items()}

	# Romanized-word -> Bengali-word dictionary used by standardize_text_v2.
	dictionary_file = 'Dictionary_BN_EN_61208.xlsx'
	shortcut_path = '/content/drive/MyDrive/final/' + dictionary_file


	def build_mapping_dict(df_dict):
	    """Build the lookup dict from the first two columns of *df_dict*.

	    Column 0 supplies the keys (stripped and lower-cased), column 1 the
	    values (stripped). Rows with a missing cell in either column are
	    dropped first: casting NaN with ``astype(str)`` would otherwise create
	    a bogus ``"nan"`` key or map a real word to the literal text ``"nan"``.
	    When a key occurs more than once, the last row wins.
	    """
	    word_pairs = df_dict.iloc[:, :2].dropna()
	    keys = word_pairs.iloc[:, 0].astype(str).str.strip().str.lower()
	    values = word_pairs.iloc[:, 1].astype(str).str.strip()
	    return dict(zip(keys, values))


	# Use the first location where the spreadsheet exists: the working directory,
	# then the Drive path. None means "use the built-in fallback below".
	dict_path = next(
	    (candidate for candidate in (dictionary_file, shortcut_path) if os.path.exists(candidate)),
	    None,
	)

	if dict_path:
	    df_dict = pd.read_excel(dict_path)
	    mapping_dict = build_mapping_dict(df_dict)
	else:
	    # Small built-in vocabulary so the app still works without the spreadsheet.
	    mapping_dict = {
	        "ami": "আমি", "amar": "আমার", "amake": "আমাকে", "tumi": "তুমি", "tomar": "তোমার",
	        "apni": "আপনি", "she": "সে", "na": "না", "ni": "নি", "nai": "নাই",
	        "valo": "ভালো", "bhalo": "ভালো", "khub": "খুব", "ajke": "আজকে",
	        "kosto": "কষ্ট", "baje": "বাজে", "ranna": "রান্না", "kharap": "খারাপ",
	    }

	# Candidate list for fuzzy matching (difflib needs a sequence of keys).
	dict_keys = list(mapping_dict)

	def transliterate_word(word):
	    """Return the first transliteration candidate for *word*, or *word* itself.

	    Queries the inputtools.google.com request endpoint with the
	    ``bn-t-i0-und`` input-tool code (presumably Bengali transliteration --
	    confirm against the service documentation) and a 2 second timeout.

	    This is a best-effort lookup: on any network failure, non-2xx status,
	    non-JSON body, or response that is not shaped like
	    ``['SUCCESS', [[_, [candidate, ...]], ...]]`` the input is returned
	    unchanged instead of raising.
	    """
	    # Passing the query through `params` lets requests URL-encode the word,
	    # so characters such as '&' or '#' cannot corrupt the query string.
	    query = {
	        "text": word,
	        "itc": "bn-t-i0-und",
	        "num": 1,
	        "cp": 0,
	        "cs": 1,
	        "ie": "utf-8",
	        "oe": "utf-8",
	        "app": "demopage",
	    }
	    try:
	        response = requests.get("https://inputtools.google.com/request", params=query, timeout=2)
	        response.raise_for_status()
	        result = response.json()
	        if result[0] == 'SUCCESS':
	            return result[1][0][1][0]
	    except (requests.RequestException, ValueError, LookupError, TypeError):
	        # Transport errors, undecodable JSON, or an unexpected payload shape:
	        # fall through and keep the original word.
	        pass
	    return word

	def _standardize_token(token):
	    """Convert one cleaned token, trying the cheapest strategy first."""
	    # 1. Exact dictionary hit.
	    if token in mapping_dict:
	        return mapping_dict[token]

	    # 2. Closest dictionary key with a similarity ratio of at least 0.8.
	    nearest = difflib.get_close_matches(token, dict_keys, n=1, cutoff=0.8)
	    if nearest:
	        return mapping_dict[nearest[0]]

	    # 3. Purely a-z tokens go to the online transliterator.
	    if re.match(r'^[a-z]+$', token):
	        return transliterate_word(token)

	    # 4. Anything else (digits, Bengali script, mixed tokens) is kept as is.
	    return token


	def standardize_text_v2(text):
	    """Normalize code-mixed text into the form fed to the classifiers.

	    Non-string input yields ``""``. Otherwise the text is lower-cased,
	    every character other than a-z, 0-9, the Bengali block
	    (U+0980-U+09FF) and whitespace is removed, and each remaining
	    whitespace-separated token is converted via dictionary lookup, fuzzy
	    dictionary lookup, or transliteration. Tokens are re-joined with
	    single spaces.
	    """
	    if not isinstance(text, str):
	        return ""

	    stripped = re.sub(r'[^a-z0-9\u0980-\u09FF\s]', '', text.lower())
	    return " ".join(_standardize_token(token) for token in stripped.split())

	print("Loading models with configuration overrides...")


	def _load_emotion_classifier(checkpoint, tokenizer_name):
	    """Load config, tokenizer and model for one sequence classifier.

	    The config and weights come from *checkpoint*; the tokenizer is loaded
	    from *tokenizer_name*. Before the model is built, the config is forced
	    to six labels using the shared ``fixed_label2id`` / ``fixed_id2label``
	    maps so both classifiers agree on which output index is which emotion.
	    The model is moved to the module-level ``device``.
	    """
	    config = AutoConfig.from_pretrained(checkpoint)
	    config.num_labels = 6
	    config.label2id = fixed_label2id
	    config.id2label = fixed_id2label

	    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
	    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config).to(device)
	    return config, tokenizer, model


	# BanglaBERT-based classifier.
	bb_config, bb_tokenizer, bb_model = _load_emotion_classifier(
	    "Akash751/banglabert-code-mixed-emotion",
	    "csebuetnlp/banglabert",
	)

	# XLM-RoBERTa-based classifier.
	rob_config, rob_tokenizer, rob_model = _load_emotion_classifier(
	    "Akash751/roberta-code-mixed-emotion",
	    "xlm-roberta-base",
	)

	# Labels in the order they are shown in the UI.
	_DISPLAY_LABELS = ('Fear', 'Anger', 'Sadness', 'Disgust', 'Joy', 'Surprise')

	# Ensemble weights applied to the BanglaBERT and XLM-RoBERTa probabilities.
	_BB_WEIGHT = 0.87
	_ROB_WEIGHT = 0.13

	# Negation particles. These are matched as WHOLE tokens: as substrings they
	# also occur inside ordinary words ("apni", "ranna", "আপনার", "রান্না"), which
	# used to flip clearly positive sentences to Sadness. Negations written
	# attached to a verb are only recognized when listed explicitly ('হয়নি').
	_NEGATION_TOKENS = frozenset(['na', 'ni', 'nai', 'না', 'নি', 'নয়', 'নাই', 'হয়নি'])

	# Positive cues, matched as substrings (unchanged behavior). 'आनंद' is
	# Devanagari and is kept for backward compatibility; 'আনন্দ' adds the
	# Bengali-script spelling, which was missing from the list.
	_JOY_PATTERNS = ('valo', 'bhalo', 'ভালো', 'ভালোই', 'ভাল', 'khushi', 'খুশি', 'sundor', 'সুন্দর', 'anondo', 'आनंद', 'আনন্দ')

	# Splits text into tokens made of a-z, 0-9 and Bengali-block characters.
	_CUE_TOKEN_SPLIT = re.compile(r'[^a-z0-9\u0980-\u09FF]+')


	def _zero_scores():
	    """Return the all-zero score dict used when there is nothing to classify."""
	    return {"Fear": 0, "Anger": 0, "Sadness": 0, "Disgust": 0, "Joy": 0, "Surprise": 0}


	def _one_hot_scores(winner):
	    """Return a score dict with 1.0 on *winner* and 0.0 everywhere else."""
	    return {label: (1.0 if label == winner else 0.0) for label in _DISPLAY_LABELS}


	def _class_probabilities(tokenizer, model, text):
	    """Run one classifier on *text* and return {Capitalized label: probability}."""
	    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
	    with torch.no_grad():
	        probs = softmax(model(**inputs).logits.cpu().numpy(), axis=1)[0]
	    return {label.capitalize(): float(prob) for label, prob in zip(universal_labels, probs)}


	def _ensemble_scores(clean_text):
	    """Return the weighted sum of both classifiers' probabilities per label."""
	    bb_scores = _class_probabilities(bb_tokenizer, bb_model, clean_text)
	    rob_scores = _class_probabilities(rob_tokenizer, rob_model, clean_text)
	    return {
	        label: (bb_scores.get(label, 0.0) * _BB_WEIGHT) + (rob_scores.get(label, 0.0) * _ROB_WEIGHT)
	        for label in _DISPLAY_LABELS
	    }


	def predict_emotion(user_text):
	    """Classify the emotion of a code-mixed sentence.

	    Args:
	        user_text: Raw text typed by the user.

	    Returns:
	        A ``(clean_text, scores)`` tuple. ``clean_text`` is the output of
	        ``standardize_text_v2``; ``scores`` maps each of the six display
	        labels to a confidence. Empty or whitespace-only input, or input
	        that is empty after cleaning, returns ``("", all-zero scores)``.

	    Scoring has two stages:
	        1. Keyword rules. If a joy cue is present the result is locked to
	           Sadness (when a negation token is also present, e.g. "valo
	           lagche na") or to Joy (otherwise). The models are not run.
	        2. Otherwise the two classifiers are blended 0.87 / 0.13. If a
	           negation token is present and Fear or Anger exceeds 0.4, half
	           of each is added to Sadness and both are cut to 10%. The
	           scores are then renormalized to sum to 1.
	    """
	    if not user_text or not user_text.strip():
	        return "", _zero_scores()

	    clean_text = standardize_text_v2(user_text)
	    if not clean_text:
	        return "", _zero_scores()

	    # Cues are searched in both the raw (romanized) and the cleaned text.
	    combined_raw_text = " " + user_text.lower() + " " + clean_text + " "
	    has_negation = not _NEGATION_TOKENS.isdisjoint(_CUE_TOKEN_SPLIT.split(combined_raw_text))
	    has_joy_base = any(cue in combined_raw_text for cue in _JOY_PATTERNS)

	    # Layers A and B: a joy cue decides the outcome outright, so the (slow)
	    # model inference would be overwritten anyway and is skipped.
	    if has_joy_base:
	        return clean_text, _one_hot_scores('Sadness' if has_negation else 'Joy')

	    result_dict = _ensemble_scores(clean_text)

	    # Layer C: negation without a joy cue shifts strong Fear/Anger to Sadness.
	    if has_negation and (result_dict['Fear'] > 0.4 or result_dict['Anger'] > 0.4):
	        old_fear = result_dict['Fear']
	        old_anger = result_dict['Anger']
	        result_dict['Sadness'] += (old_fear * 0.5) + (old_anger * 0.5)
	        result_dict['Fear'] *= 0.1
	        result_dict['Anger'] *= 0.1

	    total = sum(result_dict.values())
	    if total > 0:
	        result_dict = {label: score / total for label, score in result_dict.items()}

	    return clean_text, result_dict

	# 4. Gradio UI built directly with Blocks (no examples section): one input
	#    column and one output column side by side, wired to predict_emotion.
	with gr.Blocks(title="🧠🎭 Code-Mixed Emotion Classifier") as demo:
	    gr.Markdown("# 🧠🎭 Code-Mixed Emotion Classifier (Context-Aware)")
	    gr.Markdown("এটি BanglaBERT (87%) এবং XLM-RoBERTa (13%)-এর সমন্বয়ে তৈরি একটি গাণিতিকভাবে অপ্টিমাইজড এনসেম্বল সিস্টেম।")

	    with gr.Row():
	        # Left column: user input and the submit button.
	        with gr.Column():
	            input_box = gr.Textbox(
	                lines=4,
	                placeholder="Type your code-mixed sentence here...",
	                label="Input Code-Mixed Text",
	            )
	            submit_btn = gr.Button("Submit", variant="primary")

	        # Right column: the cleaned text and the per-emotion confidences.
	        with gr.Column():
	            output_text = gr.Textbox(label="🔍 Text seen by Model (Fuzzy + Transliteration)")
	            output_label = gr.Label(
	                num_top_classes=6,
	                label="Predicted Emotion (With Intelligent Context Mapping)",
	            )

	    # predict_emotion returns (clean_text, scores), matching the two outputs.
	    submit_btn.click(
	        fn=predict_emotion,
	        inputs=input_box,
	        outputs=[output_text, output_label],
	    )

	demo.launch(show_error=True, max_threads=10)