Spaces:

mahmoudmohammad
/

Arabic-Sentiment-Analyzer

Running

App Files Files Community

Arabic-Sentiment-Analyzer / app.py

mahmoudmohammad

Upload 2 files

f65d6cb verified about 1 month ago

raw

history blame contribute delete

6.83 kB

	import os
	import re
	import string
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import gradio as gr
	from transformers import AutoTokenizer, AutoModel
	from peft import LoraConfig, get_peft_model
	from huggingface_hub import hf_hub_download

	# ==========================================
	# 1. Config & Global Initialization (Cached)
	# ==========================================
	# Hub id of the pretrained encoder; also used to load the tokenizer.
	MODEL_NAME = "CAMeL-Lab/bert-base-arabic-camelbert-mix-sentiment"
	# Hub repository and file name of the fine-tuned weights.
	# PLEASE UPDATE REPO_ID TO YOUR CREATED MODEL REPO FROM STEP 1.
	# NOTE(review): the repo name mentions MARBERTv2 while MODEL_NAME is a
	# CAMeLBERT checkpoint -- confirm the weights match this encoder.
	REPO_ID = "mahmoudmohammad/MARBERTv2-Sentiment_Classification"
	MODEL_FILE = "best_stl_model.pth"

	# Tokenizer sequence length (inputs are padded/truncated to this size).
	MAX_LEN = 256
	# LoRA hyper-parameters: rank, scaling factor, adapter dropout.
	LORA_R, LORA_ALPHA, LORA_DROPOUT = 32, 64, 0.1

	# Class index -> display name (indices follow the sklearn LabelEncoder
	# mapping: 0, 1, 2).
	LABELS = dict(enumerate(("Negative", "Neutral", "Positive")))

	# Run on the GPU when one is visible, otherwise on the CPU.
	device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

	# ==========================================
	# 2. Text Preprocessing Function
	# ==========================================
	def clean_text(text):
	    """Normalize raw Arabic text before it is tokenized.

	    Steps, in order: drop URLs, drop @mentions, drop leftover e-mail-like
	    tokens, drop Western and Arabic-Indic digits, turn '#' and '_' into
	    spaces, unify common Arabic letter variants, strip punctuation (keeping
	    '!' and '?'), and collapse whitespace.

	    Args:
	        text: The input to clean. A non-string value is returned as
	            ``str(text)`` without any cleaning.

	    Returns:
	        The cleaned, whitespace-normalized string.
	    """
	    if not isinstance(text, str):
	        return str(text)

	    # The '|' must stay unescaped: an escaped pipe only matches a literal
	    # '|' character, which left URLs and digits in the text.
	    text = re.sub(r'http\S+|www\.\S+', '', text)
	    text = re.sub(r'[@]\S+', '', text)
	    # NOTE(review): the mention rule above already removes everything from
	    # '@' onward, so this e-mail rule rarely has anything left to match.
	    text = re.sub(r'\S+@\S+', ' ', text)
	    text = re.sub(r'\d+|[٠١٢٣٤٥٦٧٨٩]+', '', text)
	    text = text.replace('#', ' ').replace('_', ' ')

	    # Unify letter variants: alef forms, alef maqsura, hamza carriers,
	    # and ta marbuta.
	    text = re.sub("[إأآا]", "ا", text)
	    text = re.sub("ى", "ي", text)
	    text = re.sub("ؤ", "و", text)
	    text = re.sub("ئ", "ي", text)
	    text = re.sub("ة", "ه", text)

	    # Backslash and '|' are also part of string.punctuation, so they are
	    # still stripped without an (invalid) escape sequence in this literal.
	    arabic_punc = '`÷×؛«»<>()*&^%][ـ،/:".،,\'{}~¦+|"…""–ـ'
	    eng_punc = string.punctuation.replace('!', '').replace('?', '')
	    text = text.translate(str.maketrans('', '', arabic_punc + eng_punc))
	    return re.sub(r'\s+', ' ', text).strip()

	# ==========================================
	# 3. Model Architecture
	# ==========================================
	class SentimentSTL(nn.Module):
	    """Sentiment classifier: a LoRA-adapted encoder plus an MLP head.

	    The encoder named by ``MODEL_NAME`` is wrapped with LoRA adapters on
	    its ``query`` and ``value`` modules. The sentence representation is the
	    average of the first-token vector and the mask-aware mean of all token
	    vectors, which is then passed to a two-layer classifier.

	    The attribute names (``encoder``, ``peft_model``, ``classifier``)
	    determine the parameter names in the module's state dict, so they must
	    not be renamed while existing checkpoints are loaded into this class.
	    """

	    def __init__(self, num_classes=3):
	        """Build the encoder, attach LoRA adapters and create the head.

	        Args:
	            num_classes: Number of output logits produced by the head.
	        """
	        super().__init__()
	        self.encoder = AutoModel.from_pretrained(MODEL_NAME)

	        peft_config = LoraConfig(
	            task_type="FEATURE_EXTRACTION",
	            r=LORA_R,
	            lora_alpha=LORA_ALPHA,
	            lora_dropout=LORA_DROPOUT,
	            target_modules=["query", "value"],
	        )
	        self.peft_model = get_peft_model(self.encoder, peft_config)
	        hidden_size = self.peft_model.config.hidden_size

	        self.classifier = nn.Sequential(
	            nn.Dropout(0.3),
	            nn.Linear(hidden_size, hidden_size // 2),
	            nn.ReLU(),
	            nn.Dropout(0.1),
	            nn.Linear(hidden_size // 2, num_classes),
	        )

	    def forward(self, input_ids, attention_mask):
	        """Return the classifier output for a batch of token ids.

	        Args:
	            input_ids: Token id tensor forwarded to the encoder.
	            attention_mask: Tensor marking real tokens with 1 and padding
	                with 0; also used to weight the mean pooling.

	        Returns:
	            The output of ``self.classifier`` applied to the pooled
	            sentence representation (one row per input sequence).
	        """
	        outputs = self.peft_model(input_ids=input_ids, attention_mask=attention_mask)
	        hidden = outputs.last_hidden_state
	        mask = attention_mask.unsqueeze(-1).float()
	        # Clamp the token count so a fully masked row yields a zero mean
	        # instead of 0/0 = NaN; rows with at least one token are unchanged.
	        token_count = mask.sum(1).clamp(min=1.0)
	        mean_rep = (hidden * mask).sum(1) / token_count
	        cls_rep = hidden[:, 0, :]
	        # Blend the first-token vector with the masked mean.
	        pooled = (cls_rep + mean_rep) / 2.0
	        return self.classifier(pooled)

	# ==========================================
	# 4. Cache Load Logic
	# ==========================================
	# Fetch the fine-tuned checkpoint from the Hub; the returned value is the
	# local path of the downloaded file.
	print("Downloading / verifying cached weights...")
	weights_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_FILE)

	print("Initializing Tokenizer and Model...")
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	model = SentimentSTL(num_classes=3).to(device)

	print("Loading trained weights into model...")
	# weights_only=True restricts unpickling to tensors and plain containers,
	# so a tampered checkpoint file cannot execute arbitrary code on load.
	model.load_state_dict(
	    torch.load(weights_path, map_location=device, weights_only=True)
	)
	# eval() switches layers such as dropout to inference behavior; it does
	# not freeze parameters (gradients are disabled separately in predict).
	model.eval()
	print("Model initialized completely!")

	# ==========================================
	# 5. Inference Logic for Gradio
	# ==========================================
	def predict(text):
	    """Classify one piece of text and return per-class probabilities.

	    Args:
	        text: Raw user input. ``None`` and non-string values are accepted;
	            non-strings are converted with ``str()`` before cleaning.

	    Returns:
	        A dict mapping each name in ``LABELS`` to a float probability, in
	        the shape expected by ``gr.Label``. All values are 0.0 when there
	        is nothing left to classify (missing input, or input that is empty
	        after cleaning).
	    """
	    no_prediction = {name: 0.0 for name in LABELS.values()}
	    if text is None:
	        return no_prediction

	    cleaned_text = clean_text(text if isinstance(text, str) else str(text))
	    # Input made only of whitespace or stripped characters leaves nothing
	    # meaningful for the model, so skip inference.
	    if not cleaned_text:
	        return no_prediction

	    enc = tokenizer(
	        cleaned_text,
	        add_special_tokens=True,
	        max_length=MAX_LEN,
	        padding='max_length',
	        truncation=True,
	        return_attention_mask=True,
	        return_tensors='pt',
	    )

	    ids = enc['input_ids'].to(device)
	    mask = enc['attention_mask'].to(device)

	    with torch.no_grad():
	        logits = model(ids, mask)
	        # Softmax over the class dimension maps the logits to [0, 1];
	        # squeeze(0) drops only the batch dimension of this single input.
	        probs = F.softmax(logits, dim=1).squeeze(0).cpu().tolist()

	    # Position i in probs corresponds to class index i in LABELS.
	    return {LABELS[idx]: float(p) for idx, p in enumerate(probs)}

	# ==========================================
	# 6. Build User Interface with Gradio
	# ==========================================
	# Page-level CSS handed to gr.Blocks.
	custom_css = """
	body { font-family: 'Tajawal', sans-serif; }
	.output-class { font-weight: bold; }
	"""

	# Static page texts and sample inputs, kept apart from the layout code.
	_title_html = "<center><h1>🌐 Arabic Sentiment Analysis — CAMeLBERT + LoRA</h1></center>"
	_intro_html = "<center><b>Analyze Arabic text and detect Negative, Neutral, or Positive sentiment.</b><br> This model is balanced trained specifically on an augmented pipeline, leveraging LoRA representation over base features.</center>"
	_example_sentences = [
	    "هذا المطعم يقدم طعام سيء جدًا ومحروق، لا أنصح أحد بزيارته.",
	    "المؤتمر الصحفي لوزير الاتصالات تم بالأمس لمناقشة بعض الأمور الإقتصادية.",
	    "بصراحة، الخدمة كانت ممتازة والموظفين غاية في الإحترام، أنصح بشدة!",
	]

	with gr.Blocks(css=custom_css, title="Arabic Sentiment Analyzer") as demo:
	    gr.Markdown(_title_html)
	    gr.Markdown(_intro_html)

	    with gr.Row():
	        # Wider column: text input, submit button and example sentences.
	        with gr.Column(scale=2):
	            input_textbox = gr.Textbox(
	                label="Arabic Input Text",
	                placeholder="...اكتب أو ألصق الجملة العربية هنا",
	                lines=5,
	                rtl=True,
	            )
	            submit_button = gr.Button("Analyze / تحليل", variant="primary")

	            gr.Examples(
	                examples=_example_sentences,
	                inputs=input_textbox,
	                label="Examples / أمثلة سريعة",
	            )

	        # Narrower column: the three class probabilities.
	        with gr.Column(scale=1):
	            output_label = gr.Label(
	                label="Sentiment Probabilities",
	                num_top_classes=3,
	            )

	    # Clicking the button sends the textbox content through predict and
	    # shows the returned dict in the label component.
	    submit_button.click(fn=predict, inputs=input_textbox, outputs=output_label)

	# Start the web UI only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(share=False)