| import gradio as gr
|
| import torch
|
| import re
|
| from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
|
|
|
|
|
|
|
|
# Hugging Face Hub repo ID of the fine-tuned MARBERTv2 dialect classifier.
MODEL_ID = "mahmoudmohammad/marbertv2_single-label-dialect"

# Maps the model's output class indices (0-13) to human-readable dialect
# names. NOTE(review): presumably this matches the label encoding used at
# training time — verify against the training script if labels look off.
LABEL_MAP = {
    0: 'Algerian', 1: 'Egyptian', 2: 'Iraqi', 3: 'Jordanian',
    4: 'Lebanese', 5: 'Libyan', 6: 'MSA', 7: 'Moroccan',
    8: 'Palestinian', 9: 'Qatari', 10: 'Saudi', 11: 'Syrian',
    12: 'Tunisian', 13: 'Yemeni'
}
|
|
|
|
|
|
|
|
|
|
|
|
|
# Load tokenizer + classifier once at import time so the Gradio handlers
# can reuse them across requests.
print(f"Loading {MODEL_ID} from Hugging Face...")

try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()  # inference mode: disables dropout etc.
    print("✅ Model loaded successfully!")
except Exception as e:
    # Fail fast: if loading fails, `tokenizer`/`model` would be undefined
    # and every prediction would later crash with a confusing NameError.
    # Log the message for visibility, then surface the real error
    # (network, auth, missing repo) immediately.
    print(f"❌ Error loading model: {e}")
    raise
|
|
|
|
|
|
|
|
|
def preprocess_arabic_dialect(text: str) -> str:
    """Normalize dialectal Arabic social-media text for the classifier.

    Mirrors the preprocessing applied during model training: strips
    URLs/HTML tags and @mentions, drops '#' markers (keeping the word),
    removes tashkeel diacritics and tatweel, caps letter repetitions at
    two, replaces non-Arabic punctuation/symbols with spaces, and
    squeezes whitespace. Non-string input yields an empty string.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs applied in order — order matters:
    # e.g. repeated-char capping must run before punctuation stripping.
    substitutions = [
        (r'http\S+|www\.\S+|<.*?>', ' '),        # URLs and HTML tags
        (r'@\w+', ' '),                          # @mentions
        (r'#', ''),                              # hashtag marker only
        (r'[\u0617-\u061A\u064B-\u0652]', ''),   # tashkeel (diacritics)
        (r'\u0640', ''),                         # tatweel (elongation)
        (r'(.)\1+', r'\1\1'),                    # cap char runs at two
        (r'[^\w\s\u0600-\u06FF]', ' '),          # non word/space/Arabic
    ]
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return re.sub(r'\s+', ' ', text).strip()
|
|
|
|
|
|
|
|
|
def predict_dialect(text: str):
    """Classify *text* into the 14 dialect classes.

    Returns a dict mapping dialect name -> probability (softmax over the
    model logits), in the shape gr.Label expects. Blank/whitespace-only
    input returns all-zero confidences without touching the model.
    """
    if not text.strip():
        return dict.fromkeys(LABEL_MAP.values(), 0.0)

    # Apply the same cleaning used at training time.
    cleaned = preprocess_arabic_dialect(text)

    encoded = tokenizer(
        cleaned,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    # Inference only — no gradient bookkeeping needed.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Batch of 1: drop the batch dim after softmax.
    scores = torch.softmax(logits, dim=-1).squeeze(0)

    return {LABEL_MAP[idx]: float(scores[idx]) for idx in LABEL_MAP}
|
|
|
|
|
|
|
|
|
|
|
|
|
# JavaScript snippet passed to gr.Blocks(js=...); it runs on page load
# and tags <body> with Gradio's 'dark' class to force the dark theme.
dark_mode_js = """
function() {
    document.body.classList.add('dark');
}
"""
|
|
|
# Build the Gradio UI: input + examples on the left, confidence bars on
# the right, dark theme forced via the JS hook above.
with gr.Blocks(js=dark_mode_js, theme=gr.themes.Monochrome(primary_hue="purple")) as demo:
    gr.Markdown("# 🌍 Arabic Dialect Detector")
    gr.Markdown("Identify whether text represents **MSA** or one of 13 Regional **Arabic Dialects** (e.g., Egyptian, Saudi, Moroccan, Lebanese...). \n*Powered by a Fine-Tuned MARBERTv2 base model.*")

    with gr.Row():
        # Left column: free-text input, trigger button, and sample sentences.
        with gr.Column(scale=5):
            user_text = gr.Textbox(
                label="أدخل النص (Enter Arabic Text Here)",
                placeholder="إزيك يا صاحبي عامل إيه؟",
                lines=5,
            )
            detect_button = gr.Button("Detect Dialect 🔎", variant="primary")

            # One-click sample inputs covering several dialects plus MSA.
            sample_sentences = [
                ["إزيك يا صاحبي عامل إيه؟ فينك من زمان"],
                ["شو أخبارك؟ وين هالغيبة اشتقنالك كتير"],
                ["كيداير لاباس عليك؟ شنو كتدير؟"],
                ["وشلونك طال عمرك؟ عساك طيب ومبسوط"],
                ["السلام عليكم ورحمة الله وبركاته، كيف حالكم اليوم؟"],
                ["أنا هسا رايح عالدار بدك اشي؟"],
            ]
            gr.Examples(
                examples=sample_sentences,
                inputs=user_text,
                label="Try these Examples",
            )

        # Right column: probability readout (top 4 classes only).
        with gr.Column(scale=4):
            confidence_panel = gr.Label(num_top_classes=4, label="Dialect Confidence")

    gr.Markdown("*(Internal Text pre-processing strips tags, mentions, tashkeel, repeated letters etc. via REGEX just like the model training before execution!)*")

    # Wire the button to the prediction function.
    detect_button.click(
        fn=predict_dialect,
        inputs=user_text,
        outputs=confidence_panel,
    )
|
|
|
|
|
if __name__ == "__main__":
    # show_error=True surfaces Python exceptions in the web UI instead of
    # a generic failure message — useful when hosting (e.g. on Spaces).
    demo.launch(show_error=True)