mahmoudmohammad committed on
Commit
17f2039
·
verified ·
1 Parent(s): 5a72a9e

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +145 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import re

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ============================================
# 1. Configuration & Label Mapping
# ============================================
# Hugging Face Hub repository id of the fine-tuned classifier loaded below.
MODEL_ID = "mahmoudmohammad/marbertv2_single-label-dialect"

# Class index -> human-readable dialect name.
# Per the original author's note this is the exact mapping used during
# training, so the indices must line up with the model's output logits.
LABEL_MAP = {
    0: 'Algerian', 1: 'Egyptian', 2: 'Iraqi', 3: 'Jordanian',
    4: 'Lebanese', 5: 'Libyan', 6: 'MSA', 7: 'Moroccan',
    8: 'Palestinian', 9: 'Qatari', 10: 'Saudi', 11: 'Syrian',
    12: 'Tunisian', 13: 'Yemeni',
}

# ============================================
# 2. Caching & Loading Model Locally
# ============================================
# The tokenizer and model are created at module level, so they are loaded
# once when this module is imported and then reused by every call to
# predict_dialect().
print(f"Loading {MODEL_ID} from Hugging Face...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()  # Evaluation mode: disables dropout for deterministic inference
except Exception as e:
    print(f"❌ Error loading model: {e}")
    # Fail fast: without `tokenizer` / `model` every later prediction would
    # die with a NameError, so surface the real cause at start-up instead.
    raise
else:
    print("✅ Model loaded successfully!")

# ============================================
# 3. Preprocessing Logic
# ============================================
# Patterns are compiled once at import time and applied in the order used
# inside preprocess_arabic_dialect().
_URL_OR_HTML_TAG_RE = re.compile(r'http\S+|www\.\S+|<.*?>')
_MENTION_RE = re.compile(r'@\w+')
_TASHKEEL_RE = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
_REPEATED_CHAR_RE = re.compile(r'(.)\1+')
_NON_WORD_RE = re.compile(r'[^\w\s\u0600-\u06FF]')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess_arabic_dialect(text: str) -> str:
    """Clean social-media dialectal Arabic text before tokenization.

    Per the original author's note this mirrors the preprocessing of the
    training script, so the order and patterns of the steps should not be
    changed independently of the model.

    Steps, in order:
      1. Replace URLs (``http...``, ``www....``) and ``<...>`` tags with a space.
      2. Replace ``@mentions`` with a space.
      3. Drop ``#`` characters (the hashtag word itself is kept).
      4. Remove tashkeel (U+0617-U+061A, U+064B-U+0652).
      5. Remove tatweel (U+0640).
      6. Collapse any run of a repeated character down to two occurrences.
      7. Replace characters that are neither word characters, whitespace,
         nor in the U+0600-U+06FF block with a space.
      8. Collapse whitespace runs to a single space and strip the ends.

    Args:
        text: Raw input text.

    Returns:
        The cleaned text, or ``""`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = _URL_OR_HTML_TAG_RE.sub(' ', text)
    text = _MENTION_RE.sub(' ', text)
    text = text.replace('#', '')
    text = _TASHKEEL_RE.sub('', text)
    text = text.replace('\u0640', '')
    # Runs the collapse before punctuation stripping, exactly as the
    # original pipeline did ("!!!" -> "!!" -> spaces).
    text = _REPEATED_CHAR_RE.sub(r'\1\1', text)
    text = _NON_WORD_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# ============================================
# 4. Inference Function
# ============================================
def predict_dialect(text: str) -> dict:
    """Classify ``text`` and return a confidence score for every dialect.

    The text is cleaned with ``preprocess_arabic_dialect``, tokenized with
    the module-level ``tokenizer`` and scored by the module-level ``model``;
    the logits are converted to probabilities with a softmax.

    Args:
        text: Raw user input from the Gradio textbox.

    Returns:
        A dict mapping each name in ``LABEL_MAP`` to a float probability,
        in the shape expected by ``gr.Label``. When there is nothing to
        classify (non-string input, blank input, or input that is empty
        after cleaning) every label maps to ``0.0``.
    """
    # Neutral answer used whenever there is no usable text.
    no_prediction = {label: 0.0 for label in LABEL_MAP.values()}

    # Guard against non-string input (e.g. None) so `.strip()` cannot raise.
    if not isinstance(text, str) or not text.strip():
        return no_prediction

    # 1. Clean the incoming text
    clean_text = preprocess_arabic_dialect(text)
    if not clean_text:
        # Input consisted only of URLs / mentions / punctuation: scoring an
        # empty string would produce a meaningless distribution.
        return no_prediction

    # 2. Tokenize, truncating and padding to 128 tokens (the original
    #    author's note says this matches the training setup).
    inputs = tokenizer(
        clean_text,
        return_tensors="pt",
        truncation=True,
        max_length=128,
        padding="max_length",
    )

    # 3. Model inference without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits
        # Softmax over the class dimension; [0] selects the single input row.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0].tolist()

    # 4. Format as {label: probability} for the Gradio 'Label' component
    return {label: probs[index] for index, label in LABEL_MAP.items()}

# ============================================
# 5. UI Application Definition (Dark Mode Native)
# ============================================

# JavaScript function handed to gr.Blocks(js=...) below: it adds the 'dark'
# class to <body> so the page is rendered with the dark theme.
dark_mode_js = """
function() {
document.body.classList.add('dark');
}
"""

# Two-column layout: text input, button and examples on the left,
# prediction confidences on the right.
with gr.Blocks(js=dark_mode_js, theme=gr.themes.Monochrome(primary_hue="purple")) as demo:
    gr.Markdown("# 🌍 Arabic Dialect Detector")
    gr.Markdown("Identify whether text represents **MSA** or one of 13 Regional **Arabic Dialects** (e.g., Egyptian, Saudi, Moroccan, Lebanese...). \n*Powered by a Fine-Tuned MARBERTv2 base model.*")

    with gr.Row():

        # Left Panel (Inputs and Buttons)
        with gr.Column(scale=5):
            text_input = gr.Textbox(
                label="أدخل النص (Enter Arabic Text Here)",
                placeholder="إزيك يا صاحبي عامل إيه؟",
                lines=5
            )
            submit_btn = gr.Button("Detect Dialect 🔎", variant="primary")

            # Sample sentences from several dialects; clicking one fills the textbox
            examples_list = [
                ["إزيك يا صاحبي عامل إيه؟ فينك من زمان"],  # Egyptian
                ["شو أخبارك؟ وين هالغيبة اشتقنالك كتير"],  # Lebanese/Syrian
                ["كيداير لاباس عليك؟ شنو كتدير؟"],  # Moroccan
                ["وشلونك طال عمرك؟ عساك طيب ومبسوط"],  # Saudi / Gulf
                ["السلام عليكم ورحمة الله وبركاته، كيف حالكم اليوم؟"],  # MSA
                ["أنا هسا رايح عالدار بدك اشي؟"],  # Jordanian/Palestinian
            ]
            gr.Examples(
                examples=examples_list,
                inputs=text_input,
                label="Try these Examples"
            )

        # Right Panel (Output Predictions Bar)
        with gr.Column(scale=4):
            # Show only the 4 highest-scoring dialects
            output_labels = gr.Label(num_top_classes=4, label="Dialect Confidence")

            # User-facing note describing the preprocessing applied before inference
            gr.Markdown("*(Internal Text pre-processing strips tags, mentions, tashkeel, repeated letters etc. via REGEX just like the model training before execution!)*")

    # Connect UI button -> Inference Logic
    submit_btn.click(
        fn=predict_dialect,
        inputs=text_input,
        outputs=output_labels
    )

# Boot Gradio Application
if __name__ == "__main__":
    # show_error=True makes errors raised by event handlers visible in the
    # browser UI instead of only in the server logs.
    demo.launch(show_error=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ transformers
3
+ gradio