AHAAM committed on
Commit
cbac10a
·
1 Parent(s): 55562ed
Files changed (3) hide show
  1. README.md +52 -1
  2. app.py +152 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -10,4 +10,55 @@ pinned: false
10
  short_description: Multi label Arabic Dialect Identification
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  short_description: Multi label Arabic Dialect Identification
11
  ---
12
 
13
+ ---
14
+ title: B2BERT Arabic Dialect Classifier
15
+ emoji: 🌍
16
+ colorFrom: blue
17
+ colorTo: green
18
+ sdk: gradio
19
+ sdk_version: 4.44.0
20
+ app_file: app.py
21
+ pinned: false
22
+ license: apache-2.0
23
+ ---
24
+
25
+ # B2BERT Arabic Dialect Classifier
26
+
27
+ This Space uses the [B2BERT model](https://huggingface.co/AHAAM/B2BERT) to classify Arabic text into 18 different dialects.
28
+
29
+ ## Supported Dialects
30
+
31
+ - Algeria
32
+ - Bahrain
33
+ - Egypt
34
+ - Iraq
35
+ - Jordan
36
+ - Kuwait
37
+ - Lebanon
38
+ - Libya
39
+ - Morocco
40
+ - Oman
41
+ - Palestine
42
+ - Qatar
43
+ - Saudi Arabia
44
+ - Sudan
45
+ - Syria
46
+ - Tunisia
47
+ - UAE
48
+ - Yemen
49
+
50
+ ## How to Use
51
+
52
+ 1. Enter Arabic text in the input box
53
+ 2. Adjust the confidence threshold (default: 0.3)
54
+ 3. Click "Predict Dialects" to see results
55
+ 4. View confidence scores for each dialect
56
+
57
+ ## Model Details
58
+
59
+ The model performs multi-label classification, meaning a single text can be valid in multiple dialects. Each dialect is evaluated independently with a confidence score.
60
+
61
+ ## Citation
62
+
63
+ If you use this model, please cite the original work.
64
+
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
4
+ import pandas as pd
5
+
6
# Hugging Face Hub repository that provides both the classifier weights and
# the matching tokenizer.
model_name = "AHAAM/B2BERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Dialect labels, one per classifier output. The prediction code pairs this
# list positionally with the model's scores, so the order matters.
# NOTE(review): presumably this matches the label order used in training —
# confirm against the model card / config.
DIALECTS = [
    "Algeria",
    "Bahrain",
    "Egypt",
    "Iraq",
    "Jordan",
    "Kuwait",
    "Lebanon",
    "Libya",
    "Morocco",
    "Oman",
    "Palestine",
    "Qatar",
    "Saudi_Arabia",
    "Sudan",
    "Syria",
    "Tunisia",
    "UAE",
    "Yemen",
]
17
+
18
# Labels written to the "Prediction" column. Both functions below use these
# constants so the summary filter can never drift out of sync with the table.
_VALID_LABEL = "✓ Valid"
_INVALID_LABEL = "✗ Invalid"
_RESULT_COLUMNS = ["Dialect", "Confidence", "Prediction"]


def predict_dialects_with_confidence(text, threshold=0.3):
    """Score the input text against every dialect in ``DIALECTS``.

    Each dialect is scored independently (sigmoid over the model logits), so
    any number of dialects may be marked valid for the same text.

    Args:
        text: Input Arabic text. ``None``, empty, or whitespace-only input
            yields an empty result table without running the model.
        threshold: Minimum score (inclusive) for a dialect to be marked
            valid. Defaults to 0.3.

    Returns:
        A ``pandas.DataFrame`` with columns ``Dialect``, ``Confidence`` (the
        score formatted as a string with 4 decimals) and ``Prediction``,
        ordered from highest to lowest score.

    Raises:
        ValueError: If the model returns a different number of scores than
            there are entries in ``DIALECTS``.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError.
    if text is None or not text.strip():
        return pd.DataFrame({column: [] for column in _RESULT_COLUMNS})

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize as a batch of one; long inputs are truncated to 128 tokens.
    encodings = tokenizer(
        [text],
        truncation=True,
        padding=True,
        max_length=128,
        return_tensors="pt",
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Independent per-label probabilities, flattened to a 1-D array.
    scores = torch.sigmoid(outputs.logits).cpu().numpy().reshape(-1)

    # zip() would silently truncate on a mismatch and attach scores to the
    # wrong dialect names, so fail loudly instead.
    if len(scores) != len(DIALECTS):
        raise ValueError(
            f"Model returned {len(scores)} scores but "
            f"{len(DIALECTS)} dialect labels are defined"
        )

    # Sort on the numeric scores before formatting them for display, rather
    # than converting the formatted strings back to floats afterwards.
    ranked = sorted(
        zip(DIALECTS, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    rows = [
        {
            "Dialect": dialect,
            "Confidence": f"{score:.4f}",
            "Prediction": _VALID_LABEL if score >= threshold else _INVALID_LABEL,
        }
        for dialect, score in ranked
    ]
    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def predict_wrapper(text, threshold):
    """Adapt ``predict_dialects_with_confidence`` to the Gradio callback.

    Args:
        text: Input Arabic text from the UI.
        threshold: Score threshold from the UI slider.

    Returns:
        A ``(dataframe, summary)`` tuple: the detailed results table and a
        Markdown string listing the dialects marked valid (or ``None`` when
        there are none).
    """
    df = predict_dialects_with_confidence(text, threshold)

    # Names of the dialects marked valid, in the table's (score) order.
    predicted = df.loc[df["Prediction"] == _VALID_LABEL, "Dialect"].tolist()
    names = ", ".join(predicted) if predicted else "None"
    summary = f"**Predicted Dialects ({len(predicted)}):** {names}"

    return df, summary
80
+
81
+ # Create Gradio interface
82
# Build the Gradio interface: inputs in the left column, results in the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🌍 B2BERT Arabic Dialect Classifier

        This model identifies which Arabic dialects are valid for a given text input.
        Enter Arabic text below to see the dialect predictions and confidence scores.

        **Supported Dialects:** Algeria, Bahrain, Egypt, Iraq, Jordan, Kuwait, Lebanon, Libya,
        Morocco, Oman, Palestine, Qatar, Saudi Arabia, Sudan, Syria, Tunisia, UAE, Yemen
        """
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Arabic Text Input",
                placeholder="أدخل النص العربي هنا... (e.g., كيف حالك؟)",
                lines=3,
                rtl=True,  # render the input right-to-left for Arabic
            )
            threshold_slider = gr.Slider(
                minimum=0.1,
                maximum=0.9,
                value=0.3,
                step=0.05,
                label="Confidence Threshold",
                # The prediction code compares with >=, so the bound is inclusive.
                info="Dialects with confidence at or above this threshold will be marked as valid",
            )
            predict_button = gr.Button("🔍 Predict Dialects", variant="primary")

        with gr.Column():
            summary_output = gr.Markdown(label="Summary")
            results_output = gr.Dataframe(
                label="Detailed Results",
                headers=["Dialect", "Confidence", "Prediction"],
                datatype=["str", "str", "str"],
            )

    # Clickable examples that populate the inputs (text, threshold).
    gr.Examples(
        examples=[
            ["كيف حالك؟", 0.3],
            ["شلونك؟", 0.3],
            ["إزيك يا عم؟", 0.3],
            ["شو أخبارك؟", 0.3],
        ],
        inputs=[text_input, threshold_slider],
        label="Try these examples",
    )

    # predict_wrapper returns (dataframe, summary), hence this output order.
    predict_button.click(
        fn=predict_wrapper,
        inputs=[text_input, threshold_slider],
        outputs=[results_output, summary_output],
    )

    gr.Markdown(
        """
        ---
        **Model:** [AHAAM/B2BERT](https://huggingface.co/AHAAM/B2BERT)

        **Note:** The model uses a multi-label classification approach where each dialect is
        independently evaluated. A single text can be valid in multiple dialects.
        """
    )

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers>=4.30.0
2
+ torch>=2.0.0
3
+ pandas>=1.5.0