Spaces:

AHAAM
/

LahjatBERT

Sleeping

App Files Files Community

AHAAM committed on Oct 13, 2025

Commit

cbac10a

1 Parent(s): 55562ed

Add B2B

Browse files

Files changed (3) hide show

README.md +52 -1
app.py +152 -0
requirements.txt +3 -0

README.md CHANGED Viewed

@@ -10,4 +10,55 @@ pinned: false
 short_description: Multi label Arabic Dialect Identification
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 short_description: Multi label Arabic Dialect Identification
 ---
+---
+title: B2BERT Arabic Dialect Classifier
+emoji: 🌍
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+# B2BERT Arabic Dialect Classifier
+This Space uses the [B2BERT model](https://huggingface.co/AHAAM/B2BERT) to classify Arabic text into 18 different dialects.
+## Supported Dialects
+- Algeria
+- Bahrain
+- Egypt
+- Iraq
+- Jordan
+- Kuwait
+- Lebanon
+- Libya
+- Morocco
+- Oman
+- Palestine
+- Qatar
+- Saudi Arabia
+- Sudan
+- Syria
+- Tunisia
+- UAE
+- Yemen
+## How to Use
+1. Enter Arabic text in the input box
+2. Adjust the confidence threshold (default: 0.3)
+3. Click "Predict Dialects" to see results
+4. View confidence scores for each dialect
+## Model Details
+The model performs multi-label classification, meaning a single text can be valid in multiple dialects. Each dialect is evaluated independently with a confidence score.
+## Citation
+If you use this model, please cite the original work.

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import torch
+import gradio as gr
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+import pandas as pd
+# Load the model and tokenizer
+model_name = "AHAAM/B2BERT"
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# Define dialects
+DIALECTS = [
+    "Algeria", "Bahrain", "Egypt", "Iraq", "Jordan", "Kuwait", "Lebanon", "Libya",
+    "Morocco", "Oman", "Palestine", "Qatar", "Saudi_Arabia", "Sudan", "Syria",
+    "Tunisia", "UAE", "Yemen"
+]
+def predict_dialects_with_confidence(text, threshold=0.3):
+    """
+    Predict Arabic dialects for the given text and return confidence scores.
+    Args:
+        text: Input Arabic text
+        threshold: Confidence threshold for classification (default 0.3)
+    Returns:
+        DataFrame with dialects and their confidence scores
+    """
+    if not text.strip():
+        return pd.DataFrame({"Dialect": [], "Confidence": [], "Prediction": []})
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    model.to(device)
+    # Tokenize input
+    encodings = tokenizer(
+        [text],
+        truncation=True,
+        padding=True,
+        max_length=128,
+        return_tensors="pt"
+    )
+    input_ids = encodings["input_ids"].to(device)
+    attention_mask = encodings["attention_mask"].to(device)
+    # Get predictions
+    with torch.no_grad():
+        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
+        logits = outputs.logits
+    # Calculate probabilities
+    probabilities = torch.sigmoid(logits).cpu().numpy().reshape(-1)
+    # Create results dataframe
+    results = []
+    for dialect, prob in zip(DIALECTS, probabilities):
+        prediction = "✓ Valid" if prob >= threshold else "✗ Invalid"
+        results.append({
+            "Dialect": dialect,
+            "Confidence": f"{prob:.4f}",
+            "Prediction": prediction
+        })
+    # Sort by confidence (descending)
+    df = pd.DataFrame(results)
+    df = df.sort_values("Confidence", ascending=False, key=lambda x: x.astype(float))
+    return df
+def predict_wrapper(text, threshold):
+    """Wrapper function for Gradio interface"""
+    df = predict_dialects_with_confidence(text, threshold)
+    # Also create a summary of predicted dialects
+    predicted = df[df["Prediction"] == "✓ Valid"]["Dialect"].tolist()
+    summary = f"**Predicted Dialects ({len(predicted)}):** {', '.join(predicted) if predicted else 'None'}"
+    return df, summary
+# Create Gradio interface
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🌍 B2BERT Arabic Dialect Classifier
+        This model identifies which Arabic dialects are valid for a given text input.
+        Enter Arabic text below to see the dialect predictions and confidence scores.
+        **Supported Dialects:** Algeria, Bahrain, Egypt, Iraq, Jordan, Kuwait, Lebanon, Libya,
+        Morocco, Oman, Palestine, Qatar, Saudi Arabia, Sudan, Syria, Tunisia, UAE, Yemen
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            text_input = gr.Textbox(
+                label="Arabic Text Input",
+                placeholder="أدخل النص العربي هنا... (e.g., كيف حالك؟)",
+                lines=3,
+                rtl=True
+            )
+            threshold_slider = gr.Slider(
+                minimum=0.1,
+                maximum=0.9,
+                value=0.3,
+                step=0.05,
+                label="Confidence Threshold",
+                info="Dialects with confidence above this threshold will be marked as valid"
+            )
+            predict_button = gr.Button("🔍 Predict Dialects", variant="primary")
+        with gr.Column():
+            summary_output = gr.Markdown(label="Summary")
+            results_output = gr.Dataframe(
+                label="Detailed Results",
+                headers=["Dialect", "Confidence", "Prediction"],
+                datatype=["str", "str", "str"]
+            )
+    # Examples
+    gr.Examples(
+        examples=[
+            ["كيف حالك؟", 0.3],
+            ["شلونك؟", 0.3],
+            ["إزيك يا عم؟", 0.3],
+            ["شو أخبارك؟", 0.3],
+        ],
+        inputs=[text_input, threshold_slider],
+        label="Try these examples"
+    )
+    # Connect button to function
+    predict_button.click(
+        fn=predict_wrapper,
+        inputs=[text_input, threshold_slider],
+        outputs=[results_output, summary_output]
+    )
+    gr.Markdown(
+        """
+        ---
+        **Model:** [AHAAM/B2BERT](https://huggingface.co/AHAAM/B2BERT)
+        **Note:** The model uses a multi-label classification approach where each dialect is
+        independently evaluated. A single text can be valid in multiple dialects.
+        """
+    )
+# Start the Gradio app only when this file is run as a script, not when it is imported
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+transformers>=4.30.0
+torch>=2.0.0
+pandas>=1.5.0