ADI

Sleeping

App Files Files Community

AMR-KELEG committed on Mar 21, 2025

Commit

f3b7541

1 Parent(s): 3563942

Update the model name

Browse files

Files changed (2) hide show

app.py +52 -85
constants.py +23 -2

app.py CHANGED Viewed

@@ -1,9 +1,11 @@
 # Hint: this cheatsheet is magic! https://cheat-sheet.streamlit.app/
 import constants
 import pandas as pd
 import streamlit as st
 import matplotlib.pyplot as plt
 from transformers import BertForSequenceClassification, AutoTokenizer
 import altair as alt
 from altair import X, Y, Scale
@@ -11,6 +13,38 @@ import base64
 import re
 def preprocess_text(arabic_text):
     """Apply preprocessing to the given Arabic text.
@@ -57,42 +91,10 @@ tokenizer = AutoTokenizer.from_pretrained(constants.MODEL_NAME)
 model = load_model(constants.MODEL_NAME)
-def compute_ALDi(sentences):
-    """Computes the ALDi score for the given sentences.
-    Args:
-        sentences: A list of Arabic sentences.
-    Returns:
-        A list of ALDi scores for the given sentences.
-    """
-    progress_text = "Computing ALDi..."
-    my_bar = st.progress(0, text=progress_text)
-    BATCH_SIZE = 4
-    output_logits = []
-    preprocessed_sentences = [preprocess_text(s) for s in sentences]
-    for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
-        inputs = tokenizer(
-            preprocessed_sentences[first_index : first_index + BATCH_SIZE],
-            return_tensors="pt",
-            padding=True,
-        )
-        outputs = model(**inputs).logits.reshape(-1).tolist()
-        output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
-        my_bar.progress(
-            min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
-            text=progress_text,
-        )
-    my_bar.empty()
-    return output_logits
 @st.cache_data
 def render_metadata():
     """Renders the metadata."""
     html = r"""<p align="center">
         <a href="https://huggingface.co/AMR-KELEG/Sentence-ALDi"><img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-8A2BE2"></a>
         <a href="https://github.com/AMR-KELEG/ALDi"><img alt="GitHub" src="https://img.shields.io/badge/%F0%9F%93%A6%20GitHub-orange"></a>
@@ -101,10 +103,11 @@ def render_metadata():
     c = st.container()
     c.write(html, unsafe_allow_html=True)
-render_svg(open("assets/ALDi_logo.svg").read())
 render_metadata()
-tab1, tab2 = st.tabs(["Input a Sentence", "Upload a File"])
 with tab1:
     sent = st.text_input(
@@ -115,7 +118,7 @@ with tab1:
     clicked = st.button("Submit")
     if sent:
-        ALDi_score = compute_ALDi([sent])[0]
         ORANGE_COLOR = "#FF8000"
         fig, ax = plt.subplots(figsize=(8, 1))
@@ -128,55 +131,19 @@ with tab1:
         ax.spines[["right", "top"]].set_visible(False)
-        ax.barh(y=[0], width=[ALDi_score], color=ORANGE_COLOR)
-        ax.set_xlim(0, 1)
-        ax.set_ylim(-1, 1)
-        ax.set_title(f"ALDi score is: {round(ALDi_score, 3)}", color=ORANGE_COLOR)
-        ax.get_yaxis().set_visible(False)
-        ax.set_xlabel("ALDi score", color=ORANGE_COLOR)
         st.pyplot(fig)
         print(sent)
-        with open("logs.txt", "a") as f:
-            f.write(sent + "\n")
-with tab2:
-    file = st.file_uploader("Upload a file", type=["txt"])
-    if file is not None:
-        df = pd.read_csv(file, sep="\t", header=None)
-        df.columns = ["Sentence"]
-        df.reset_index(drop=True, inplace=True)
-        # TODO: Run the model
-        df["ALDi"] = compute_ALDi(df["Sentence"].tolist())
-        # A horizontal rule
-        st.markdown("""---""")
-        chart = (
-            alt.Chart(df.reset_index())
-            .mark_area(color="darkorange", opacity=0.5)
-            .encode(
-                x=X(field="index", title="Sentence Index"),
-                y=Y("ALDi", scale=Scale(domain=[0, 1])),
-            )
-        )
-        st.altair_chart(chart.interactive(), use_container_width=True)
-        col1, col2 = st.columns([4, 1])
-        with col1:
-            # Display the output
-            st.table(
-                df,
-            )
-        with col2:
-            # Add a download button
-            csv = convert_df(df)
-            st.download_button(
-                label=":file_folder: Download predictions as CSV",
-                data=csv,
-                file_name="ALDi_scores.csv",
-                mime="text/csv",
-            )

 # Hint: this cheatsheet is magic! https://cheat-sheet.streamlit.app/
 import constants
+import torch
 import pandas as pd
 import streamlit as st
 import matplotlib.pyplot as plt
 from transformers import BertForSequenceClassification, AutoTokenizer
+from constants import DIALECTS
 import altair as alt
 from altair import X, Y, Scale
 import re
def predict_binary_outcomes(model, tokenizer, text, threshold=0.3, dialects=None):
    """Predict the dialects in which the given text is valid.

    A sigmoid activation is applied independently to each dialect's logit.
    Dialects whose probability (sigmoid activation) is at or above the
    threshold (set by default to 0.3) are predicted as valid.

    The model is expected to generate one logit per dialect, in the same order
    as the label list. For the default ``DIALECTS`` list that order is:
    Algeria, Bahrain, Egypt, Iraq, Jordan, Kuwait, Lebanon, Libya, Morocco,
    Oman, Palestine, Qatar, Saudi_Arabia, Sudan, Syria, Tunisia, UAE, Yemen.

    Credits: method proposed by Ali Mekky, Lara Hassan, and Mohamed ELZeftawy from MBZUAI.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object with a ``logits`` tensor.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"`` tensors.
        text: The text to classify. The logits are flattened before being
            paired with the labels, so a single text is expected.
        threshold: Minimum probability for a dialect to be reported as valid.
        dialects: Optional label list, ordered like the model's logits.
            Defaults to the module-level ``DIALECTS``.

    Returns:
        The list of dialect labels whose probability is >= ``threshold``.
    """
    labels = DIALECTS if dialects is None else dialects

    # Run on the device the model already lives on. Choosing CUDA whenever it
    # is available would break a CPU-resident model with a device mismatch.
    device = getattr(model, "device", torch.device("cpu"))

    encodings = tokenizer(
        text, truncation=True, padding=True, max_length=128, return_tensors="pt"
    )
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Independent (multi-label) decision per dialect.
    is_valid = (torch.sigmoid(logits).reshape(-1) >= threshold).tolist()

    # Map positions back to the actual labels.
    return [label for label, valid in zip(labels, is_valid) if valid]
 def preprocess_text(arabic_text):
     """Apply preprocessing to the given Arabic text.
 model = load_model(constants.MODEL_NAME)
 @st.cache_data
 def render_metadata():
     """Renders the metadata."""
+    # TODO: Update!
     html = r"""<p align="center">
         <a href="https://huggingface.co/AMR-KELEG/Sentence-ALDi"><img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-8A2BE2"></a>
         <a href="https://github.com/AMR-KELEG/ALDi"><img alt="GitHub" src="https://img.shields.io/badge/%F0%9F%93%A6%20GitHub-orange"></a>
     c = st.container()
     c.write(html, unsafe_allow_html=True)
+# TODO: Update!
+# render_svg(open("assets/ALDi_logo.svg").read())
 render_metadata()
+tab1= st.tabs(["Input a Sentence"])
 with tab1:
     sent = st.text_input(
     clicked = st.button("Submit")
     if sent:
+        valid_dialects = predict_binary_outcomes(model, tokenizer, sent)
         ORANGE_COLOR = "#FF8000"
         fig, ax = plt.subplots(figsize=(8, 1))
         ax.spines[["right", "top"]].set_visible(False)
+        dialect_labels = [int(dialect in valid_dialects) for dialect in DIALECTS]
+        im = ax.imshow(dialect_labels, cmap="vanimo", alpha=0.5, vmin=0, vmax=1, annot=False)
+        ax.set_yticks(range(len(DIALECTS)))
+        ax.set_yticklabels(DIALECTS, fontsize=8)
+        ax.set_xticks([])
+        ax.set_title("Valid Dialects", color=ORANGE_COLOR)
+        # ax.barh(y=[0], width=[ALDi_score], color=ORANGE_COLOR)
+        # ax.set_xlim(0, 1)
+        # ax.set_ylim(-1, 1)
+        # ax.set_title(f"ALDi score is: {round(ALDi_score, 3)}", color=ORANGE_COLOR)
+        # ax.get_yaxis().set_visible(False)
+        # ax.set_xlabel("ALDi score", color=ORANGE_COLOR)
         st.pyplot(fig)
         print(sent)

constants.py CHANGED Viewed

@@ -1,4 +1,25 @@
 CHOICE_TEXT = "Input Text"
 CHOICE_FILE = "Upload File"
-TITLE = "ALDi: Arabic Level of Dialectness"
-MODEL_NAME = "AMR-KELEG/Sentence-ALDi"

 CHOICE_TEXT = "Input Text"
 CHOICE_FILE = "Upload File"
+TITLE = "ADI: Arabic Dialect Idenitifcation"
+MODEL_NAME = "AHAAM/B2BERT"
+DIALECTS = [
+    "Algeria",
+    "Bahrain",
+    "Egypt",
+    "Iraq",
+    "Jordan",
+    "Kuwait",
+    "Lebanon",
+    "Libya",
+    "Morocco",
+    "Oman",
+    "Palestine",
+    "Qatar",
+    "Saudi_Arabia",
+    "Sudan",
+    "Syria",
+    "Tunisia",
+    "UAE",
+    "Yemen",
+]