Spaces:

AMR-KELEG
/

ALDi

Running

App Files Community

AMR-KELEG committed on Oct 10, 2023

Commit

ab3e62e

1 Parent(s): c464f06

Apply black

Browse files

Files changed (1) hide show

app.py +38 -5

app.py CHANGED Viewed

@@ -9,6 +9,28 @@ import altair as alt
 from altair import X, Y, Scale
 import base64
 @st.cache_data
 def render_svg(svg):
@@ -36,22 +58,33 @@ model = load_model(constants.MODEL_NAME)
 def compute_ALDi(sentences):
-    # TODO: Perform inference in batches
     progress_text = "Computing ALDi..."
     my_bar = st.progress(0, text=progress_text)
     BATCH_SIZE = 4
     output_logits = []
-    for first_index in range(0, len(sentences), BATCH_SIZE):
         inputs = tokenizer(
-            sentences[first_index : first_index + BATCH_SIZE],
             return_tensors="pt",
             padding=True,
         )
         outputs = model(**inputs).logits.reshape(-1).tolist()
         output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
         my_bar.progress(
-            min((first_index + BATCH_SIZE) / len(sentences), 1), text=progress_text
         )
     my_bar.empty()
     return output_logits
@@ -93,7 +126,7 @@ with tab1:
         print(sent)
         with open("logs.txt", "a") as f:
-            f.write(sent+"\n")
 with tab2:
     file = st.file_uploader("Upload a file", type=["txt"])

 from altair import X, Y, Scale
 import base64
+import re
+def preprocess_text(arabic_text):
+    """Apply preprocessing to the given Arabic text.
+    Args:
+        arabic_text: The Arabic text to be preprocessed.
+    Returns:
+        The preprocessed Arabic text.
+    """
+    no_urls = re.sub(
+        r"(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b",
+        "",
+        arabic_text,
+        flags=re.MULTILINE,
+    )
+    no_english = re.sub(r"[a-zA-Z]", "", no_urls)
+    return no_english
 @st.cache_data
 def render_svg(svg):
 def compute_ALDi(sentences):
+    """Computes the ALDi score for the given sentences.
+    Args:
+        sentences: A list of Arabic sentences.
+    Returns:
+        A list of ALDi scores for the given sentences.
+    """
     progress_text = "Computing ALDi..."
     my_bar = st.progress(0, text=progress_text)
     BATCH_SIZE = 4
     output_logits = []
+    preprocessed_sentences = [preprocess_text(s) for s in sentences]
+    for first_index in range(0, len(preprocessed_sentences), BATCH_SIZE):
         inputs = tokenizer(
+            preprocessed_sentences[first_index : first_index + BATCH_SIZE],
             return_tensors="pt",
             padding=True,
         )
         outputs = model(**inputs).logits.reshape(-1).tolist()
         output_logits = output_logits + [max(min(o, 1), 0) for o in outputs]
         my_bar.progress(
+            min((first_index + BATCH_SIZE) / len(preprocessed_sentences), 1),
+            text=progress_text,
         )
     my_bar.empty()
     return output_logits
         print(sent)
         with open("logs.txt", "a") as f:
+            f.write(sent + "\n")
 with tab2:
     file = st.file_uploader("Upload a file", type=["txt"])