Spaces:

AMR-KELEG
/

MLADI

Running

AMR-KELEG committed on Oct 23, 2024

Commit

24cf6c5

1 Parent(s): f818d64

Track the constants and eval modules

Files changed (2) hide show

constants.py ADDED Viewed

+# Names of the dialect classes (one per country), listed alphabetically.
+# eval_utils.predict_top_p uses a class index as a position in this list, so
+# the order presumably has to match the classifier's output label order --
+# TODO confirm against the model config before reordering or inserting entries.
+DIALECTS = [
+    "Algeria",
+    "Bahrain",
+    "Egypt",
+    "Iraq",
+    "Jordan",
+    "Kuwait",
+    "Lebanon",
+    "Libya",
+    "Morocco",
+    "Oman",
+    "Palestine",
+    "Qatar",
+    "Saudi_Arabia",
+    "Sudan",
+    "Syria",
+    "Tunisia",
+    "UAE",
+    "Yemen",
+]
+# Import-time sanity check: guards against an entry being added or dropped by accident.
+assert len(DIALECTS) == 18
+# Subset of DIALECTS (same spelling, same relative order).
+# NOTE(review): presumably the dialects for which labelled evaluation data is
+# available -- the selection criterion is not visible in this file; confirm.
+DIALECTS_WITH_LABELS = [
+    "Algeria",
+    "Egypt",
+    "Iraq",
+    "Jordan",
+    "Morocco",
+    "Palestine",
+    "Saudi_Arabia",
+    "Sudan",
+    "Syria",
+    "Tunisia",
+    "Yemen",
+]
+# Import-time sanity check: guards against an entry being added or dropped by accident.
+assert len(DIALECTS_WITH_LABELS) == 11

eval_utils.py ADDED Viewed

+import torch
+from constants import DIALECTS, DIALECTS_WITH_LABELS
+def predict_top_p(model, tokenizer, text, P=0.9):
+    """Predict the top dialects with an accumulative confidence of at least P."""
+    assert P <= 1 and P >= 0
+    logits = model(**tokenizer(text, return_tensors="pt")).logits
+    probabilities = torch.softmax(logits, dim=1).flatten().tolist()
+    topk_predictions = torch.topk(logits, 18).indices.flatten().tolist()
+    predictions = [0 for _ in range(18)]
+    total_prob = 0
+    for i in range(18):
+        total_prob += probabilities[topk_predictions[i]]
+        predictions[topk_predictions[i]] = 1
+        if total_prob >= P:
+            break
+    return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]