Generate a prediction table
Browse files
- app.py +24 -7
- requirements.txt +1 -0
app.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
# TODO: requirements.txt
|
| 2 |
-
# TODO: Secrets?
|
| 3 |
-
# TODO: Dataset_name to Secrets and not hard-coded!
|
| 4 |
-
|
| 5 |
import os
|
|
|
|
| 6 |
import streamlit as st
|
| 7 |
|
| 8 |
import torch
|
|
@@ -12,7 +10,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
|
|
| 12 |
|
| 13 |
model_name = st.text_input("Enter a model's name on HF")
|
| 14 |
# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
|
| 15 |
-
DIALECTS = [
|
|
|
|
| 16 |
"Bahrain",
|
| 17 |
"Egypt",
|
| 18 |
"Iraq",
|
|
@@ -36,6 +35,7 @@ assert len(DIALECTS) == 18
|
|
| 36 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 37 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 38 |
|
|
|
|
| 39 |
def predict_top_p(text, P=0.9):
|
| 40 |
"""Predict the top dialects with an accumulative confidence of at least P."""
|
| 41 |
assert P <= 1 and P >= 0
|
|
@@ -56,12 +56,29 @@ def predict_top_p(text, P=0.9):
|
|
| 56 |
return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
|
| 57 |
|
| 58 |
|
|
|
|
| 59 |
dataset_name = "AMR-KELEG/test-dataset"
|
| 60 |
dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
|
| 61 |
|
|
|
|
|
|
|
| 62 |
for sample in tqdm(dataset):
|
| 63 |
text = sample["sentence"]
|
| 64 |
-
labels
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
pred = predict_top_p(text)
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# TODO: requirements.txt
|
|
|
|
|
|
|
|
|
|
| 2 |
import os
|
| 3 |
+
import pandas as pd
|
| 4 |
import streamlit as st
|
| 5 |
|
| 6 |
import torch
|
|
|
|
| 10 |
|
| 11 |
model_name = st.text_input("Enter a model's name on HF")
|
| 12 |
# MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
|
| 13 |
+
DIALECTS = [
|
| 14 |
+
"Algeria",
|
| 15 |
"Bahrain",
|
| 16 |
"Egypt",
|
| 17 |
"Iraq",
|
|
|
|
| 35 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 36 |
model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
| 37 |
|
| 38 |
+
|
| 39 |
def predict_top_p(text, P=0.9):
|
| 40 |
"""Predict the top dialects with an accumulative confidence of at least P."""
|
| 41 |
assert P <= 1 and P >= 0
|
|
|
|
| 56 |
return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
|
| 57 |
|
| 58 |
|
| 59 |
+
# Load the dataset
|
| 60 |
dataset_name = "AMR-KELEG/test-dataset"
|
| 61 |
dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
|
| 62 |
|
| 63 |
+
sentences_labels, sentences_predictions = [], []
|
| 64 |
+
|
| 65 |
for sample in tqdm(dataset):
|
| 66 |
text = sample["sentence"]
|
| 67 |
+
labels = [
|
| 68 |
+
DIALECTS[i]
|
| 69 |
+
for i in range(len(DIALECTS))
|
| 70 |
+
if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1
|
| 71 |
+
]
|
| 72 |
pred = predict_top_p(text)
|
| 73 |
+
sentences_labels.append(labels)
|
| 74 |
+
sentences_predictions.append(pred)
|
| 75 |
+
|
| 76 |
+
st.table(
|
| 77 |
+
data=pd.DataFrame(
|
| 78 |
+
{
|
| 79 |
+
"text": dataset["sentence"],
|
| 80 |
+
"labels": sentences_labels,
|
| 81 |
+
"predictions": sentences_predictions,
|
| 82 |
+
}
|
| 83 |
+
)
|
| 84 |
+
)
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
transformers
|
| 2 |
torch
|
| 3 |
datasets
|
|
|
|
|
|
| 1 |
transformers
|
| 2 |
torch
|
| 3 |
datasets
|
| 4 |
+
pandas
|