AMR-KELEG committed on
Commit
80852b8
·
1 Parent(s): 947aa37

Generate a prediction table

Browse files
Files changed (2) hide show
  1. app.py +24 -7
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,8 +1,6 @@
1
  # TODO: requirements.txt
2
- # TODO: Secrets?
3
- # TODO: Dataset_name to Secrets and not hard-coded!
4
-
5
  import os
 
6
  import streamlit as st
7
 
8
  import torch
@@ -12,7 +10,8 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer
12
 
13
  model_name = st.text_input("Enter a model's name on HF")
14
  # MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
15
- DIALECTS = ["Algeria",
 
16
  "Bahrain",
17
  "Egypt",
18
  "Iraq",
@@ -36,6 +35,7 @@ assert len(DIALECTS) == 18
36
  tokenizer = AutoTokenizer.from_pretrained(model_name)
37
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
38
 
 
39
  def predict_top_p(text, P=0.9):
40
  """Predict the top dialects with an accumulative confidence of at least P."""
41
  assert P <= 1 and P >= 0
@@ -56,12 +56,29 @@ def predict_top_p(text, P=0.9):
56
  return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
57
 
58
 
 
59
  dataset_name = "AMR-KELEG/test-dataset"
60
  dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
61
 
 
 
62
  for sample in tqdm(dataset):
63
  text = sample["sentence"]
64
- labels= [DIALECTS[i] for i in range(len(DIALECTS)) if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1]
 
 
 
 
65
  pred = predict_top_p(text)
66
- sample["pred"] = pred
67
- st.write("Text:", text)
 
 
 
 
 
 
 
 
 
 
 
1
  # TODO: requirements.txt
 
 
 
2
  import os
3
+ import pandas as pd
4
  import streamlit as st
5
 
6
  import torch
 
10
 
11
  model_name = st.text_input("Enter a model's name on HF")
12
  # MODEL_NAME = "AMR-KELEG/NADI2024-baseline"
13
+ DIALECTS = [
14
+ "Algeria",
15
  "Bahrain",
16
  "Egypt",
17
  "Iraq",
 
35
  tokenizer = AutoTokenizer.from_pretrained(model_name)
36
  model = AutoModelForSequenceClassification.from_pretrained(model_name)
37
 
38
+
39
  def predict_top_p(text, P=0.9):
40
  """Predict the top dialects with an accumulative confidence of at least P."""
41
  assert P <= 1 and P >= 0
 
56
  return [DIALECTS[i] for i, p in enumerate(predictions) if p == 1]
57
 
58
 
59
+ # Load the dataset
60
  dataset_name = "AMR-KELEG/test-dataset"
61
  dataset = datasets.load_dataset(dataset_name, token=os.environ["HF_TOKEN"])["test"]
62
 
63
+ sentences_labels, sentences_predictions = [], []
64
+
65
  for sample in tqdm(dataset):
66
  text = sample["sentence"]
67
+ labels = [
68
+ DIALECTS[i]
69
+ for i in range(len(DIALECTS))
70
+ if DIALECTS[i] in sample.keys() and int(sample[DIALECTS[i]]) == 1
71
+ ]
72
  pred = predict_top_p(text)
73
+ sentences_labels.append(labels)
74
+ sentences_predictions.append(pred)
75
+
76
+ st.table(
77
+ data=pd.DataFrame(
78
+ {
79
+ "text": dataset["sentence"],
80
+ "labels": sentences_labels,
81
+ "predictions": sentences_predictions,
82
+ }
83
+ )
84
+ )
requirements.txt CHANGED
@@ -1,3 +1,4 @@
1
  transformers
2
  torch
3
  datasets
 
 
1
  transformers
2
  torch
3
  datasets
4
+ pandas