Spaces:
Runtime error
Runtime error
updated token error
Browse files
app.py
CHANGED
|
@@ -42,12 +42,12 @@ class MLMDataset(Dataset):
|
|
| 42 |
def __init__(self,sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
|
| 43 |
self.sentence = sentence
|
| 44 |
self.tokenizer = tokenizer
|
| 45 |
-
self.num_samples =
|
| 46 |
|
| 47 |
self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
|
| 48 |
-
self.batch_input = self.tensor_input.repeat(
|
| 49 |
|
| 50 |
-
self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],
|
| 51 |
self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
|
| 52 |
|
| 53 |
# Added by Chris Emezue on 29.01.2023
|
|
@@ -178,25 +178,26 @@ models = get_model_infos(multilingual=None)
|
|
| 178 |
selected_models = st.multiselect("Select of number of models you would like to compare", models['id']
|
| 179 |
|
| 180 |
)
|
| 181 |
-
|
| 182 |
-
progress_text = "Computing recommendation Scores"
|
| 183 |
-
st.write(help(st.progress))
|
| 184 |
-
my_bar = st.progress(0)
|
| 185 |
|
|
|
|
|
|
|
| 186 |
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
scores={}
|
| 189 |
-
for index, model_id in enumerate(selected_models):
|
| 190 |
-
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 191 |
-
model = AutoModelWithLMHead.from_pretrained(model_id)
|
| 192 |
-
if model_id == 'castorini/afriberta_base':
|
| 193 |
-
tokenizer.model_max_length = 512
|
| 194 |
-
MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
|
| 195 |
-
MLM_UNK_TOKEN = tokenizer.unk_token_id
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def __init__(self,sentence,tokenizer,num_samples,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
|
| 43 |
self.sentence = sentence
|
| 44 |
self.tokenizer = tokenizer
|
| 45 |
+
self.num_samples = len(self.sentence) - 2
|
| 46 |
|
| 47 |
self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
|
| 48 |
+
self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
|
| 49 |
|
| 50 |
+
self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
|
| 51 |
self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
|
| 52 |
|
| 53 |
# Added by Chris Emezue on 29.01.2023
|
|
|
|
| 178 |
selected_models = st.multiselect("Select of number of models you would like to compare", models['id']
|
| 179 |
|
| 180 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
+
run = st.button("Get Scores")
|
| 183 |
+
if run:
|
| 184 |
|
| 185 |
+
progress_text = "Computing recommendation Scores"
|
| 186 |
+
my_bar = st.progress(0)
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
|
| 189 |
+
scores={}
|
| 190 |
+
for index, model_id in enumerate(selected_models):
|
| 191 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 192 |
+
model = AutoModelWithLMHead.from_pretrained(model_id)
|
| 193 |
+
if model_id == 'castorini/afriberta_base':
|
| 194 |
+
tokenizer.model_max_length = 512
|
| 195 |
+
MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
|
| 196 |
+
MLM_UNK_TOKEN = tokenizer.unk_token_id
|
| 197 |
|
| 198 |
+
BATCH_SIZE = 1
|
| 199 |
+
score = get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,None,BATCH_SIZE)
|
| 200 |
+
scores[model_id] = score
|
| 201 |
+
my_bar.progress(index + 1, text=progress_text)
|
| 202 |
+
scores = sort_dictionary(scores)
|
| 203 |
+
st.write("Our recommendation is:", scores)
|