Spaces:
Runtime error
Runtime error
updated token error
Browse files
app.py
CHANGED
|
@@ -39,14 +39,15 @@ def get_model_infos(multilingual="multilingual"):
|
|
| 39 |
return df
|
| 40 |
|
| 41 |
class MLMDataset(Dataset):
|
| 42 |
-
def __init__(self,sentence,tokenizer,
|
| 43 |
self.sentence = sentence
|
| 44 |
self.tokenizer = tokenizer
|
| 45 |
-
self.num_samples = len(self.sentence) - 2
|
| 46 |
|
| 47 |
self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
|
| 48 |
-
self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
|
| 49 |
|
|
|
|
|
|
|
|
|
|
| 50 |
self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
|
| 51 |
self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
|
| 52 |
|
|
@@ -77,8 +78,8 @@ class MLMDataset(Dataset):
|
|
| 77 |
return self.masked_input[idx], self.mask[idx],self.labels[idx], self.unk_mask[idx]
|
| 78 |
|
| 79 |
|
| 80 |
-
def get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,
|
| 81 |
-
mlm_dataset = MLMDataset(sentence,tokenizer,
|
| 82 |
dataloader = DataLoader(mlm_dataset,batch_size=BATCH_SIZE)
|
| 83 |
|
| 84 |
score =1
|
|
@@ -119,7 +120,7 @@ def get_sense_score(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,num_sa
|
|
| 119 |
|
| 120 |
tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
|
| 121 |
batch_input = tensor_input.repeat(num_samples, 1)
|
| 122 |
-
|
| 123 |
random_ids = np.random.choice([i for i in range(1,tensor_input.size(1)-1)],num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
|
| 124 |
random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
|
| 125 |
|
|
@@ -190,7 +191,7 @@ if run:
|
|
| 190 |
for index, model_id in enumerate(selected_models):
|
| 191 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 192 |
model = AutoModelWithLMHead.from_pretrained(model_id)
|
| 193 |
-
if model_id
|
| 194 |
tokenizer.model_max_length = 512
|
| 195 |
MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
|
| 196 |
MLM_UNK_TOKEN = tokenizer.unk_token_id
|
|
|
|
| 39 |
return df
|
| 40 |
|
| 41 |
class MLMDataset(Dataset):
|
| 42 |
+
def __init__(self,sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN):
|
| 43 |
self.sentence = sentence
|
| 44 |
self.tokenizer = tokenizer
|
|
|
|
| 45 |
|
| 46 |
self.tensor_input = self.tokenizer(sentence, return_tensors='pt')['input_ids']
|
|
|
|
| 47 |
|
| 48 |
+
self.num_samples = self.tensor_input.size()[-1] - 2
|
| 49 |
+
|
| 50 |
+
self.batch_input = self.tensor_input.repeat(self.num_samples, 1)
|
| 51 |
self.random_ids = np.random.choice([i for i in range(1,self.tensor_input.size(1)-1)],self.num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
|
| 52 |
self.random_ids = torch.Tensor(self.random_ids).long().unsqueeze(0).T
|
| 53 |
|
|
|
|
| 78 |
return self.masked_input[idx], self.mask[idx],self.labels[idx], self.unk_mask[idx]
|
| 79 |
|
| 80 |
|
| 81 |
+
def get_sense_score_batched(sentence,tokenizer,model,MLM_MASK_TOKEN,MLM_UNK_TOKEN,BATCH_SIZE):
|
| 82 |
+
mlm_dataset = MLMDataset(sentence,tokenizer,MLM_MASK_TOKEN,MLM_UNK_TOKEN)
|
| 83 |
dataloader = DataLoader(mlm_dataset,batch_size=BATCH_SIZE)
|
| 84 |
|
| 85 |
score =1
|
|
|
|
| 120 |
|
| 121 |
tensor_input = tokenizer(sentence, return_tensors='pt')['input_ids']
|
| 122 |
batch_input = tensor_input.repeat(num_samples, 1)
|
| 123 |
+
|
| 124 |
random_ids = np.random.choice([i for i in range(1,tensor_input.size(1)-1)],num_samples,replace=False) # ensuring that the masking is not done on the BOS and EOS tokens since they are not connected to the sentence itself.
|
| 125 |
random_ids = torch.Tensor(random_ids).long().unsqueeze(0).T
|
| 126 |
|
|
|
|
| 191 |
for index, model_id in enumerate(selected_models):
|
| 192 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 193 |
model = AutoModelWithLMHead.from_pretrained(model_id)
|
| 194 |
+
if model_id.startswith("castorini"):
|
| 195 |
tokenizer.model_max_length = 512
|
| 196 |
MLM_MASK_TOKEN = tokenizer.mask_token_id #[(103, '[MASK]')]
|
| 197 |
MLM_UNK_TOKEN = tokenizer.unk_token_id
|