alverciito committed on
Commit ·
dd4f3a8
1
Parent(s): d0a3e2d
similarity method fixed for two embedding sets
Browse files
model.py
CHANGED
|
@@ -214,13 +214,18 @@ class SentenceCoseNet(PreTrainedModel):
|
|
| 214 |
)
|
| 215 |
return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
|
| 216 |
|
| 217 |
-
|
|
|
|
| 218 |
"""
|
| 219 |
Compute cosine similarity scores between two sets of embeddings.
|
| 220 |
|
| 221 |
Args:
|
| 222 |
-
|
| 223 |
-
Tensor of shape (B, S, D) containing
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
embeddings concatenated along the first dimension.
|
| 225 |
|
| 226 |
Returns:
|
|
@@ -228,9 +233,10 @@ class SentenceCoseNet(PreTrainedModel):
|
|
| 228 |
Similarity scores of shape (B, S, S)
|
| 229 |
"""
|
| 230 |
# Normalize embeddings
|
| 231 |
-
|
|
|
|
| 232 |
# Compute cosine similarity
|
| 233 |
-
return
|
| 234 |
|
| 235 |
def forward(
|
| 236 |
self,
|
|
|
|
| 214 |
)
|
| 215 |
return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
|
| 216 |
|
| 217 |
+
@staticmethod
|
| 218 |
+
def similarity(embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
|
| 219 |
"""
|
| 220 |
Compute cosine similarity scores between two sets of embeddings.
|
| 221 |
|
| 222 |
Args:
|
| 223 |
+
embeddings_1 (torch.Tensor):
|
| 224 |
+
Tensor of shape (B, S, D) containing the first set of
|
| 225 |
+
embeddings concatenated along the first dimension.
|
| 226 |
+
|
| 227 |
+
embeddings_2 (torch.Tensor):
|
| 228 |
+
Tensor of shape (B, S, D) containing the second set of
|
| 229 |
embeddings concatenated along the first dimension.
|
| 230 |
|
| 231 |
Returns:
|
|
|
|
| 233 |
Similarity scores of shape (B, S, S)
|
| 234 |
"""
|
| 235 |
# Normalize embeddings
|
| 236 |
+
embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=-1)
|
| 237 |
+
embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=-1)
|
| 238 |
# Compute cosine similarity
|
| 239 |
+
return (embeddings_1 * embeddings_2).sum(dim=-1)
|
| 240 |
|
| 241 |
def forward(
|
| 242 |
self,
|
pelt_bert-base-multilingual-cased.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py
CHANGED
|
@@ -28,16 +28,26 @@ def zero_shot_proposed(
|
|
| 28 |
data_repo = input("Enter the dataset repository path or identifier: ")
|
| 29 |
|
| 30 |
# Loading:
|
| 31 |
-
tokenizer = AutoTokenizer.from_pretrained(model_repo)
|
| 32 |
-
model = AutoModel.from_pretrained(model_repo)
|
| 33 |
dataset = load_dataset(data_repo)
|
| 34 |
|
| 35 |
for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
|
| 36 |
|
| 37 |
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
|
| 43 |
# END OF FILE #
|
|
|
|
| 28 |
data_repo = input("Enter the dataset repository path or identifier: ")
|
| 29 |
|
| 30 |
# Loading:
|
| 31 |
+
tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
|
| 32 |
+
model = AutoModel.from_pretrained(model_repo, trust_remote_code=True)
|
| 33 |
dataset = load_dataset(data_repo)
|
| 34 |
|
| 35 |
for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
|
| 36 |
|
| 37 |
|
| 38 |
+
inputs_1 = tokenizer(batch['sentence_1'], return_tensors="pt", padding=True, truncation=True)
|
| 39 |
+
inputs_2 = tokenizer(batch['sentence_2'], return_tensors="pt", padding=True, truncation=True)
|
| 40 |
+
embeddings_1 = model.get_sentence_embedding(
|
| 41 |
+
input_ids=inputs_1["input_ids"],
|
| 42 |
+
attention_mask=inputs_1["attention_mask"],
|
| 43 |
+
normalize=False
|
| 44 |
+
)
|
| 45 |
+
embeddings_2 = model.get_sentence_embedding(
|
| 46 |
+
input_ids=inputs_2["input_ids"],
|
| 47 |
+
attention_mask=inputs_2["attention_mask"],
|
| 48 |
+
normalize=False
|
| 49 |
+
)
|
| 50 |
+
similarity = model.similarity(embeddings_1, embeddings_2)
|
| 51 |
|
| 52 |
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
|
| 53 |
# END OF FILE #
|
research_files/zero_shot_tranfer_experiment.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------ #
# Author: Alberto Palomo Alonso                                      #
# Universidad de Alcalá - Escuela Politécnica Superior               #
# ------------------------------------------------------------------ #
# Entry-point script: runs the zero-shot transfer experiment for the #
# proposed segmentation model.                                       #
# ------------------------------------------------------------------ #
import os

from benchmark.segmentation_benchmark.zero_shot_transfer import zero_shot_proposed


# Directory containing this script (kept for parity with sibling scripts).
__file_path__ = os.path.dirname(__file__)


if __name__ == '__main__':
    # Delegate all work (model/dataset prompts, batching, scoring)
    # to the benchmark module.
    zero_shot_proposed()
# ------------------------------------------------------------------ #
#                            END OF FILE                             #
# ------------------------------------------------------------------ #
|