alverciito committed on
Commit
dd4f3a8
·
1 Parent(s): d0a3e2d

similarity method fixed for two embedding sets

Browse files
model.py CHANGED
@@ -214,13 +214,18 @@ class SentenceCoseNet(PreTrainedModel):
214
  )
215
  return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
216
 
217
- def similarity(self, embeddings: torch.Tensor) -> torch.Tensor:
 
218
  """
219
  Compute cosine similarity scores between two sets of embeddings.
220
 
221
  Args:
222
- embeddings (torch.Tensor):
223
- Tensor of shape (B, S, D) containing two sets of
 
 
 
 
224
  embeddings concatenated along the first dimension.
225
 
226
  Returns:
@@ -228,9 +233,10 @@ class SentenceCoseNet(PreTrainedModel):
228
  Similarity scores of shape (B, S, S)
229
  """
230
  # Normalize embeddings
231
- x = self.model.distance_layer(embeddings)
 
232
  # Compute cosine similarity
233
- return x
234
 
235
  def forward(
236
  self,
 
214
  )
215
  return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
216
 
217
+ @staticmethod
218
+ def similarity(embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
219
  """
220
  Compute cosine similarity scores between two sets of embeddings.
221
 
222
  Args:
223
+ embeddings_1 (torch.Tensor):
224
+ Tensor of shape (B, S, D) containing the first set of
225
+ embeddings concatenated along the first dimension.
226
+
227
+ embeddings_2 (torch.Tensor):
228
+ Tensor of shape (B, S, D) containing the second set of
229
  embeddings concatenated along the first dimension.
230
 
231
  Returns:
 
233
  Similarity scores of shape (B, S, S)
234
  """
235
  # Normalize embeddings
236
+ embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=-1)
237
+ embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=-1)
238
  # Compute cosine similarity
239
+ return (embeddings_1 * embeddings_2).sum(dim=-1)
240
 
241
  def forward(
242
  self,
pelt_bert-base-multilingual-cased.json ADDED
The diff for this file is too large to render. See raw diff
 
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py CHANGED
@@ -28,16 +28,26 @@ def zero_shot_proposed(
28
  data_repo = input("Enter the dataset repository path or identifier: ")
29
 
30
  # Loading:
31
- tokenizer = AutoTokenizer.from_pretrained(model_repo)
32
- model = AutoModel.from_pretrained(model_repo)
33
  dataset = load_dataset(data_repo)
34
 
35
  for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
36
 
37
 
38
- inputs = tokenizer(..., return_tensors="pt", padding=True, truncation=True)
39
- embeddings = model.get_sentence_embedding(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], normalize=True)
40
- similarity = model.similarity(embeddings)
 
 
 
 
 
 
 
 
 
 
41
 
42
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
43
  # END OF FILE #
 
28
  data_repo = input("Enter the dataset repository path or identifier: ")
29
 
30
  # Loading:
31
+ tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
32
+ model = AutoModel.from_pretrained(model_repo, trust_remote_code=True)
33
  dataset = load_dataset(data_repo)
34
 
35
  for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
36
 
37
 
38
+ inputs_1 = tokenizer(batch['sentence_1'], return_tensors="pt", padding=True, truncation=True)
39
+ inputs_2 = tokenizer(batch['sentence_2'], return_tensors="pt", padding=True, truncation=True)
40
+ embeddings_1 = model.get_sentence_embedding(
41
+ input_ids=inputs_1["input_ids"],
42
+ attention_mask=inputs_1["attention_mask"],
43
+ normalize=False
44
+ )
45
+ embeddings_2 = model.get_sentence_embedding(
46
+ input_ids=inputs_2["input_ids"],
47
+ attention_mask=inputs_2["attention_mask"],
48
+ normalize=False
49
+ )
50
+ similarity = model.similarity(embeddings_1, embeddings_2)
51
 
52
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
53
  # END OF FILE #
research_files/zero_shot_tranfer_experiment.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Entry-point script for the zero-shot transfer experiment.

Created by Alberto Palomo Alonso,
Universidad de Alcala - Escuela Politecnica Superior.

Running this module launches the proposed zero-shot segmentation
benchmark defined in the benchmark package.
"""
import os

from benchmark.segmentation_benchmark.zero_shot_transfer import zero_shot_proposed

# Absolute directory of this script (kept for parity with sibling scripts).
__file_path__ = os.path.dirname(__file__)


if __name__ == '__main__':
    zero_shot_proposed()