alverciito committed on
Commit ·
8021b9c
1
Parent(s): f663b1c
zero shot experiment (fix v2)
Browse files
model.py
CHANGED
|
@@ -169,8 +169,8 @@ class SentenceCoseNet(PreTrainedModel):
|
|
| 169 |
`(batch_size, sequence_length, emb_dim)`.
|
| 170 |
"""
|
| 171 |
# Convert to type:
|
| 172 |
-
x = input_ids.int()
|
| 173 |
-
mask = attention_mask
|
| 174 |
|
| 175 |
# Embedding and positional encoding:
|
| 176 |
x = self.model.embedding(x)
|
|
@@ -213,8 +213,8 @@ class SentenceCoseNet(PreTrainedModel):
|
|
| 213 |
Sentence embeddings of shape (B, D)
|
| 214 |
"""
|
| 215 |
# Convert to type:
|
| 216 |
-
x = input_ids.int()
|
| 217 |
-
mask = attention_mask
|
| 218 |
|
| 219 |
# Embedding and positional encoding:
|
| 220 |
x = self.model.embedding(x)
|
|
|
|
| 169 |
`(batch_size, sequence_length, emb_dim)`.
|
| 170 |
"""
|
| 171 |
# Convert to type:
|
| 172 |
+
x = input_ids.int()
|
| 173 |
+
mask = attention_mask if attention_mask is not None else None
|
| 174 |
|
| 175 |
# Embedding and positional encoding:
|
| 176 |
x = self.model.embedding(x)
|
|
|
|
| 213 |
Sentence embeddings of shape (B, D)
|
| 214 |
"""
|
| 215 |
# Convert to type:
|
| 216 |
+
x = input_ids.int()
|
| 217 |
+
mask = attention_mask if attention_mask is not None else None
|
| 218 |
|
| 219 |
# Embedding and positional encoding:
|
| 220 |
x = self.model.embedding(x)
|
research_files/bench.py
CHANGED
|
@@ -6,7 +6,7 @@ import os
|
|
| 6 |
import json
|
| 7 |
from benchmark.segmentation_benchmark.proposed import evaluate_proposed
|
| 8 |
from benchmark.segmentation_benchmark.heuristic import evaluate_textile
|
| 9 |
-
from benchmark.segmentation_benchmark.transformers import evaluate_lms
|
| 10 |
from benchmark.segmentation_benchmark.inference_proposed import evaluate_hf_proposed
|
| 11 |
|
| 12 |
|
|
@@ -41,7 +41,7 @@ if __name__ == '__main__':
|
|
| 41 |
# 'hiiamsid/sentence_similarity_spanish_es', # Spanish similarity - sBERT
|
| 42 |
# "jaimevera1107/all-MiniLM-L6-v2-similarity-es", # Spanish similarity - sBERT
|
| 43 |
# "google-bert/bert-base-multilingual-cased", # mBERT (google)
|
| 44 |
-
"sentence-transformers/LaBSE", # LaBSE (google)
|
| 45 |
"FacebookAI/xlm-roberta-base" # XLM-R (facebook)
|
| 46 |
]:
|
| 47 |
print("Evaluating Model (3 methods):", model)
|
|
|
|
| 6 |
import json
|
| 7 |
from benchmark.segmentation_benchmark.proposed import evaluate_proposed
|
| 8 |
from benchmark.segmentation_benchmark.heuristic import evaluate_textile
|
| 9 |
+
from benchmark.segmentation_benchmark.sota_transformers import evaluate_lms
|
| 10 |
from benchmark.segmentation_benchmark.inference_proposed import evaluate_hf_proposed
|
| 11 |
|
| 12 |
|
|
|
|
| 41 |
# 'hiiamsid/sentence_similarity_spanish_es', # Spanish similarity - sBERT
|
| 42 |
# "jaimevera1107/all-MiniLM-L6-v2-similarity-es", # Spanish similarity - sBERT
|
| 43 |
# "google-bert/bert-base-multilingual-cased", # mBERT (google)
|
| 44 |
+
# "sentence-transformers/LaBSE", # LaBSE (google)
|
| 45 |
"FacebookAI/xlm-roberta-base" # XLM-R (facebook)
|
| 46 |
]:
|
| 47 |
print("Evaluating Model (3 methods):", model)
|
research_files/benchmark/segmentation_benchmark/{transformers.py → sota_transformers.py}
RENAMED
|
File without changes
|
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py
CHANGED
|
@@ -45,7 +45,7 @@ def zero_shot_proposed(
|
|
| 45 |
|
| 46 |
with torch.no_grad():
|
| 47 |
for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
|
| 48 |
-
if not hasattr(model, '
|
| 49 |
inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt", padding=True, truncation=True, max_length=382)
|
| 50 |
inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt", padding=True, truncation=True, max_length=382)
|
| 51 |
inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
|
|
|
|
| 45 |
|
| 46 |
with torch.no_grad():
|
| 47 |
for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
|
| 48 |
+
if not hasattr(model, 'get_sentence_embedding'):
|
| 49 |
inputs_1 = tokenizer(batch['sentence1'], return_tensors="pt", padding=True, truncation=True, max_length=382)
|
| 50 |
inputs_2 = tokenizer(batch['sentence2'], return_tensors="pt", padding=True, truncation=True, max_length=382)
|
| 51 |
inputs_1 = {k: v.to(device) for k, v in inputs_1.items()}
|
research_files/zero_shot_tranfer_experiment.py
CHANGED
|
@@ -14,7 +14,7 @@ from benchmark.segmentation_benchmark.zero_shot_transfer import zero_shot_propos
|
|
| 14 |
__file_path__ = os.path.dirname(__file__)
|
| 15 |
|
| 16 |
if __name__ == '__main__':
|
| 17 |
-
zero_shot_proposed("hiiamsid/sentence_similarity_spanish_es", "nflechas/semantic_sentence_similarity_ES")
|
| 18 |
zero_shot_proposed("Alverciito/wikipedia_segmentation", "nflechas/semantic_sentence_similarity_ES")
|
| 19 |
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
|
| 20 |
# END OF FILE #
|
|
|
|
| 14 |
__file_path__ = os.path.dirname(__file__)
|
| 15 |
|
| 16 |
if __name__ == '__main__':
|
| 17 |
+
# zero_shot_proposed("hiiamsid/sentence_similarity_spanish_es", "nflechas/semantic_sentence_similarity_ES")
|
| 18 |
zero_shot_proposed("Alverciito/wikipedia_segmentation", "nflechas/semantic_sentence_similarity_ES")
|
| 19 |
# - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
|
| 20 |
# END OF FILE #
|