alverciito committed on
Commit
dd4f3a8
·
1 Parent(s): d0a3e2d

similarity method fixed for two embedding sets

Browse files
model.py CHANGED
@@ -214,13 +214,18 @@ class SentenceCoseNet(PreTrainedModel):
214
  )
215
  return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
216
 
217
- def similarity(self, embeddings: torch.Tensor) -> torch.Tensor:
 
218
  """
219
  Compute cosine similarity scores between two sets of embeddings.
220
 
221
  Args:
222
- embeddings (torch.Tensor):
223
- Tensor of shape (B, S, D) containing two sets of
 
 
 
 
224
  embeddings concatenated along the first dimension.
225
 
226
  Returns:
@@ -228,9 +233,10 @@ class SentenceCoseNet(PreTrainedModel):
228
  Similarity scores of shape (B, S, S)
229
  """
230
  # Normalize embeddings
231
- x = self.model.distance_layer(embeddings)
 
232
  # Compute cosine similarity
233
- return x
234
 
235
  def forward(
236
  self,
 
214
  )
215
  return torch.nn.functional.normalize(pooled, p=2, dim=-1) if normalize else pooled
216
 
217
+ @staticmethod
218
+ def similarity(embeddings_1: torch.Tensor, embeddings_2: torch.Tensor) -> torch.Tensor:
219
  """
220
  Compute cosine similarity scores between two sets of embeddings.
221
 
222
  Args:
223
+ embeddings_1 (torch.Tensor):
224
+ Tensor of shape (B, S, D) containing the first set of
225
+ embeddings concatenated along the first dimension.
226
+
227
+ embeddings_2 (torch.Tensor):
228
+ Tensor of shape (B, S, D) containing the second set of
229
  embeddings concatenated along the first dimension.
230
 
231
  Returns:
 
233
  Similarity scores of shape (B, S, S)
234
  """
235
  # Normalize embeddings
236
+ embeddings_1 = torch.nn.functional.normalize(embeddings_1, p=2, dim=-1)
237
+ embeddings_2 = torch.nn.functional.normalize(embeddings_2, p=2, dim=-1)
238
  # Compute cosine similarity
239
+ return (embeddings_1 * embeddings_2).sum(dim=-1)
240
 
241
  def forward(
242
  self,
pelt_bert-base-multilingual-cased.json ADDED
The diff for this file is too large to render. See raw diff
 
research_files/benchmark/segmentation_benchmark/zero_shot_transfer.py CHANGED
@@ -28,16 +28,26 @@ def zero_shot_proposed(
28
  data_repo = input("Enter the dataset repository path or identifier: ")
29
 
30
  # Loading:
31
- tokenizer = AutoTokenizer.from_pretrained(model_repo)
32
- model = AutoModel.from_pretrained(model_repo)
33
  dataset = load_dataset(data_repo)
34
 
35
  for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
36
 
37
 
38
- inputs = tokenizer(..., return_tensors="pt", padding=True, truncation=True)
39
- embeddings = model.get_sentence_embedding(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], normalize=True)
40
- similarity = model.similarity(embeddings)
 
 
 
 
 
 
 
 
 
 
41
 
42
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
43
  # END OF FILE #
 
28
  data_repo = input("Enter the dataset repository path or identifier: ")
29
 
30
  # Loading:
31
+ tokenizer = AutoTokenizer.from_pretrained(model_repo, trust_remote_code=True)
32
+ model = AutoModel.from_pretrained(model_repo, trust_remote_code=True)
33
  dataset = load_dataset(data_repo)
34
 
35
  for batch in tqdm.tqdm(dataset['test'].batch(batch_size)):
36
 
37
 
38
+ inputs_1 = tokenizer(batch['sentence_1'], return_tensors="pt", padding=True, truncation=True)
39
+ inputs_2 = tokenizer(batch['sentence_2'], return_tensors="pt", padding=True, truncation=True)
40
+ embeddings_1 = model.get_sentence_embedding(
41
+ input_ids=inputs_1["input_ids"],
42
+ attention_mask=inputs_1["attention_mask"],
43
+ normalize=False
44
+ )
45
+ embeddings_2 = model.get_sentence_embedding(
46
+ input_ids=inputs_2["input_ids"],
47
+ attention_mask=inputs_2["attention_mask"],
48
+ normalize=False
49
+ )
50
+ similarity = model.similarity(embeddings_1, embeddings_2)
51
 
52
  # - x - x - x - x - x - x - x - x - x - x - x - x - x - x - #
53
  # END OF FILE #
research_files/zero_shot_tranfer_experiment.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Entry-point script for the zero-shot transfer experiment.

Created by Alberto Palomo Alonso,
Universidad de Alcala - Escuela Politecnica Superior.

Running this module launches the proposed zero-shot segmentation
benchmark defined in the benchmark package.
"""
import os

from benchmark.segmentation_benchmark.zero_shot_transfer import zero_shot_proposed

# Absolute directory of this script (kept for parity with sibling scripts).
__file_path__ = os.path.dirname(__file__)


if __name__ == '__main__':
    zero_shot_proposed()