viju.sudhi@iais.fraunhofer.de committed on
Commit
cd40de0
·
1 Parent(s): ec12e0a

adding sentence-transformers specific files

Browse files
0_Model/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
{
    "model_name_or_path": "/mnt/mydrive/embedding-models-fine-tuned/merged_models/7B_EU24_2_5t_with_BIG_dataset_multiple_neg_full_model"
}
1_TokenPooling/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
{
    "dimension": 4096
}
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
{
    "__version__": {
        "sentence_transformers": "3.2.1",
        "transformers": "4.41.2",
        "pytorch": "2.4.0+cu121"
    },
    "prompts": {},
    "default_prompt_name": null,
    "similarity_fn_name": null
}
embedding_model.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
from typing import Union, List, Dict, Tuple

import torch
from sentence_transformers import models

# Sub-directory (inside the saved sentence-transformers module) that holds
# the wrapped model's config; matches the "0_Model" path in modules.json.
MODEL_PATH_IN_MODULE = "0_Model"


class EmbeddingModel(models.Transformer):
    """sentence-transformers Transformer wrapper with custom tokenization.

    Customizes tokenization (EOS-token appending for plain strings,
    fixed-length padding) and config (de)serialization so the module can be
    round-tripped via ``save()``/``load()``.
    """

    def __init__(self, model_name_or_path: str, *args, **kwargs):
        # Remember the path so get_config_dict()/save() can round-trip it.
        self.model_name_or_path = model_name_or_path
        super().__init__(model_name_or_path, *args, **kwargs)

    def tokenize(
        self,
        texts: Union[List[str], List[Dict], List[Tuple[str, str]]],
        padding: Union[str, bool] = True,
    ) -> Dict[str, torch.Tensor]:
        """Tokenizes a text and maps tokens to token-ids.

        NOTE(review): the ``padding`` argument is accepted for interface
        compatibility but ignored — the tokenizer is always called with
        ``padding="max_length"`` and ``max_length=512``. Confirm this is
        intentional.
        """
        output = {}
        if isinstance(texts[0], str):
            # Plain strings: append the EOS token so downstream last-token
            # pooling has a stable final marker.
            texts = [x + self.tokenizer.eos_token for x in texts]
            to_tokenize = [texts]
        elif isinstance(texts[0], dict):
            # Single-entry dicts: keep each key in "text_keys" and tokenize
            # the corresponding values.
            to_tokenize = []
            output["text_keys"] = []
            for lookup in texts:
                text_key, text = next(iter(lookup.items()))
                to_tokenize.append(text)
                output["text_keys"].append(text_key)
            to_tokenize = [to_tokenize]
        else:
            # (text_a, text_b) tuples: tokenize the two sides as paired batches.
            batch1, batch2 = [], []
            for text_tuple in texts:
                batch1.append(text_tuple[0])
                batch2.append(text_tuple[1])
            to_tokenize = [batch1, batch2]

        output.update(
            self.tokenizer(
                *to_tokenize,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
        )

        # this is specific to OpenGPT-X model
        output.pop("token_type_ids", None)

        return output

    def get_config_dict(self) -> Dict[str, str]:
        """Return the JSON-serializable config used by save()/load()."""
        return {"model_name_or_path": self.model_name_or_path}

    def save(self, save_dir: str, **kwargs) -> None:
        """Write the module's config JSONs to *save_dir*.

        NOTE(review): weight/tokenizer saving is commented out below, so only
        the config files are written — confirm the model weights are
        persisted elsewhere.
        """
        # self.auto_model.save_pretrained(save_dir, safe_serialization=True)
        # self.tokenizer.save_pretrained(save_dir)

        with open(os.path.join(save_dir, "sentence_bert_config.json"), "w+") as f:
            json.dump(self.get_config_dict(), f, indent=4)

        model_path = os.path.join(save_dir, MODEL_PATH_IN_MODULE)
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(model_path, exist_ok=True)

        with open(f"{model_path}/config.json", "w+") as f:
            json.dump(self.get_config_dict(), f, indent=4)

    @staticmethod
    def load(load_dir: str, **kwargs) -> "EmbeddingModel":
        """Re-instantiate the module from the config written by save()."""
        with open(os.path.join(load_dir, "config.json")) as fIn:
            config = json.load(fIn)
        return EmbeddingModel(**config)
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
[
    {
        "idx": 0,
        "name": "0",
        "path": "0_Model",
        "type": "embedding_model.EmbeddingModel"
    },
    {
        "idx": 1,
        "name": "1",
        "path": "1_TokenPooling",
        "type": "token_pooling.TokenPooling"
    },
    {
        "idx": 2,
        "name": "2",
        "path": "2_Normalize",
        "type": "sentence_transformers.models.Normalize"
    }
]
sentence_bert_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
{
    "model_name_or_path": "/mnt/mydrive/embedding-models-fine-tuned/merged_models/7B_EU24_2_5t_with_BIG_dataset_multiple_neg_full_model"
}
token_pooling.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os

import torch
import torch.nn as nn


class TokenPooling(nn.Module):
    """Last-token pooling module for sentence-transformers pipelines.

    Reduces per-token embeddings to one sentence embedding by selecting the
    embedding of each sequence's last attended (non-padding) token.
    """

    def __init__(self, dimension: int = 4096) -> None:
        super().__init__()
        # Embedding size reported via get_sentence_embedding_dimension().
        self.dimension = dimension

    def forward(
        self, features: dict[str, torch.Tensor], **kwargs
    ) -> dict[str, torch.Tensor]:
        """Add a "sentence_embedding" entry computed from the token embeddings.

        Expects "token_embeddings" and "attention_mask" in *features*;
        mutates and returns the same dict.
        """
        token_embeddings = features["token_embeddings"]
        attention_mask = features["attention_mask"]

        embeddings = self.pool(
            last_hidden_state=token_embeddings, attention_mask=attention_mask
        )
        features["sentence_embedding"] = embeddings
        return features

    def pool(
        self, last_hidden_state: torch.Tensor, attention_mask: torch.Tensor
    ) -> torch.Tensor:
        """Here, we take the embedding of the last token from the last layer.

        With left padding the last position is valid for every sequence, so a
        direct slice works; with right padding each sequence's final attended
        position is gathered individually.
        """
        # Every row attends at the last position -> batch is left-padded.
        left_padding = attention_mask[:, -1].sum() == attention_mask.shape[0]
        if left_padding:
            return last_hidden_state[:, -1]
        # Right padding: index of the last attended token in each sequence.
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_state.shape[0]
        return last_hidden_state[
            torch.arange(batch_size, device=last_hidden_state.device).long(),
            sequence_lengths.long(),
        ]

    def get_sentence_embedding_dimension(self) -> int:
        """Return the output embedding dimension."""
        return self.dimension

    def get_config_dict(self) -> dict[str, int]:
        """Return the JSON-serializable config used by save()/load()."""
        return {"dimension": self.dimension}

    def save(self, save_dir: str, **kwargs) -> None:
        """Write this module's config.json into *save_dir*."""
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "config.json"), "w+") as f:
            json.dump(self.get_config_dict(), f, indent=4)

    @staticmethod
    def load(load_dir: str, **kwargs) -> "TokenPooling":
        """Re-instantiate the module from the config.json written by save()."""
        with open(os.path.join(load_dir, "config.json")) as fIn:
            config = json.load(fIn)
        return TokenPooling(**config)