Remove tokenize
Browse files- EmbeddingModel.py +0 -62
EmbeddingModel.py
CHANGED
|
@@ -127,68 +127,6 @@ class EmbeddingModel(BaseTransformer):
|
|
| 127 |
embeddings[i] = hidden_states[i, mask[-1]]
|
| 128 |
|
| 129 |
return embeddings
|
| 130 |
-
|
| 131 |
-
def tokenize(self, texts, batch_size=32):
    """
    Tokenize texts, wrap each sequence in BOS/EOS, and right-pad the batch.

    The underlying tokenizer is called without special tokens and without
    padding; BOS and EOS ids are then added by hand and every sequence is
    padded on the right to the longest sequence in the batch.

    Args:
        texts: A single string or a sized sequence of strings to tokenize.
        batch_size: Accepted for signature compatibility; this method does
            not read it and always tokenizes all of ``texts`` in one call.

    Returns:
        Dictionary with 'input_ids' and 'attention_mask', each a
        ``torch.long`` tensor with one row per text. The attention mask is
        1 for real tokens (including BOS/EOS) and 0 for padding. For an
        empty ``texts`` sequence both tensors have shape ``(0, 0)``.
    """
    tokenizer = self._tokenizer if hasattr(self, "_tokenizer") else self.tokenizer

    if isinstance(texts, str):
        texts = [texts]

    # An empty batch would otherwise hit max() on an empty sequence below and
    # raise ValueError; return empty tensors instead. len() rather than
    # truthiness so array-like inputs are not ambiguous.
    if len(texts) == 0:
        empty = torch.empty((0, 0), dtype=torch.long)
        return {"input_ids": empty, "attention_mask": empty.clone()}

    encodings = tokenizer(
        texts,
        max_length=self.max_seq_length - 2,  # Reserve space for BOS and EOS
        add_special_tokens=False,
        padding=False,
        truncation=True,
    )

    # Add special tokens (BOS and EOS) around every truncated sequence.
    bos_id = tokenizer.bos_token_id
    eos_id = tokenizer.eos_token_id
    input_ids = [[bos_id, *ids, eos_id] for ids in encodings["input_ids"]]

    # Pad on the right up to the longest sequence in this batch. No sequence
    # can exceed that length, so no truncation is needed at this point.
    longest = max(len(ids) for ids in input_ids)
    pad_id = tokenizer.pad_token_id

    padded_input_ids = []
    padded_attention_mask = []
    for ids in input_ids:
        padding_length = longest - len(ids)
        padded_input_ids.append(ids + [pad_id] * padding_length)
        padded_attention_mask.append([1] * len(ids) + [0] * padding_length)

    return {
        "input_ids": torch.tensor(padded_input_ids, dtype=torch.long),
        "attention_mask": torch.tensor(padded_attention_mask, dtype=torch.long),
    }
|
| 192 |
|
| 193 |
def get_sentence_embedding_dimension(self):
|
| 194 |
"""Return the dimension of the sentence embeddings."""
|
|
|
|
| 127 |
embeddings[i] = hidden_states[i, mask[-1]]
|
| 128 |
|
| 129 |
return embeddings
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
def get_sentence_embedding_dimension(self):
|
| 132 |
"""Return the dimension of the sentence embeddings."""
|