alexshah committed on
Commit
2c57c6a
·
verified ·
1 Parent(s): 4b62d89

Remove tokenize

Browse files
Files changed (1) hide show
  1. EmbeddingModel.py +0 -62
EmbeddingModel.py CHANGED
@@ -127,68 +127,6 @@ class EmbeddingModel(BaseTransformer):
127
  embeddings[i] = hidden_states[i, mask[-1]]
128
 
129
  return embeddings
130
-
131
- def tokenize(self, texts, batch_size=32):
132
- """
133
- Tokenize texts with custom handling of special tokens and padding.
134
- This is a key method for SentenceTransformer compatibility, integrating
135
- our custom tokenization approach.
136
-
137
- Args:
138
- texts: A list of texts to tokenize or a single text
139
- batch_size: Batch size for tokenization (if needed)
140
-
141
- Returns:
142
- Dictionary with 'input_ids' and 'attention_mask'
143
- """
144
- tokenizer = self._tokenizer if hasattr(self, '_tokenizer') else self.tokenizer
145
-
146
- if isinstance(texts, str):
147
- texts = [texts]
148
-
149
- # Use our custom tokenization approach
150
- encodings = tokenizer(
151
- texts,
152
- max_length=self.max_seq_length - 2, # Reserve space for special tokens
153
- add_special_tokens=False,
154
- padding=False,
155
- truncation=True,
156
- )
157
-
158
- input_ids = []
159
- attention_mask = []
160
-
161
- # Add special tokens (BOS and EOS)
162
- for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"]):
163
- new_ids = (
164
- [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
165
- )
166
- new_mask = [1] * len(new_ids)
167
-
168
- input_ids.append(new_ids)
169
- attention_mask.append(new_mask)
170
-
171
- # Determine max sequence length in the batch for padding
172
- max_seq_length = max(len(ids) for ids in input_ids)
173
- padded_input_ids = []
174
- padded_attention_mask = []
175
-
176
- # Apply padding
177
- for ids, mask in zip(input_ids, attention_mask):
178
- padding_length = max_seq_length - len(ids)
179
- if padding_length > 0:
180
- padded_input_ids.append(
181
- ids + [tokenizer.pad_token_id] * padding_length
182
- )
183
- padded_attention_mask.append(mask + [0] * padding_length)
184
- else:
185
- padded_input_ids.append(ids[:max_seq_length])
186
- padded_attention_mask.append(mask[:max_seq_length])
187
-
188
- return {
189
- "input_ids": torch.tensor(padded_input_ids),
190
- "attention_mask": torch.tensor(padded_attention_mask),
191
- }
192
 
193
  def get_sentence_embedding_dimension(self):
194
  """Return the dimension of the sentence embeddings."""
 
127
  embeddings[i] = hidden_states[i, mask[-1]]
128
 
129
  return embeddings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
 
131
  def get_sentence_embedding_dimension(self):
132
  """Return the dimension of the sentence embeddings."""