alexshah
/

armembed

Feature Extraction

sentence-transformers

text-embeddings-inference

Model card Files Files and versions

alexshah committed on May 6, 2025

Commit

2c57c6a

·

verified ·

1 Parent(s): 4b62d89

Remove tokenize

Files changed (1) hide show

EmbeddingModel.py +0 -62

EmbeddingModel.py CHANGED Viewed

@@ -127,68 +127,6 @@ class EmbeddingModel(BaseTransformer):
                     embeddings[i] = hidden_states[i, mask[-1]]
         return embeddings
-    def tokenize(self, texts, batch_size=32):
-        """
-        Tokenize texts with custom handling of special tokens and padding.
-        This is a key method for SentenceTransformer compatibility, integrating
-        our custom tokenization approach.
-        Args:
-            texts: A list of texts to tokenize or a single text
-            batch_size: Batch size for tokenization (if needed)
-        Returns:
-            Dictionary with 'input_ids' and 'attention_mask'
-        """
-        tokenizer = self._tokenizer if hasattr(self, '_tokenizer') else self.tokenizer
-        if isinstance(texts, str):
-            texts = [texts]
-        # Use our custom tokenization approach
-        encodings = tokenizer(
-            texts,
-            max_length=self.max_seq_length - 2,  # Reserve space for special tokens
-            add_special_tokens=False,
-            padding=False,
-            truncation=True,
-        )
-        input_ids = []
-        attention_mask = []
-        # Add special tokens (BOS and EOS)
-        for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"]):
-            new_ids = (
-                [tokenizer.bos_token_id] + ids + [tokenizer.eos_token_id]
-            )
-            new_mask = [1] * len(new_ids)
-            input_ids.append(new_ids)
-            attention_mask.append(new_mask)
-        # Determine max sequence length in the batch for padding
-        max_seq_length = max(len(ids) for ids in input_ids)
-        padded_input_ids = []
-        padded_attention_mask = []
-        # Apply padding
-        for ids, mask in zip(input_ids, attention_mask):
-            padding_length = max_seq_length - len(ids)
-            if padding_length > 0:
-                padded_input_ids.append(
-                    ids + [tokenizer.pad_token_id] * padding_length
-                )
-                padded_attention_mask.append(mask + [0] * padding_length)
-            else:
-                padded_input_ids.append(ids[:max_seq_length])
-                padded_attention_mask.append(mask[:max_seq_length])
-        return {
-            "input_ids": torch.tensor(padded_input_ids),
-            "attention_mask": torch.tensor(padded_attention_mask),
-        }
     def get_sentence_embedding_dimension(self):
         """Return the dimension of the sentence embeddings."""

                     embeddings[i] = hidden_states[i, mask[-1]]
         return embeddings
     def get_sentence_embedding_dimension(self):
         """Return the dimension of the sentence embeddings."""