Update tokenizer_script.py
tokenizer_script.py  (+12 −0)
@@ -76,6 +76,18 @@ class CharacterTokenizer(PreTrainedTokenizer):
 
         return (vocab_file,)
 
+    def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
+        encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
+        # Handle max_length (truncation)
+        if max_length is not None:
+            encoded_texts = [ids[:max_length] for ids in encoded_texts]
+        # Handle padding
+        if padding:
+            pad_id = self.vocab.get(self.pad_token, 0)
+            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
+            encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
+        return encoded_texts
+
     def train(self, texts):
         # Start with special tokens
         vocab = {}
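
For clarity, below is a standalone sketch of the truncation-and-padding logic this commit adds. It is a hypothetical reproduction, not part of the commit: toy_encode stands in for CharacterTokenizer.encode (one id per character via ord), and the pad_id parameter replaces the self.vocab.get(self.pad_token, 0) lookup, so the behavior can be run without the class.

# Hypothetical standalone sketch (not part of the commit).
def toy_encode(text):
    # Stand-in for CharacterTokenizer.encode: one id per character.
    return [ord(c) for c in text]

def batch_encode(texts, padding=False, max_length=None, pad_id=0):
    encoded = [toy_encode(t) for t in texts]
    if max_length is not None:
        # Truncate every sequence to at most max_length ids.
        encoded = [ids[:max_length] for ids in encoded]
    if padding:
        # Pad to the longest sequence, or to max_length when one is given.
        target = max(len(ids) for ids in encoded) if max_length is None else max_length
        encoded = [ids + [pad_id] * (target - len(ids)) for ids in encoded]
    return encoded

print(batch_encode(["hi", "hello"], padding=True))
# [[104, 105, 0, 0, 0], [104, 101, 108, 108, 111]]

Two behaviors worth noting. First, when padding=True and max_length is set, every sequence is padded out to max_length even if all inputs are shorter; when max_length is None, sequences are padded only to the longest one in the batch. Second, the committed method passes add_special_tokens positionally to self.encode; that is fine if CharacterTokenizer defines its own encode(text, add_special_tokens), but the inherited PreTrainedTokenizer.encode would bind that positional argument to text_pair, so passing it as a keyword would be the safer call.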