Update tokenizer_script.py
tokenizer_script.py  (+12 −0)
@@ -76,6 +76,18 @@ class CharacterTokenizer(PreTrainedTokenizer):
 
         return (vocab_file,)
 
+    def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
+        encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
+        # Handle max_length (truncation)
+        if max_length is not None:
+            encoded_texts = [ids[:max_length] for ids in encoded_texts]
+        # Handle padding
+        if padding:
+            pad_id = self.vocab.get(self.pad_token, 0)
+            max_len = max(len(ids) for ids in encoded_texts) if max_length is None else max_length
+            encoded_texts = [ids + [pad_id] * (max_len - len(ids)) for ids in encoded_texts]
+        return encoded_texts
+
     def train(self, texts):
         # Start with special tokens
         vocab = {}
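
For clarity, below is a standalone sketch of the truncation-and-padding logic this commit adds. It is a hypothetical reproduction, not part of the commit: toy_encode stands in for CharacterTokenizer.encode (one id per character via ord), and the pad_id parameter replaces the self.vocab.get(self.pad_token, 0) lookup, so the behavior can be run without the class.

# Hypothetical standalone sketch (not part of the commit).
def toy_encode(text):
    # Stand-in for CharacterTokenizer.encode: one id per character.
    return [ord(c) for c in text]

def batch_encode(texts, padding=False, max_length=None, pad_id=0):
    encoded = [toy_encode(t) for t in texts]
    if max_length is not None:
        # Truncate every sequence to at most max_length ids.
        encoded = [ids[:max_length] for ids in encoded]
    if padding:
        # Pad to the longest sequence, or to max_length when one is given.
        target = max(len(ids) for ids in encoded) if max_length is None else max_length
        encoded = [ids + [pad_id] * (target - len(ids)) for ids in encoded]
    return encoded

print(batch_encode(["hi", "hello"], padding=True))
# [[104, 105, 0, 0, 0], [104, 101, 108, 108, 111]]

Two behaviors worth noting. First, when padding=True and max_length is set, every sequence is padded out to max_length even if all inputs are shorter; when max_length is None, sequences are padded only to the longest one in the batch. Second, the committed method passes add_special_tokens positionally to self.encode; that is fine if CharacterTokenizer defines its own encode(text, add_special_tokens), but the inherited PreTrainedTokenizer.encode would bind that positional argument to text_pair, so passing it as a keyword would be the safer call.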