Update tokenizer_script.py
Browse files- tokenizer_script.py +1 -2
tokenizer_script.py
CHANGED
|
@@ -76,7 +76,7 @@ class CharacterTokenizer(PreTrainedTokenizer):
|
|
| 76 |
|
| 77 |
return (vocab_file,)
|
| 78 |
|
| 79 |
-
def batch_encode(self, texts, add_special_tokens=False, padding=False, max_length=None):
|
| 80 |
encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
|
| 81 |
# Handle max_length (truncation)
|
| 82 |
if max_length is not None:
|
|
@@ -123,7 +123,6 @@ class CharacterTokenizer(PreTrainedTokenizer):
|
|
| 123 |
def convert_tokens_to_string(self, tokens):
|
| 124 |
return "".join(tokens)
|
| 125 |
|
| 126 |
-
|
| 127 |
@classmethod
|
| 128 |
def from_json(cls, vocab_file, **kwargs):
|
| 129 |
with open(vocab_file, 'r', encoding='utf-8') as f:
|
|
|
|
| 76 |
|
| 77 |
return (vocab_file,)
|
| 78 |
|
| 79 |
+
def batch_encode(self, texts, add_special_tokens=False, padding=False, truncation=True, max_length=None):
|
| 80 |
encoded_texts = [self.encode(text, add_special_tokens) for text in texts]
|
| 81 |
# Handle max_length (truncation)
|
| 82 |
if max_length is not None:
|
|
|
|
| 123 |
def convert_tokens_to_string(self, tokens):
|
| 124 |
return "".join(tokens)
|
| 125 |
|
|
|
|
| 126 |
@classmethod
|
| 127 |
def from_json(cls, vocab_file, **kwargs):
|
| 128 |
with open(vocab_file, 'r', encoding='utf-8') as f:
|