Update tokenizer_pi05.py
Browse files
- tokenizer_pi05.py +3 -3
tokenizer_pi05.py
CHANGED
|
@@ -14,14 +14,14 @@ class PaligemmaTokenizer(PreTrainedTokenizer):
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
vocab_files_names = {"vocab_file": "tokenizer.model"}
|
| 17 |
-
model_input_names = ["input_ids", "attention_mask"]
|
| 18 |
|
| 19 |
def __init__(self, vocab_file: str, max_len: int = 48, **kwargs):
|
| 20 |
-
super().__init__
|
| 21 |
-
|
| 22 |
self.vocab_file = vocab_file
|
| 23 |
self._max_len = int(max_len)
|
| 24 |
self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=str(vocab_file))
|
|
|
|
|
|
|
| 25 |
self.pad_token_id = 0
|
| 26 |
|
| 27 |
# ---- minimal HF plumbing ----
|
|
|
|
| 14 |
"""
|
| 15 |
|
| 16 |
vocab_files_names = {"vocab_file": "tokenizer.model"}
|
|
|
|
| 17 |
|
| 18 |
def __init__(self, vocab_file: str, max_len: int = 48, **kwargs):
|
| 19 |
+
# Must init _tokenizer BEFORE super().__init__ because parent accesses vocab_size
|
|
|
|
| 20 |
self.vocab_file = vocab_file
|
| 21 |
self._max_len = int(max_len)
|
| 22 |
self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=str(vocab_file))
|
| 23 |
+
|
| 24 |
+
super().__init__(**kwargs)
|
| 25 |
self.pad_token_id = 0
|
| 26 |
|
| 27 |
# ---- minimal HF plumbing ----
|