sunshk committed on
Commit
7f62302
·
verified ·
1 Parent(s): 6325e17

Update tokenizer_pi05.py

Browse files
Files changed (1) hide show
  1. tokenizer_pi05.py +3 -3
tokenizer_pi05.py CHANGED
@@ -14,14 +14,14 @@ class PaligemmaTokenizer(PreTrainedTokenizer):
14
  """
15
 
16
  vocab_files_names = {"vocab_file": "tokenizer.model"}
17
- model_input_names = ["input_ids", "attention_mask"]
18
 
19
  def __init__(self, vocab_file: str, max_len: int = 48, **kwargs):
20
- super().__init__(**kwargs)
21
-
22
  self.vocab_file = vocab_file
23
  self._max_len = int(max_len)
24
  self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=str(vocab_file))
 
 
25
  self.pad_token_id = 0
26
 
27
  # ---- minimal HF plumbing ----
 
14
  """
15
 
16
  vocab_files_names = {"vocab_file": "tokenizer.model"}
 
17
 
18
  def __init__(self, vocab_file: str, max_len: int = 48, **kwargs):
19
+ # Must initialize _tokenizer BEFORE calling super().__init__() because the parent constructor accesses vocab_size
 
20
  self.vocab_file = vocab_file
21
  self._max_len = int(max_len)
22
  self._tokenizer = sentencepiece.SentencePieceProcessor(model_file=str(vocab_file))
23
+
24
+ super().__init__(**kwargs)
25
  self.pad_token_id = 0
26
 
27
  # ---- minimal HF plumbing ----