# Earlier approach via a custom wrapper class (kept for reference):
# from tokenizer import ParadigmTokenizerWrapper
# tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
# enc = tok("the singers were singing a very nice song!")
# print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))

import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

# Load the tokenizer from the local checkpoint; trust_remote_code is needed
# because the checkpoint ships custom tokenizer code.
tok = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True,
    local_files_only=True,
)
print(type(tok))

# Sanity checks on a well-formed sentence: encoding, token strings,
# and the special-token configuration.
print(tok("the singers were singing a very nice song!"))
print(tok.tokenize("the singers were singing a very nice song!"))
print(tok.special_tokens_map)
print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

# Out-of-vocabulary nonsense words should fall back to subword pieces.
enc = tok(
    "the skibidiboppers were sdjnajning a very nice song!",
    add_special_tokens=True,
    return_attention_mask=True,
)
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Inspect the post-processor via the HF object ...
print(tok.backend_tokenizer.post_processor)  # should NOT be None

# ... and double-check by reading tokenizer.json directly.
tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
print(tk.post_processor)  # should NOT be None

# Batch encoding with padding (assumes the checkpoint defines a pad token;
# otherwise set tok.pad_token first, or from_pretrained raises an error).
enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
print(enc["input_ids"])       # shorter row should end with pad ids
print(enc["attention_mask"])  # 1 for real tokens, 0 for pads
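
# --- extra sanity checks (a minimal sketch, not part of the original script) ---
# Assumption: the post-processor is configured to wrap sequences in bos/eos.
# Comparing the two encodings below shows exactly which ids it inserts.
text = "the singers were singing a very nice song!"
ids_plain = tok(text, add_special_tokens=False)["input_ids"]
ids_special = tok(text, add_special_tokens=True)["input_ids"]
print(ids_plain)
print(ids_special)  # expected: the same ids, wrapped with bos/eos ids if configured

# Round trip: decoding should recover the input, modulo whitespace handling.
print(tok.decode(ids_special, skip_special_tokens=True))  # should match `text`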