# Earlier approach via a custom wrapper class (kept for reference):
# from tokenizer import ParadigmTokenizerWrapper
# tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
# enc = tok("the singers were singing a very nice song!")
# print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))

import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

# Load the tokenizer from the local checkpoint; trust_remote_code is needed
# because the checkpoint ships custom tokenizer code.
tok = AutoTokenizer.from_pretrained(
    MODEL_DIR,
    trust_remote_code=True,
    local_files_only=True,
)
print(type(tok))

# Sanity checks on a well-formed sentence: encoding, token strings,
# and the special-token configuration.
print(tok("the singers were singing a very nice song!"))
print(tok.tokenize("the singers were singing a very nice song!"))
print(tok.special_tokens_map)
print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

# Out-of-vocabulary nonsense words should fall back to subword pieces.
enc = tok(
    "the skibidiboppers were sdjnajning a very nice song!",
    add_special_tokens=True,
    return_attention_mask=True,
)
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Inspect the post-processor via the HF object ...
print(tok.backend_tokenizer.post_processor)  # should NOT be None

# ... and double-check by reading tokenizer.json directly.
tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
print(tk.post_processor)  # should NOT be None

# Batch encoding with padding (assumes the checkpoint defines a pad token;
# otherwise set tok.pad_token first, or from_pretrained raises an error).
enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
print(enc["input_ids"])       # shorter row should end with pad ids
print(enc["attention_mask"])  # 1 for real tokens, 0 for pads
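
# --- extra sanity checks (a minimal sketch, not part of the original script) ---
# Assumption: the post-processor is configured to wrap sequences in bos/eos.
# Comparing the two encodings below shows exactly which ids it inserts.
text = "the singers were singing a very nice song!"
ids_plain = tok(text, add_special_tokens=False)["input_ids"]
ids_special = tok(text, add_special_tokens=True)["input_ids"]
print(ids_plain)
print(ids_special)  # expected: the same ids, wrapped with bos/eos ids if configured

# Round trip: decoding should recover the input, modulo whitespace handling.
print(tok.decode(ids_special, skip_special_tokens=True))  # should match `text`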