# PyTorch / gpt2
# gpt2-10M-parfind-eng / segmentation_tests.py
# Uploaded by achille-fusco via huggingface_hub (commit c2760fe, verified)
#from tokenizer import ParadigmTokenizerWrapper
#tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
#enc = tok("the singers were singing a very nice song!")
#print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))
"""Smoke tests for the ParFindFast GPT-2 tokenizer.

Loads the custom tokenizer via AutoTokenizer (trust_remote_code), prints its
type, encodings, and special-token configuration, checks that the fast
tokenizer's post-processor is present (both through the HF wrapper and by
reading tokenizer.json directly), and verifies padding / attention-mask
behavior on a small batch.
"""
import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Single source of truth for the model directory (was duplicated inline at
# the AutoTokenizer and Tokenizer.from_file call sites).
MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
print(type(tok))
print(tok("the singers were singing a very nice song!"))
print(tok.tokenize("the singers were singing a very nice song!"))
print(tok.special_tokens_map)
print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

# Nonsense words should still encode (presumably via sub-word/byte fallback —
# the printed tokens below let us eyeball how OOV material is segmented).
enc = tok(
    "the skibidiboppers were sdjnajning a very nice song!",
    add_special_tokens=True,
    return_attention_mask=True,
)
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Post-processor check via the HF wrapper object.
print(tok.backend_tokenizer.post_processor)  # should NOT be None

# Double-check by reading tokenizer.json directly, bypassing the wrapper.
tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
print(tk.post_processor)  # should NOT be None

# Padding check: the shorter sequence must be padded out and masked.
enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
print(enc["input_ids"])  # shorter row should end with pad ids
print(enc["attention_mask"])  # 1 for real tokens, 0 for pads