|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Diagnostic script for a locally trained GPT-2 BabyLM tokenizer.

Loads the tokenizer from MODEL_DIR and prints a series of checks:
tokenization of an in-vocabulary and an out-of-vocabulary sentence,
the special-token configuration, the post-processor (seen both through
the HF wrapper and by loading tokenizer.json directly), and padding
behaviour on a small batch.
"""

import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Single source of truth for the model directory (was previously repeated
# inline in two places, which risked the two copies drifting apart).
MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

# NOTE(security): trust_remote_code=True executes arbitrary Python shipped
# with the checkpoint. Acceptable here because the directory is local and
# self-authored; do not copy this flag for untrusted checkpoints.
tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)

# Which concrete tokenizer class did Auto* resolve to?
print(type(tok))

# In-vocabulary sentence: full encoding (input_ids + attention_mask) ...
print(tok("the singers were singing a very nice song!"))

# ... and the surface token strings for the same sentence.
print(tok.tokenize("the singers were singing a very nice song!"))

# Special-token configuration.
print(tok.special_tokens_map)
print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

# Out-of-vocabulary words: check subword fallback and whether special
# tokens are inserted by the post-processor.
enc = tok("the skibidiboppers were sdjnajning a very nice song!",
          add_special_tokens=True,
          return_attention_mask=True)
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Post-processor as exposed by the HF fast-tokenizer wrapper ...
print(tok.backend_tokenizer.post_processor)

# ... and as loaded directly from tokenizer.json with the `tokenizers`
# library, to verify the two views agree.
tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
print(tk.post_processor)

# Padding sanity check: pad a short batch to max_length and inspect the
# resulting ids and attention masks.
enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
print(enc["input_ids"])
print(enc["attention_mask"])
|
|
|