| |
|
|
| |
| |
| |
|
|
|
|
"""Smoke-test script for a locally trained GPT-2-style tokenizer.

Loads the tokenizer from MODEL_DIR and prints: its concrete class, the
encoding of an in-vocabulary sentence, the special-token configuration,
the subword splitting of out-of-vocabulary words, the post-processor
(via both the fast tokenizer and the raw tokenizer.json), and the
padding/attention-mask behaviour on a small batch.
"""

import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Single source of truth for the checkpoint location (previously this
# path was hard-coded in two separate places).
MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"


def main() -> None:
    """Run the tokenizer inspection checks and print the results."""
    # trust_remote_code: the checkpoint may ship a custom tokenizer class;
    # local_files_only: never hit the Hub for this local experiment.
    tok = AutoTokenizer.from_pretrained(
        MODEL_DIR, trust_remote_code=True, local_files_only=True
    )
    print(type(tok))

    # In-vocabulary sentence: full encoding dict, then the token strings.
    sentence = "the singers were singing a very nice song!"
    print(tok(sentence))
    print(tok.tokenize(sentence))

    # Special-token configuration.
    print(tok.special_tokens_map)
    print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

    # Out-of-vocabulary words: check subword splitting and whether
    # special tokens are actually inserted by the post-processor.
    enc = tok(
        "the skibidiboppers were sdjnajning a very nice song!",
        add_special_tokens=True,
        return_attention_mask=True,
    )
    print(tok.convert_ids_to_tokens(enc["input_ids"]))

    # Post-processor as exposed by the fast tokenizer wrapper...
    print(tok.backend_tokenizer.post_processor)
    # ...and as read directly from tokenizer.json (the two should agree).
    tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
    print(tk.post_processor)

    # Padding behaviour: both sequences padded to max_length=6,
    # returned as PyTorch tensors; inspect ids and attention mask.
    enc = tok(
        ["a b", "a b c d"],
        padding="max_length",
        max_length=6,
        return_tensors="pt",
    )
    print(enc["input_ids"])
    print(enc["attention_mask"])


if __name__ == "__main__":
    main()