| from transformers import AutoTokenizer |
| from tokenizers import Tokenizer |
| from tokenizers.processors import TemplateProcessing |
| import os, json |
|
|
# Directory of the tokenizer checkpoint this script patches IN PLACE:
# tokenizer.json and tokenizer_config.json below are read from and written
# back to this path. (Name suggests a GPT-2-style BabyLM 10M model — TODO confirm.)
TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"
|
|
| |
# Load the existing tokenizer from the local checkpoint so we can read its
# special tokens. local_files_only avoids any hub lookup; trust_remote_code
# allows custom tokenizer classes shipped with the checkpoint.
tmp = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
print("HF loads from:", tmp.name_or_path)


# Resolve the BOS/EOS token strings and their ids. Fail loudly with a real
# exception: `assert` is stripped when Python runs with -O, which would let
# a tokenizer with missing specials silently corrupt the template below.
bos, eos = tmp.bos_token, tmp.eos_token
if not bos or not eos:
    raise ValueError("BOS/EOS not defined in special_tokens_map.json / tokenizer_config.json")
bos_id, eos_id = tmp.convert_tokens_to_ids([bos, eos])
|
|
| |
# Rewrite the fast tokenizer's serialized post-processor so every encoded
# sequence is wrapped in BOS ... EOS (and pairs get a trailing EOS with
# type-id 1), then persist the change back to tokenizer.json.
tok_json = os.path.join(TOK_DIR, "tokenizer.json")
tk = Tokenizer.from_file(tok_json)
wrap_specials = TemplateProcessing(
    single=f"{bos} $A {eos}",
    pair=f"{bos} $A {eos} $B:1 {eos}:1",
    special_tokens=[(bos, bos_id), (eos, eos_id)],
)
tk.post_processor = wrap_specials
tk.save(tok_json)
|
|
| |
# Mirror the BOS/EOS choice into tokenizer_config.json so the HF-side config
# stays consistent with the patched fast-tokenizer file.
cfg_path = os.path.join(TOK_DIR, "tokenizer_config.json")
with open(cfg_path, "r", encoding="utf-8") as fh:
    cfg = json.load(fh)
cfg.update({"bos_token": bos, "eos_token": eos})
with open(cfg_path, "w", encoding="utf-8") as fh:
    json.dump(cfg, fh, indent=2)
|
|
| |
# Reload from disk to confirm the new post-processor actually round-trips.
tok = AutoTokenizer.from_pretrained(
    TOK_DIR, trust_remote_code=True, local_files_only=True
)
print("post-processor:", tok.backend_tokenizer.post_processor)


# Smoke test: the printed token list should start/end with the special tokens.
sample = "the singers were singing a very nice song!"
enc = tok(sample, add_special_tokens=True, return_attention_mask=False)
tokens = tok.convert_ids_to_tokens(enc["input_ids"])
print(tokens)