PyTorch
gpt2
gpt2-10M-parfind-eng / bos_eos_patch.py
achille-fusco's picture
Upload folder using huggingface_hub
c2760fe verified
raw
history blame
1.72 kB
"""Patch a saved GPT-2 tokenizer so encoding emits BOS/EOS special tokens.

Steps:
  0) load the tokenizer to confirm which path HF resolves,
  1) read the configured bos/eos tokens and their ids,
  2) attach a TemplateProcessing post-processor to tokenizer.json,
  3) mirror bos/eos into tokenizer_config.json,
  4) reload from disk and verify the post-processor survived,
  5) encode a sample sentence to confirm specials are added.
"""
import json
import os

from transformers import AutoTokenizer
from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing

# Local directory of the saved tokenizer to patch (edited in place).
TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

# 0) sanity: what path will HF load?
tmp = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
print("HF loads from:", tmp.name_or_path)

# 1) get bos/eos tokens & ids
bos, eos = tmp.bos_token, tmp.eos_token
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# and a missing BOS/EOS must abort before we rewrite files on disk.
if not (bos and eos):
    raise ValueError("BOS/EOS not defined in special_tokens_map.json / tokenizer_config.json")
bos_id, eos_id = tmp.convert_tokens_to_ids([bos, eos])

# 2) patch tokenizer.json with a TemplateProcessing post-processor
tok_json = os.path.join(TOK_DIR, "tokenizer.json")
tk = Tokenizer.from_file(tok_json)
tk.post_processor = TemplateProcessing(
    single=f"{bos} $A {eos}",
    # Pair template: second segment gets type id 1 (`:1`) and its own EOS.
    pair=f"{bos} $A {eos} $B:1 {eos}:1",
    special_tokens=[(bos, bos_id), (eos, eos_id)],
)
tk.save(tok_json)  # overwrite in place so a fresh load picks up the patch

# 3) (optional) keep bos/eos also in tokenizer_config.json
cfg_path = os.path.join(TOK_DIR, "tokenizer_config.json")
with open(cfg_path, "r", encoding="utf-8") as f:
    cfg = json.load(f)
cfg["bos_token"] = bos
cfg["eos_token"] = eos
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

# 4) verify post-processor is present after a fresh reload
tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
print("post-processor:", tok.backend_tokenizer.post_processor)  # should NOT be None

# 5) final check: specials appear when requested
enc = tok("the singers were singing a very nice song!", add_special_tokens=True, return_attention_mask=False)
print(tok.convert_ids_to_tokens(enc["input_ids"]))