PyTorch
gpt2
gpt2-10M-parfind-eng / bos_eos_patch.py
achille-fusco's picture
Upload folder using huggingface_hub
c2760fe verified
raw
history blame
1.72 kB
"""Patch a saved GPT-2 tokenizer so encoding emits BOS/EOS special tokens.

Steps:
  0) load the tokenizer to confirm which path HF resolves,
  1) read the configured bos/eos tokens and their ids,
  2) attach a TemplateProcessing post-processor to tokenizer.json,
  3) mirror bos/eos into tokenizer_config.json,
  4) reload from disk and verify the post-processor survived,
  5) encode a sample sentence to confirm specials are added.
"""
import json
import os

from transformers import AutoTokenizer
from tokenizers import Tokenizer
from tokenizers.processors import TemplateProcessing

# Local directory of the saved tokenizer to patch (edited in place).
TOK_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

# 0) sanity: what path will HF load?
tmp = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
print("HF loads from:", tmp.name_or_path)

# 1) get bos/eos tokens & ids
bos, eos = tmp.bos_token, tmp.eos_token
# Explicit check instead of `assert`: asserts are stripped under `python -O`,
# and a missing BOS/EOS must abort before we rewrite files on disk.
if not (bos and eos):
    raise ValueError("BOS/EOS not defined in special_tokens_map.json / tokenizer_config.json")
bos_id, eos_id = tmp.convert_tokens_to_ids([bos, eos])

# 2) patch tokenizer.json with a TemplateProcessing post-processor
tok_json = os.path.join(TOK_DIR, "tokenizer.json")
tk = Tokenizer.from_file(tok_json)
tk.post_processor = TemplateProcessing(
    single=f"{bos} $A {eos}",
    # Pair template: second segment gets type id 1 (`:1`) and its own EOS.
    pair=f"{bos} $A {eos} $B:1 {eos}:1",
    special_tokens=[(bos, bos_id), (eos, eos_id)],
)
tk.save(tok_json)  # overwrite in place so a fresh load picks up the patch

# 3) (optional) keep bos/eos also in tokenizer_config.json
cfg_path = os.path.join(TOK_DIR, "tokenizer_config.json")
with open(cfg_path, "r", encoding="utf-8") as f:
    cfg = json.load(f)
cfg["bos_token"] = bos
cfg["eos_token"] = eos
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

# 4) verify post-processor is present after a fresh reload
tok = AutoTokenizer.from_pretrained(TOK_DIR, trust_remote_code=True, local_files_only=True)
print("post-processor:", tok.backend_tokenizer.post_processor)  # should NOT be None

# 5) final check: specials appear when requested
enc = tok("the singers were singing a very nice song!", add_special_tokens=True, return_attention_mask=False)
print(tok.convert_ids_to_tokens(enc["input_ids"]))