# PyTorch / gpt2
# gpt2-10M-parfind-eng / segmentation_tests.py
# Uploaded by achille-fusco via huggingface_hub (commit c2760fe, verified)
#from tokenizer import ParadigmTokenizerWrapper
#tok = ParadigmTokenizerWrapper("/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M")
#enc = tok("the singers were singing a very nice song!")
#print(tok.tok.convert_ids_to_tokens(enc["input_ids"]))
"""Smoke tests for the ParFindFast GPT-2 tokenizer.

Loads the custom tokenizer via AutoTokenizer (trust_remote_code), prints its
type, encodings, and special-token configuration, checks that the fast
tokenizer's post-processor is present (both through the HF wrapper and by
reading tokenizer.json directly), and verifies padding / attention-mask
behavior on a small batch.
"""
import os

from tokenizers import Tokenizer
from transformers import AutoTokenizer

# Single source of truth for the model directory (was duplicated inline at
# the AutoTokenizer and Tokenizer.from_file call sites).
MODEL_DIR = "/home/achille.fusco/pr_baby_lm/babyLM_2025/03-models/gpt2_ParFindFast_10M"

tok = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True, local_files_only=True)
print(type(tok))
print(tok("the singers were singing a very nice song!"))
print(tok.tokenize("the singers were singing a very nice song!"))
print(tok.special_tokens_map)
print(tok.bos_token, tok.eos_token, tok.bos_token_id, tok.eos_token_id)

# Nonsense words should still encode (presumably via sub-word/byte fallback —
# the printed tokens below let us eyeball how OOV material is segmented).
enc = tok(
    "the skibidiboppers were sdjnajning a very nice song!",
    add_special_tokens=True,
    return_attention_mask=True,
)
print(tok.convert_ids_to_tokens(enc["input_ids"]))

# Post-processor check via the HF wrapper object.
print(tok.backend_tokenizer.post_processor)  # should NOT be None

# Double-check by reading tokenizer.json directly, bypassing the wrapper.
tk = Tokenizer.from_file(os.path.join(MODEL_DIR, "tokenizer.json"))
print(tk.post_processor)  # should NOT be None

# Padding check: the shorter sequence must be padded out and masked.
enc = tok(["a b", "a b c d"], padding="max_length", max_length=6, return_tensors="pt")
print(enc["input_ids"])  # shorter row should end with pad ids
print(enc["attention_mask"])  # 1 for real tokens, 0 for pads