"""Train a byte-level BPE tokenizer on the project dataset.

The trained tokenizer is written to tokenizer.json next to this script and
can be reloaded with `Tokenizer.from_file(str(TOKENIZER_PATH))`.
"""

from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import ByteLevel as ByteLevelProcessor
from tokenizers.trainers import BpeTrainer

from download_dataset import DATA_PATH, ensure_dataset_exists

TOKENIZER_PATH = Path(__file__).resolve().with_name("tokenizer.json")
VOCAB_SIZE = 2000
# The trainer registers special tokens first, so they take the lowest ids
# ([PAD]=0, [UNK]=1, [BOS]=2, [EOS]=3).
SPECIAL_TOKENS = [
    "[PAD]",
    "[UNK]",
    "[BOS]",
    "[EOS]",
]


def main() -> None:
    data_path = ensure_dataset_exists(DATA_PATH)

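    # Byte-level BPE: the ByteLevel pre-tokenizer maps raw bytes to printable
    # characters and the matching decoder reverses the mapping, so any UTF-8
    # input round-trips losslessly and [UNK] should rarely, if ever, appear.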
    tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=False)
    tokenizer.decoder = ByteLevelDecoder()
    tokenizer.post_processor = ByteLevelProcessor(trim_offsets=True)

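    # Seed the vocabulary with all 256 byte-level symbols so every input is
    # representable before any merges are learned; a pair must occur at least
    # min_frequency times for its merge to be kept.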
    trainer = BpeTrainer(
        vocab_size=VOCAB_SIZE,
        min_frequency=2,
        special_tokens=SPECIAL_TOKENS,
        initial_alphabet=ByteLevel.alphabet(),
        show_progress=True,
    )

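    # Train on the raw text file, then serialize the whole pipeline (model,
    # pre-tokenizer, decoder, post-processor) to a single JSON file.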
    tokenizer.train([str(data_path)], trainer)
    tokenizer.save(str(TOKENIZER_PATH))

    print(f"Saved tokenizer to {TOKENIZER_PATH}")
    print(f"Final vocab size: {tokenizer.get_vocab_size()}")


if __name__ == "__main__":
    main()