# Train a SentencePiece BPE tokenizer on the merged text corpus.
#
# Running this script reads the corpus at CORPUS_PATH and writes the trained
# tokenizer next to MODEL_PREFIX.

import sentencepiece as spm

# Absolute paths on the original author's machine; adjust before running
# anywhere else.
CORPUS_PATH = "/home/aviinashh/projects/Mini-LLM/data/raw/merged_text/corpus.txt"
MODEL_PREFIX = "/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm"

spm.SentencePieceTrainer.Train(
    input=CORPUS_PATH,
    model_prefix=MODEL_PREFIX,
    vocab_size=32000,
    model_type="bpe",
    byte_fallback=True,
    character_coverage=1.0,
    # Fixed ids for the control tokens: unk, bos, eos, pad occupy 0..3.
    unk_id=0,
    bos_id=1,
    eos_id=2,
    pad_id=3,
    # NOTE(review): three identical empty strings. These look like special
    # tokens whose text was lost (e.g. angle-bracket tags stripped somewhere
    # along the way). Empty/duplicate symbols are presumably not what was
    # intended and may be rejected by the trainer -- restore the real token
    # strings before running.
    user_defined_symbols=["", "", ""],
)

print("Tokenizer trained!")
# Model and vocab will be saved as spm.model and spm.vocab in the specified path