# Script to train a SentencePiece tokenizer on the cleaned text corpus
from pathlib import Path

import sentencepiece as spm
# Cleaned text corpus that the tokenizer is trained on.
INPUT_FILE = "data/processed/mk_wiki_clean.txt"
# Output path prefix; the trained model is written to "<prefix>.model".
MODEL_PREFIX = "data/tokenized/mk_tokenizer"
def train_tokenizer(input_file=None, model_prefix=None, vocab_size=32000):
    """Train a SentencePiece tokenizer and save it under ``model_prefix``.

    Calling this with no arguments uses the module-level ``INPUT_FILE`` and
    ``MODEL_PREFIX`` and a vocabulary size of 32000.

    Args:
        input_file: Path to a single training text file. Defaults to the
            module-level ``INPUT_FILE``.
        model_prefix: Output path prefix handed to the trainer. Defaults to
            the module-level ``MODEL_PREFIX``.
        vocab_size: Vocabulary size handed to the trainer.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
    """
    # Resolve defaults at call time so the module constants are only needed
    # when the caller did not supply explicit paths.
    if input_file is None:
        input_file = INPUT_FILE
    if model_prefix is None:
        model_prefix = MODEL_PREFIX

    # Fail early with a clear message instead of an error from inside the
    # trainer, and before anything is created on disk.
    if not Path(input_file).is_file():
        raise FileNotFoundError(f"Training corpus not found: {input_file}")

    # Make sure the output directory exists before the trainer writes to it.
    Path(model_prefix).parent.mkdir(parents=True, exist_ok=True)

    spm.SentencePieceTrainer.train(
        input=str(input_file),
        model_prefix=str(model_prefix),
        vocab_size=vocab_size,
    )
    print(f"✅ Tokenizer model saved at {model_prefix}.model")
# Train the tokenizer only when this file is run as a script, not on import.
if __name__ == "__main__":
    train_tokenizer()