# Script to tokenize cleaned text using SentencePiece

from pathlib import Path

import sentencepiece as spm

# Cleaned, newline-delimited training corpus produced by the cleaning step.
INPUT_FILE = "data/processed/mk_wiki_clean.txt"
# Output path prefix: SentencePiece writes <prefix>.model and <prefix>.vocab.
MODEL_PREFIX = "data/tokenized/mk_tokenizer"

def train_tokenizer(
    input_file: str = INPUT_FILE,
    model_prefix: str = MODEL_PREFIX,
    vocab_size: int = 32000,
) -> None:
    """Train a SentencePiece tokenizer on a cleaned text corpus.

    Args:
        input_file: Path to the newline-delimited training text.
        model_prefix: Output path prefix; SentencePiece writes
            ``<prefix>.model`` and ``<prefix>.vocab``.
        vocab_size: Number of tokens in the learned vocabulary.
    """
    # SentencePiece does not create missing output directories; ensure the
    # parent of the model prefix exists so training does not fail on a
    # fresh checkout where data/tokenized/ is absent.
    Path(model_prefix).parent.mkdir(parents=True, exist_ok=True)
    spm.SentencePieceTrainer.train(
        input=input_file,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
    )
    print(f"✅ Tokenizer model saved at {model_prefix}.model")

# Run training only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    train_tokenizer()