# Script: trains the project's SentencePiece BPE tokenizer.
import sentencepiece as spm
# Default locations; override via train_tokenizer() arguments if needed.
CORPUS_PATH = "/home/aviinashh/projects/Mini-LLM/data/raw/merged_text/corpus.txt"
MODEL_PREFIX = "/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm"


def train_tokenizer(
    input_path: str = CORPUS_PATH,
    model_prefix: str = MODEL_PREFIX,
    vocab_size: int = 32000,
) -> None:
    """Train a byte-fallback BPE SentencePiece tokenizer.

    Writes ``<model_prefix>.model`` and ``<model_prefix>.vocab``.

    The special-token ID layout is fixed and must match downstream
    model/training code: unk=0, bos=1, eos=2, pad=3. Chat-role markers
    (<user>, <assistant>, <system>) are reserved as user-defined symbols
    so they tokenize to single IDs.

    Args:
        input_path: Plain-text training corpus, one document/line per row.
        model_prefix: Output path prefix for the .model/.vocab files.
        vocab_size: Total vocabulary size, including special tokens.
    """
    spm.SentencePieceTrainer.Train(
        input=input_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type="bpe",
        # byte_fallback guarantees no true OOV: unseen characters decompose
        # into byte pieces instead of mapping to <unk>.
        byte_fallback=True,
        character_coverage=1.0,
        unk_id=0,
        bos_id=1,
        eos_id=2,
        pad_id=3,
        user_defined_symbols=["<user>", "<assistant>", "<system>"],
    )
    print("Tokenizer trained!")


if __name__ == "__main__":
    # Guarded entry point: importing this module no longer triggers a
    # (long-running, filesystem-writing) training run as a side effect.
    train_tokenizer()