Mini-LLM / Tokenizer /train_spm_bpe.py
Ashx098's picture
Upload folder using huggingface_hub
a433a25 verified
raw
history blame contribute delete
539 Bytes
import sentencepiece as spm
def train_tokenizer(
    input_path: str = "/home/aviinashh/projects/Mini-LLM/data/raw/merged_text/corpus.txt",
    model_prefix: str = "/home/aviinashh/projects/Mini-LLM/Tokenizer/BPE/spm",
    vocab_size: int = 32000,
) -> None:
    """Train a SentencePiece BPE tokenizer on a plain-text corpus.

    Writes ``{model_prefix}.model`` and ``{model_prefix}.vocab`` to disk.

    Args:
        input_path: Path to the raw training corpus (one document/line of text
            per line, as SentencePiece expects).
        model_prefix: Output path prefix for the ``.model`` / ``.vocab`` files.
        vocab_size: Target vocabulary size (default 32k, a common LLM choice).
    """
    spm.SentencePieceTrainer.Train(
        input=input_path,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        model_type="bpe",
        # Fall back to raw bytes for characters outside the vocab, so no input
        # ever maps to <unk> in practice.
        byte_fallback=True,
        character_coverage=1.0,
        # Fixed special-token ids so downstream model code can hard-code them.
        unk_id=0,
        bos_id=1,
        eos_id=2,
        pad_id=3,
        # Chat-role markers kept as atomic tokens rather than being split by BPE.
        user_defined_symbols=["<user>", "<assistant>", "<system>"],
    )
    print("Tokenizer trained!")


if __name__ == "__main__":
    # Preserves the original script behavior: running this file directly
    # trains with the default corpus and output paths.
    train_tokenizer()
# Model and vocab will be saved as spm.model and spm.vocab in the specified path