MAIRK committed on
Commit e98c4c6 · verified · 1 Parent(s): b329e7f

Upload tokenizer.model


# generate_tokenizer_model.py
# Build a tokenizer.model (SentencePiece) that meets Hugging Face and LLaMA requirements.

import sentencepiece as spm
import os

# ---------- Configuration ----------
# Training data path: a plain-text file with one sentence per line.
input_file = "tokenizer_corpus.txt"
# Output model prefix; training produces tokenizer.model and tokenizer.vocab.
model_prefix = "tokenizer"
# Vocabulary size (LLaMA uses 32000 by default).
vocab_size = 32000

# Check that the training corpus exists.
if not os.path.exists(input_file):
    raise FileNotFoundError(f"Training corpus file not found: {input_file}")

# ---------- Train the SentencePiece model ----------
print("🚀 Starting SentencePiece tokenizer training...")
spm.SentencePieceTrainer.Train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=1.0,
    model_type='bpe',
    bos_id=1,   # <s>
    eos_id=2,   # </s>
    pad_id=0,   # <pad>
    unk_id=3,   # <unk>
    # Note: <s>, </s>, <unk> and <pad> are already reserved via the *_id options
    # above, so they must not be repeated in user_defined_symbols (SentencePiece
    # rejects duplicates of its predefined special pieces).
)

print("✅ 训练完成!生成文件:")
print("- tokenizer.model")
print("- tokenizer.vocab")

Files changed (1)
  1. tokenizer.model +3 -0
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30adda947c3d9d6cd9f9c66cfeb130811932a53ee987e06a2cbd683b1e23ab58
+size 1087
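
The committed tokenizer.model is a Git LFS pointer (the three lines above), not the binary itself. One way to fetch the actual 1,087-byte SentencePiece file programmatically is huggingface_hub; a sketch, where "user/repo" is a placeholder for this repository's id:

from huggingface_hub import hf_hub_download

# "user/repo" is a placeholder; replace it with the actual repository id.
local_path = hf_hub_download(repo_id="user/repo", filename="tokenizer.model")
print(local_path)  # local path to the downloaded SentencePiece model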