emese-tokenizer-32k / train_tokenizer.py
gyopak's picture
Upload folder using huggingface_hub
223d3da verified
#!/usr/bin/env python3
"""Train the Emese Hungarian SentencePiece tokenizer (32K unigram)."""
import argparse, json, os, sys, tempfile, time
import sentencepiece as spm
def build_corpus(path: str, limit: int) -> str:
    """Extract up to `limit` texts from a JSONL file into a temporary text file.

    Every non-blank line of `path` is parsed as JSON. When the record is an
    object whose "text" value is a non-empty string, that string is written
    to the output followed by a newline. Blank lines, records without usable
    text, and non-object records are skipped.

    Args:
        path: Input JSONL file, read as UTF-8.
        limit: Maximum number of texts to write; values <= 0 write nothing.

    Returns:
        Name of the temporary ``.txt`` file (UTF-8). It is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        SystemExit: If `path` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        sys.exit(f"Error: input file not found: {path}")
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8")
    count = 0
    try:
        # `with tmp` guarantees the handle is closed on every exit path; the
        # input is decoded explicitly as UTF-8 instead of the locale default.
        with tmp, open(path, encoding="utf-8") as f:
            for line in f:
                # Check the budget before writing so limit <= 0 emits nothing.
                if count >= limit:
                    break
                if not line.strip():
                    continue  # tolerate blank/trailing lines in the JSONL
                record = json.loads(line)
                text = record.get("text") if isinstance(record, dict) else None
                if isinstance(text, str) and text:
                    tmp.write(text + "\n")
                    count += 1
    except BaseException:
        # Do not leave a partially written temp file behind on failure.
        os.unlink(tmp.name)
        raise
    print(f"Sampled {count:,} paragraphs from {path}")
    return tmp.name
def train_tokenizer(corpus: str, prefix: str, vocab_size: int) -> None:
    """Train a unigram SentencePiece model on `corpus`.

    Creates the parent directory of `prefix` if needed, announces the run on
    stdout, and hands the fixed training configuration below to
    ``spm.SentencePieceTrainer.train``.

    Args:
        corpus: Path to a plain-text file with one training sentence per line.
        prefix: Output path prefix passed as ``model_prefix``; the progress
            message refers to the result as ``{prefix}.model``.
        vocab_size: Target vocabulary size.
    """
    # A bare file name has an empty dirname; fall back to the current directory.
    os.makedirs(os.path.dirname(prefix) or ".", exist_ok=True)
    print(f"Training {vocab_size:,}-vocab tokenizer → {prefix}.model")
    # os.cpu_count() may return None when the count cannot be determined;
    # never forward None as the thread count.
    threads = os.cpu_count() or 1
    spm.SentencePieceTrainer.train(
        input=corpus, model_prefix=prefix, model_type="unigram",
        vocab_size=vocab_size, character_coverage=0.9999,
        # Special-token ids: unk=0, bos=1, eos=2; pad_id=-1 leaves padding disabled.
        pad_id=-1, unk_id=0, bos_id=1, eos_id=2,
        # "<eos>" is registered as an additional user-defined symbol.
        user_defined_symbols=["<eos>"],
        shrinking_factor=0.85, num_sub_iterations=4,
        max_sentencepiece_length=24, seed_sentencepiece_size=2_000_000,
        split_digits=False, byte_fallback=False, num_threads=threads,
        input_sentence_size=500_000, shuffle_input_sentence=True,
        normalization_rule_name="nfkc",
    )
def evaluate(model_path: str, corpus: str, limit: int = 50_000) -> None:
    """Print compression statistics of a trained model over a text corpus.

    Loads ``{model_path}.model``, encodes the non-blank lines found among the
    first `limit` lines of `corpus`, and prints the average characters per
    token and tokens per whitespace-separated word.

    Args:
        model_path: Model path prefix, without the ``.model`` suffix.
        corpus: UTF-8 text file with one sentence per line.
        limit: Maximum number of lines (blank ones included) to read.
    """
    sp = spm.SentencePieceProcessor(model_file=f"{model_path}.model")
    chars = tokens = words = 0
    # Decode explicitly as UTF-8 rather than relying on the locale default.
    with open(corpus, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            if text := line.strip():
                ids = sp.encode(text, out_type=int)
                chars += len(text)
                tokens += len(ids)
                words += len(text.split())
    if not tokens or not words:
        # Nothing was encoded (empty corpus or limit <= 0): avoid dividing by zero.
        print(" Chars/token: n/a | Tokens/word: n/a (no text to evaluate)")
        return
    print(f" Chars/token: {chars/tokens:.2f} | Tokens/word: {tokens/words:.2f}")
def main():
    """Command-line entry point: sample a corpus, train, evaluate, clean up."""
    parser = argparse.ArgumentParser(description="Train Emese SentencePiece tokenizer")
    # Flag specifications, registered in the order they appear in --help.
    option_specs = (
        ("--input", {"default": "data/clean/wiki_clean.jsonl", "help": "Input JSONL"}),
        ("--model-dir", {"default": "tokenizer", "help": "Output directory"}),
        ("--model-name", {"default": "emese-tokenizer", "help": "Model prefix name"}),
        ("--vocab-size", {"type": int, "default": 32_000, "help": "Vocabulary size"}),
        ("--sample-limit", {"type": int, "default": 500_000, "help": "Max paragraphs to sample"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    started = time.time()
    prefix = os.path.join(args.model_dir, args.model_name)
    corpus = build_corpus(args.input, args.sample_limit)
    try:
        train_tokenizer(corpus, prefix, args.vocab_size)
        evaluate(prefix, corpus)
    finally:
        # The sampled corpus is a throwaway temp file; remove it even on failure.
        os.unlink(corpus)
    elapsed = time.time() - started
    print(f"Done in {elapsed:.1f}s → {prefix}.model")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()