#!/usr/bin/env python3
"""Train the Emese Hungarian SentencePiece tokenizer (32K unigram)."""

import argparse
import json
import os
import sys
import tempfile
import time

import sentencepiece as spm


def build_corpus(path: str, limit: int) -> str:
    """Extract up to `limit` texts from a JSONL file into a temp text file.

    Each non-blank input line is parsed as JSON; records with a truthy
    ``"text"`` field are written to the temp file followed by a newline.

    Args:
        path: Path to the input JSONL file (read as UTF-8).
        limit: Maximum number of texts to write.

    Returns:
        The path of the temp file. It is created with ``delete=False``, so
        the caller is responsible for removing it.

    Raises:
        SystemExit: If `path` does not exist.
    """
    if not os.path.exists(path):
        sys.exit(f"Error: input file not found: {path}")

    tmp = tempfile.NamedTemporaryFile(
        mode="w", suffix=".txt", delete=False, encoding="utf-8"
    )
    count = 0
    try:
        # Explicit UTF-8: the locale default would corrupt (or fail on)
        # accented characters on platforms whose default is not UTF-8.
        with tmp, open(path, encoding="utf-8") as f:
            for line in f:
                # Checked before writing so that limit <= 0 yields no samples.
                if count >= limit:
                    break
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not JSON.
                    continue
                if text := json.loads(line).get("text"):
                    tmp.write(text + "\n")
                    count += 1
    except BaseException:
        # The temp file is not auto-deleted; don't leave it behind on failure.
        os.unlink(tmp.name)
        raise

    print(f"Sampled {count:,} paragraphs from {path}")
    return tmp.name


def train_tokenizer(corpus: str, prefix: str, vocab_size: int) -> None:
    """Train a unigram SentencePiece model on `corpus`.

    Args:
        corpus: Path to a plain-text training file.
        prefix: Output path prefix, passed to the trainer as ``model_prefix``;
            its parent directory is created if missing.
        vocab_size: Target vocabulary size.
    """
    os.makedirs(os.path.dirname(prefix) or ".", exist_ok=True)
    print(f"Training {vocab_size:,}-vocab tokenizer → {prefix}.model")
    spm.SentencePieceTrainer.train(
        input=corpus,
        model_prefix=prefix,
        model_type="unigram",
        vocab_size=vocab_size,
        character_coverage=0.9999,
        pad_id=-1,
        unk_id=0,
        bos_id=1,
        eos_id=2,
        # NOTE(review): an empty-string symbol looks like a placeholder whose
        # content was lost (e.g. an angle-bracket tag) — confirm intended value.
        user_defined_symbols=[""],
        shrinking_factor=0.85,
        num_sub_iterations=4,
        max_sentencepiece_length=24,
        seed_sentencepiece_size=2_000_000,
        split_digits=False,
        byte_fallback=False,
        # os.cpu_count() may return None when the count is undeterminable.
        num_threads=os.cpu_count() or 1,
        input_sentence_size=500_000,
        shuffle_input_sentence=True,
        normalization_rule_name="nfkc",
    )


def evaluate(model_path: str, corpus: str, limit: int = 50_000) -> None:
    """Print chars/token and tokens/word over the first `limit` corpus lines.

    Args:
        model_path: Model path prefix; ``.model`` is appended to load it.
        corpus: Path to a UTF-8 text file with one sample per line.
        limit: Maximum number of lines (blank ones included) to read.
    """
    sp = spm.SentencePieceProcessor(model_file=f"{model_path}.model")
    chars = tokens = words = 0
    with open(corpus, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            if text := line.strip():
                ids = sp.encode(text, out_type=int)
                chars += len(text)
                tokens += len(ids)
                words += len(text.split())

    if not tokens or not words:
        # Nothing was encoded; avoid dividing by zero below.
        print(f" No non-empty lines to evaluate in {corpus}")
        return
    print(f" Chars/token: {chars/tokens:.2f} | Tokens/word: {tokens/words:.2f}")


def main() -> None:
    """Parse CLI arguments, then sample, train, and evaluate the tokenizer."""
    p = argparse.ArgumentParser(description="Train Emese SentencePiece tokenizer")
    p.add_argument("--input", default="data/clean/wiki_clean.jsonl", help="Input JSONL")
    p.add_argument("--model-dir", default="tokenizer", help="Output directory")
    p.add_argument("--model-name", default="emese-tokenizer", help="Model prefix name")
    p.add_argument("--vocab-size", type=int, default=32_000, help="Vocabulary size")
    p.add_argument("--sample-limit", type=int, default=500_000, help="Max paragraphs to sample")
    args = p.parse_args()

    t0 = time.time()
    prefix = os.path.join(args.model_dir, args.model_name)
    corpus = build_corpus(args.input, args.sample_limit)
    try:
        train_tokenizer(corpus, prefix, args.vocab_size)
        evaluate(prefix, corpus)
    finally:
        # The sampled corpus is a temp file; remove it even if training fails.
        os.unlink(corpus)
    print(f"Done in {time.time()-t0:.1f}s → {prefix}.model")


if __name__ == "__main__":
    main()