| |
| """Train the Emese Hungarian SentencePiece tokenizer (32K unigram).""" |
| import argparse, json, os, sys, tempfile, time |
| import sentencepiece as spm |
|
|
|
|
def build_corpus(path: str, limit: int) -> str:
    """Extract up to `limit` texts from a JSONL file into a temporary corpus file.

    Each non-blank line of `path` is parsed as a JSON object, and every truthy
    ``"text"`` value is written to the corpus followed by a newline.

    Args:
        path: Path to the input JSONL file (read as UTF-8).
        limit: Maximum number of texts to copy; ``0`` or less copies nothing.

    Returns:
        Path of the UTF-8 temporary ``.txt`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        SystemExit: If `path` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not os.path.exists(path):
        sys.exit(f"Error: input file not found: {path}")
    tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False, encoding="utf-8")
    count = 0
    try:
        # Decode the input as UTF-8 explicitly so the result does not depend on
        # the platform's locale encoding (the output is always UTF-8).
        with tmp, open(path, encoding="utf-8") as f:
            for line in f:
                # Checked before writing so that limit <= 0 yields an empty corpus.
                if count >= limit:
                    break
                # Blank lines (e.g. a trailing newline) are not JSON; skip them.
                if not line.strip():
                    continue
                if text := json.loads(line).get("text"):
                    tmp.write(text + "\n")
                    count += 1
    except BaseException:
        # The temp file is not auto-deleted; remove it so a failed run does not
        # leave a partial corpus behind, then let the error propagate.
        os.unlink(tmp.name)
        raise
    print(f"Sampled {count:,} paragraphs from {path}")
    return tmp.name
|
|
|
|
def train_tokenizer(corpus: str, prefix: str, vocab_size: int) -> None:
    """Train a unigram SentencePiece model on `corpus`.

    Creates the directory part of `prefix` if needed, then runs the
    SentencePiece trainer with the fixed hyper-parameters below.

    Args:
        corpus: Path to a plain-text file used as training input.
        prefix: Output path prefix; the trained model is expected at
            ``<prefix>.model``.
        vocab_size: Target vocabulary size.
    """
    # dirname is "" for a bare file name, so fall back to the current directory.
    os.makedirs(os.path.dirname(prefix) or ".", exist_ok=True)
    print(f"Training {vocab_size:,}-vocab tokenizer → {prefix}.model")
    spm.SentencePieceTrainer.train(
        input=corpus, model_prefix=prefix, model_type="unigram",
        vocab_size=vocab_size, character_coverage=0.9999,
        # Reserved ids: no padding piece (-1), then unk/bos/eos at 0/1/2.
        pad_id=-1, unk_id=0, bos_id=1, eos_id=2,
        user_defined_symbols=["<eos>"],
        shrinking_factor=0.85, num_sub_iterations=4,
        max_sentencepiece_length=24, seed_sentencepiece_size=2_000_000,
        split_digits=False, byte_fallback=False,
        # os.cpu_count() may return None when the count cannot be determined;
        # fall back to a single thread rather than passing None to the trainer.
        num_threads=os.cpu_count() or 1,
        input_sentence_size=500_000, shuffle_input_sentence=True,
        normalization_rule_name="nfkc",
    )
|
|
|
|
def evaluate(model_path: str, corpus: str, limit: int = 50_000) -> None:
    """Print characters-per-token and tokens-per-word for a trained model.

    Loads ``<model_path>.model``, encodes the non-blank lines among the first
    `limit` lines of `corpus`, and prints the aggregate ratios.

    Args:
        model_path: Model path prefix (without the ``.model`` suffix).
        corpus: Path to a UTF-8 plain-text file, one text per line.
        limit: Maximum number of lines (blank ones included) to read.
    """
    sp = spm.SentencePieceProcessor(model_file=f"{model_path}.model")
    chars = tokens = words = 0
    # Read as UTF-8 explicitly instead of relying on the locale default encoding.
    with open(corpus, encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            text = line.strip()
            if not text:
                continue
            ids = sp.encode(text, out_type=int)
            chars += len(text)
            tokens += len(ids)
            words += len(text.split())
    # An empty corpus (or limit <= 0) leaves the counters at zero; report that
    # instead of raising ZeroDivisionError.
    if not tokens or not words:
        print(" No text to evaluate")
        return
    print(f" Chars/token: {chars/tokens:.2f} | Tokens/word: {tokens/words:.2f}")
|
|
|
|
def main():
    """Command-line entry point: sample a corpus, train the tokenizer, evaluate it.

    The temporary corpus file is removed whether or not training succeeds.
    """
    parser = argparse.ArgumentParser(description="Train Emese SentencePiece tokenizer")
    # Declared as data so that flags, defaults and help text read as one table;
    # the order is kept because it determines the --help listing.
    options = (
        ("--input", {"default": "data/clean/wiki_clean.jsonl", "help": "Input JSONL"}),
        ("--model-dir", {"default": "tokenizer", "help": "Output directory"}),
        ("--model-name", {"default": "emese-tokenizer", "help": "Model prefix name"}),
        ("--vocab-size", {"type": int, "default": 32_000, "help": "Vocabulary size"}),
        ("--sample-limit", {"type": int, "default": 500_000, "help": "Max paragraphs to sample"}),
    )
    for flag, spec in options:
        parser.add_argument(flag, **spec)
    args = parser.parse_args()

    started = time.time()
    prefix = os.path.join(args.model_dir, args.model_name)
    corpus = build_corpus(args.input, args.sample_limit)
    try:
        train_tokenizer(corpus, prefix, args.vocab_size)
        evaluate(prefix, corpus)
    finally:
        # The sampled corpus is only a scratch file; always clean it up.
        os.unlink(corpus)
    elapsed = time.time() - started
    print(f"Done in {elapsed:.1f}s → {prefix}.model")
|
|
|
|
# Run the training pipeline only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == "__main__":
    main()
|
|
|
|