LLM-GO / src /llm_go /scripts /tokenize.py
LesterCerioli's picture
Building first version from Golang LLM
a58ece3
Raw
History Blame Contribute Delete
726 Bytes
"""CLI: train the Go BPE tokenizer."""
import click
from pathlib import Path
from llm_go.tokenizer.go_tokenizer import GoTokenizer
@click.command()
@click.option("--raw-dir", default="data/raw", show_default=True)
@click.option("--out-dir", default="data/tokenizer", show_default=True)
@click.option("--vocab-size", default=32_000, show_default=True)
def main(raw_dir, out_dir, vocab_size):
    """Train a BPE tokenizer on collected Go source files."""
    # NOTE: the docstring above doubles as the command's --help text (click
    # reads it), so longer explanations live in comments instead.
    #
    # raw_dir    -- directory searched recursively for ``*.go`` files.
    # out_dir    -- forwarded to ``GoTokenizer.train`` as ``save_dir``.
    # vocab_size -- forwarded to ``GoTokenizer.train`` as ``vocab_size``.
    #
    # Raises click.ClickException when no Go source file is found, so the
    # command exits with an error instead of training on an empty corpus.

    # Keep regular files only: directories named like ``something.go`` are
    # common in Go trees (vendored module paths) and also match the pattern.
    # Sorting makes the file order independent of filesystem traversal order.
    files = sorted(
        str(path) for path in Path(raw_dir).rglob("*.go") if path.is_file()
    )
    if not files:
        raise click.ClickException(f"No .go files found under {raw_dir}")

    click.echo(f"Training tokenizer on {len(files):,} files…")
    GoTokenizer.train(files=files, vocab_size=vocab_size, save_dir=out_dir)
    click.echo(f"Tokenizer saved → {out_dir}")