# mindi-backup / scripts / train_code_tokenizer.py
# Source: "Mindigenous" repo — "Initial full project backup with Git LFS" (commit 53f0cc2)
"""
Component 2 training script.
This script trains the custom code tokenizer and saves it for reuse.
Supported input formats:
- .jsonl with fields: prompt, code, language
- .txt where each line is one raw sample
"""
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Iterable, Iterator, List
# This makes "src" imports work when script is run from project root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
sys.path.insert(0, str(PROJECT_ROOT))
from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig
def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]:
    """
    Lazily yield formatted training texts from a JSONL file.

    Each non-empty line is parsed as a JSON object with "prompt", "code"
    and optional "language" fields. Rows that are blank, malformed, or
    missing a prompt/code are skipped; unknown languages fall back to
    "python". The file is streamed line by line, so memory use stays flat.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError:
                # One malformed row must not abort a long training run.
                continue
            prompt_text = str(record.get("prompt", "")).strip()
            code_text = str(record.get("code", "")).strip()
            lang = str(record.get("language", "python")).strip().lower()
            if not (prompt_text and code_text):
                continue
            if lang not in {"python", "javascript"}:
                lang = "python"
            yield tokenizer.format_training_sample(
                prompt=prompt_text, code=code_text, language=lang
            )
def stream_txt_samples(file_path: Path) -> Iterator[str]:
    """
    Yield each non-blank line of a plain-text file, stripped of
    surrounding whitespace, without loading the whole file into memory.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        yield from (text for line in handle if (text := line.strip()))
def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]:
    """
    Merge many input files into one lazy iterator of training texts.

    .jsonl files are routed through stream_jsonl_samples (which needs the
    tokenizer to format samples); .txt files through stream_txt_samples.
    Files with any other suffix are skipped with a warning.

    Fix: the skip warning now goes to stderr instead of stdout, so
    diagnostics cannot contaminate any pipeline that consumes stdout.
    """
    def _generator() -> Iterator[str]:
        for path in input_files:
            suffix = path.suffix.lower()
            if suffix == ".jsonl":
                yield from stream_jsonl_samples(path, tokenizer)
            elif suffix == ".txt":
                yield from stream_txt_samples(path)
            else:
                # Diagnostics belong on stderr, not the data stream.
                print(f"[warning] Skipping unsupported file type: {path}", file=sys.stderr)

    return _generator()
def parse_args() -> argparse.Namespace:
    """
    Collect the command-line settings that control tokenizer training.

    Returns an argparse.Namespace with: input (list of paths, required),
    output_dir, vocab_size, min_frequency and model_max_length.
    """
    cli = argparse.ArgumentParser(description="Train custom Python/JavaScript code tokenizer.")
    cli.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more input files (.jsonl or .txt).",
    )
    cli.add_argument(
        "--output_dir",
        default="artifacts/tokenizer/code_tokenizer_v1",
        help="Folder where tokenizer files will be saved.",
    )
    # The remaining options are all optional integers; declare them
    # table-style so flag, default and help text stay side by side.
    int_options = (
        ("--vocab_size", 50_000, "Tokenizer vocabulary size."),
        ("--min_frequency", 2, "Minimum token frequency."),
        ("--model_max_length", 2048, "Max token length hint."),
    )
    for flag, default, text in int_options:
        cli.add_argument(flag, type=int, default=default, help=text)
    return cli.parse_args()
def main() -> None:
    """
    CLI entry point: validate inputs, train the tokenizer, and save it.

    On any failure this prints a human-readable explanation to stderr and
    exits with status 1, preserving the original exception chain.
    """
    args = parse_args()
    try:
        input_files = [Path(p) for p in args.input]
        # Fail fast, listing every missing path rather than just the first.
        missing = [str(p) for p in input_files if not p.exists()]
        if missing:
            raise FileNotFoundError(
                "Some input files do not exist:\n- " + "\n- ".join(missing)
            )
        config = CodeTokenizerConfig(
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            model_max_length=args.model_max_length,
        )
        tokenizer = CodeTokenizer(config=config)
        # build_stream is lazy; the data is only read during train().
        text_stream = build_stream(input_files=input_files, tokenizer=tokenizer)
        tokenizer.train(text_stream)
        tokenizer.save(args.output_dir)
        print("Tokenizer training completed successfully.")
        print(f"Saved tokenizer to: {args.output_dir}")
        print("Saved files: tokenizer.json, tokenizer_config.json")
    except Exception as exc:  # top-level boundary: report, then exit non-zero
        # Fix: error reporting moved to stderr so stdout stays clean.
        print("Tokenizer training failed.", file=sys.stderr)
        print(f"What went wrong: {exc}", file=sys.stderr)
        print("Fix suggestion: check file paths and file format, then run again.", file=sys.stderr)
        # Fix: chain the cause so the traceback survives for debugging.
        raise SystemExit(1) from exc
# Standard script guard: run main() only when executed directly,
# never when this module is imported.
if __name__ == "__main__":
    main()