File size: 4,485 Bytes
53f0cc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
Component 2 training script.

This script trains the custom code tokenizer and saves it for reuse.
Supported input formats:
- .jsonl with fields: prompt, code, language
- .txt where each line is one raw sample
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Iterable, Iterator, List

# This makes "src" imports work when script is run from project root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig


def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]:
    """
    Stream formatted training texts from a JSONL file without loading it into RAM.

    Each line is expected to be a JSON object with "prompt", "code", and an
    optional "language" field. Malformed JSON, non-object rows, and rows
    missing prompt or code are skipped silently so a single bad row cannot
    abort a long training run.

    Args:
        file_path: Path to the .jsonl input file.
        tokenizer: Tokenizer used to format each (prompt, code) pair.

    Yields:
        One formatted training sample string per valid row.
    """
    with file_path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort streaming: skip malformed lines instead of aborting.
                continue
            # A line can be valid JSON yet not an object (e.g. "[1,2,3]" or "3").
            # Without this guard, row.get below raised AttributeError and
            # crashed the whole run on such lines.
            if not isinstance(row, dict):
                continue
            prompt = str(row.get("prompt", "")).strip()
            code = str(row.get("code", "")).strip()
            language = str(row.get("language", "python")).strip().lower()
            if not prompt or not code:
                continue
            if language not in {"python", "javascript"}:
                # Unsupported language tags fall back to python.
                language = "python"
            yield tokenizer.format_training_sample(prompt=prompt, code=code, language=language)


def stream_txt_samples(file_path: Path) -> Iterator[str]:
    """
    Lazily yield each non-empty line of a plain-text file, with surrounding
    whitespace stripped.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        yield from (stripped for raw in handle if (stripped := raw.strip()))


def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]:
    """
    Chain samples from every input file into one lazy text stream.

    Dispatches on the file extension: .jsonl rows are formatted through the
    tokenizer, .txt lines are yielded as-is, and any other extension is
    skipped with a warning.
    """
    handlers = {
        ".jsonl": lambda path: stream_jsonl_samples(path, tokenizer),
        ".txt": stream_txt_samples,
    }

    def _merged() -> Iterator[str]:
        for file_path in input_files:
            handler = handlers.get(file_path.suffix.lower())
            if handler is None:
                print(f"[warning] Skipping unsupported file type: {file_path}")
                continue
            yield from handler(file_path)

    return _merged()


def parse_args() -> argparse.Namespace:
    """
    Read command-line settings for tokenizer training.
    """
    parser = argparse.ArgumentParser(
        description="Train custom Python/JavaScript code tokenizer."
    )
    parser.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more input files (.jsonl or .txt).",
    )
    parser.add_argument(
        "--output_dir",
        default="artifacts/tokenizer/code_tokenizer_v1",
        help="Folder where tokenizer files will be saved.",
    )
    # The three numeric knobs share the same shape, so declare them as data.
    for flag, default, help_text in (
        ("--vocab_size", 50_000, "Tokenizer vocabulary size."),
        ("--min_frequency", 2, "Minimum token frequency."),
        ("--model_max_length", 2048, "Max token length hint."),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()


def main() -> None:
    """
    Entry point: validate inputs, train the tokenizer, and save artifacts.

    Any failure during validation, training, or saving is reported with a
    human-readable hint, then the process exits with status 1.
    """
    args = parse_args()

    try:
        input_files = [Path(raw) for raw in args.input]
        # Fail fast with a full list of every missing path, not just the first.
        missing = [str(path) for path in input_files if not path.exists()]
        if missing:
            raise FileNotFoundError(
                "Some input files do not exist:\n- " + "\n- ".join(missing)
            )

        tokenizer = CodeTokenizer(
            config=CodeTokenizerConfig(
                vocab_size=args.vocab_size,
                min_frequency=args.min_frequency,
                model_max_length=args.model_max_length,
            )
        )
        tokenizer.train(build_stream(input_files=input_files, tokenizer=tokenizer))
        tokenizer.save(args.output_dir)

        print("Tokenizer training completed successfully.")
        print(f"Saved tokenizer to: {args.output_dir}")
        print("Saved files: tokenizer.json, tokenizer_config.json")
    except Exception as exc:
        # Top-level boundary: report clearly, then exit non-zero for callers.
        print("Tokenizer training failed.")
        print(f"What went wrong: {exc}")
        print("Fix suggestion: check file paths and file format, then run again.")
        raise SystemExit(1)


if __name__ == "__main__":
    main()