""" Component 2 training script. This script trains the custom code tokenizer and saves it for reuse. Supported input formats: - .jsonl with fields: prompt, code, language - .txt where each line is one raw sample """ from __future__ import annotations import argparse import json import sys from pathlib import Path from typing import Iterable, Iterator, List # This makes "src" imports work when script is run from project root. PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]: """ Streams JSONL rows as training text without loading full file into RAM. """ with file_path.open("r", encoding="utf-8") as f: for line in f: line = line.strip() if not line: continue try: row = json.loads(line) except json.JSONDecodeError: continue prompt = str(row.get("prompt", "")).strip() code = str(row.get("code", "")).strip() language = str(row.get("language", "python")).strip().lower() if not prompt or not code: continue if language not in {"python", "javascript"}: language = "python" yield tokenizer.format_training_sample(prompt=prompt, code=code, language=language) def stream_txt_samples(file_path: Path) -> Iterator[str]: """ Streams plain text file line by line. """ with file_path.open("r", encoding="utf-8") as f: for line in f: text = line.strip() if text: yield text def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]: """ Creates one merged iterator from many files. """ def _generator() -> Iterator[str]: for path in input_files: suffix = path.suffix.lower() if suffix == ".jsonl": yield from stream_jsonl_samples(path, tokenizer) elif suffix == ".txt": yield from stream_txt_samples(path) else: print(f"[warning] Skipping unsupported file type: {path}") return _generator() def parse_args() -> argparse.Namespace: """ Reads command-line settings for tokenizer training. """ parser = argparse.ArgumentParser(description="Train custom Python/JavaScript code tokenizer.") parser.add_argument( "--input", nargs="+", required=True, help="One or more input files (.jsonl or .txt).", ) parser.add_argument( "--output_dir", default="artifacts/tokenizer/code_tokenizer_v1", help="Folder where tokenizer files will be saved.", ) parser.add_argument("--vocab_size", type=int, default=50_000, help="Tokenizer vocabulary size.") parser.add_argument("--min_frequency", type=int, default=2, help="Minimum token frequency.") parser.add_argument("--model_max_length", type=int, default=2048, help="Max token length hint.") return parser.parse_args() def main() -> None: """ Main training entry point with clear error messages. """ args = parse_args() try: input_files = [Path(p) for p in args.input] missing = [str(p) for p in input_files if not p.exists()] if missing: raise FileNotFoundError( "Some input files do not exist:\n- " + "\n- ".join(missing) ) config = CodeTokenizerConfig( vocab_size=args.vocab_size, min_frequency=args.min_frequency, model_max_length=args.model_max_length, ) tokenizer = CodeTokenizer(config=config) text_stream = build_stream(input_files=input_files, tokenizer=tokenizer) tokenizer.train(text_stream) tokenizer.save(args.output_dir) print("Tokenizer training completed successfully.") print(f"Saved tokenizer to: {args.output_dir}") print("Saved files: tokenizer.json, tokenizer_config.json") except Exception as exc: print("Tokenizer training failed.") print(f"What went wrong: {exc}") print("Fix suggestion: check file paths and file format, then run again.") raise SystemExit(1) if __name__ == "__main__": main()