| """ |
| Component 2 training script. |
| |
| This script trains the custom code tokenizer and saves it for reuse. |
| Supported input formats: |
| - .jsonl with fields: prompt, code, language |
| - .txt where each line is one raw sample |
| """ |
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from pathlib import Path |
| from typing import Iterable, Iterator, List |
|
|
| |
# Make the repository root importable so `src.*` imports resolve when this
# script is executed directly (e.g. `python scripts/train_tokenizer.py`).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))


# NOTE: deliberately imported *after* the sys.path tweak above so the local
# package is found even without an installed/editable package.
from src.tokenizer.code_tokenizer import CodeTokenizer, CodeTokenizerConfig
|
|
|
|
def stream_jsonl_samples(file_path: Path, tokenizer: CodeTokenizer) -> Iterator[str]:
    """
    Lazily yield formatted training texts from a JSONL file.

    Each usable row must carry non-empty ``prompt`` and ``code`` fields;
    ``language`` defaults to ``"python"`` and any unsupported value is
    coerced to ``"python"``. Blank lines and malformed JSON are skipped
    silently so one bad row never aborts training. Rows stream one at a
    time, so the full file is never held in memory.
    """
    supported_languages = {"python", "javascript"}
    with file_path.open("r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped:
                continue
            try:
                record = json.loads(stripped)
            except json.JSONDecodeError:
                # Best-effort streaming: drop unparseable rows.
                continue
            prompt_text = str(record.get("prompt", "")).strip()
            code_text = str(record.get("code", "")).strip()
            lang = str(record.get("language", "python")).strip().lower()
            if prompt_text and code_text:
                if lang not in supported_languages:
                    lang = "python"
                yield tokenizer.format_training_sample(
                    prompt=prompt_text, code=code_text, language=lang
                )
|
|
|
|
def stream_txt_samples(file_path: Path) -> Iterator[str]:
    """
    Lazily yield each non-empty line of a plain-text file, stripped of
    surrounding whitespace. Blank lines are dropped.
    """
    with file_path.open("r", encoding="utf-8") as handle:
        yield from (text for raw in handle if (text := raw.strip()))
|
|
|
|
def build_stream(input_files: List[Path], tokenizer: CodeTokenizer) -> Iterable[str]:
    """
    Create one merged lazy iterator of training texts from many files.

    Dispatches on the (case-insensitive) file suffix:
    - ``.jsonl`` rows are formatted via ``tokenizer.format_training_sample``
    - ``.txt`` lines are yielded verbatim
    Any other suffix is skipped with a warning.

    Fix: the warning previously went to stdout; diagnostics now go to
    stderr so stdout stays clean for pipelines.
    """
    def _generator() -> Iterator[str]:
        for path in input_files:
            suffix = path.suffix.lower()
            if suffix == ".jsonl":
                yield from stream_jsonl_samples(path, tokenizer)
            elif suffix == ".txt":
                yield from stream_txt_samples(path)
            else:
                # Warnings belong on stderr, not the data/output stream.
                print(f"[warning] Skipping unsupported file type: {path}", file=sys.stderr)

    return _generator()
|
|
|
|
def parse_args() -> argparse.Namespace:
    """
    Collect command-line settings for tokenizer training.

    Returns a namespace with: ``input`` (list of paths), ``output_dir``,
    ``vocab_size``, ``min_frequency``, ``model_max_length``.
    """
    parser = argparse.ArgumentParser(
        description="Train custom Python/JavaScript code tokenizer."
    )
    parser.add_argument(
        "--input",
        nargs="+",
        required=True,
        help="One or more input files (.jsonl or .txt).",
    )
    parser.add_argument(
        "--output_dir",
        default="artifacts/tokenizer/code_tokenizer_v1",
        help="Folder where tokenizer files will be saved.",
    )
    # The three integer knobs share the same shape; register them in one pass
    # (order preserved so --help output matches the original).
    for flag, default, help_text in (
        ("--vocab_size", 50_000, "Tokenizer vocabulary size."),
        ("--min_frequency", 2, "Minimum token frequency."),
        ("--model_max_length", 2048, "Max token length hint."),
    ):
        parser.add_argument(flag, type=int, default=default, help=help_text)
    return parser.parse_args()
|
|
|
|
def main() -> None:
    """
    CLI entry point: validate inputs, train the tokenizer, save artifacts.

    On any failure inside the training pipeline, prints a human-readable
    explanation plus a fix suggestion and exits with status 1.
    """
    args = parse_args()

    try:
        paths = [Path(item) for item in args.input]
        not_found = [str(candidate) for candidate in paths if not candidate.exists()]
        if not_found:
            raise FileNotFoundError(
                "Some input files do not exist:\n- " + "\n- ".join(not_found)
            )

        tokenizer = CodeTokenizer(
            config=CodeTokenizerConfig(
                vocab_size=args.vocab_size,
                min_frequency=args.min_frequency,
                model_max_length=args.model_max_length,
            )
        )
        tokenizer.train(build_stream(input_files=paths, tokenizer=tokenizer))
        tokenizer.save(args.output_dir)

        for message in (
            "Tokenizer training completed successfully.",
            f"Saved tokenizer to: {args.output_dir}",
            "Saved files: tokenizer.json, tokenizer_config.json",
        ):
            print(message)
    except Exception as exc:
        # Top-level boundary handler: report clearly and exit non-zero.
        print("Tokenizer training failed.")
        print(f"What went wrong: {exc}")
        print("Fix suggestion: check file paths and file format, then run again.")
        raise SystemExit(1)
|
|
|
|
# Run only when executed as a script, not when imported for its helpers.
if __name__ == "__main__":
    main()
|
|