""" Component 2 verification script. This script: 1) Trains tokenizer on a tiny sample file. 2) Saves tokenizer. 3) Loads tokenizer back. 4) Encodes and decodes a sample. """ from __future__ import annotations import sys from pathlib import Path # This makes "src" imports work when script is run from project root. PROJECT_ROOT = Path(__file__).resolve().parents[1] if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT)) from src.tokenizer.code_tokenizer import CodeTokenizer def main() -> None: sample_file = Path("data/external/component2_tokenizer_sample.jsonl") output_dir = Path("artifacts/tokenizer/code_tokenizer_v1") if not sample_file.exists(): print("Verification failed.") print(f"Missing sample file: {sample_file}") print("Fix suggestion: ensure Component 2 sample file exists and run again.") raise SystemExit(1) # Train tokenizer from sample file via script-like path. from scripts.train_code_tokenizer import stream_jsonl_samples # local import on purpose tokenizer = CodeTokenizer() tokenizer.train(stream_jsonl_samples(sample_file, tokenizer)) tokenizer.save(str(output_dir)) loaded = CodeTokenizer.load(str(output_dir)) sample = loaded.format_training_sample( prompt="Write Python function that squares a number.", code="def square(x):\n return x * x", language="python", ) token_ids = loaded.encode(sample) decoded = loaded.decode(token_ids) print("=== Component 2 Verification ===") print(f"Tokenizer saved to: {output_dir}") print(f"Encoded token count: {len(token_ids)}") print("First 25 token IDs:", token_ids[:25]) print("Decoded preview:") print(decoded[:300]) print("") print("Component 2 tokenizer verification passed.") if __name__ == "__main__": try: main() except Exception as exc: print("Verification failed.") print(f"What went wrong: {exc}") print("Fix suggestion: activate .venv and rerun this script.") raise SystemExit(1)