| """ |
| Component 2 verification script. |
| |
| This script: |
| 1) Trains tokenizer on a tiny sample file. |
| 2) Saves tokenizer. |
| 3) Loads tokenizer back. |
| 4) Encodes and decodes a sample. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| from pathlib import Path |
|
|
| |
# Repository root (one level above this scripts/ directory), resolved to an
# absolute path so the comparison against sys.path entries is stable.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    # Prepend so the project's own packages shadow any same-named installs.
    sys.path.insert(0, str(PROJECT_ROOT))


# NOTE: this import must come AFTER the sys.path bootstrap above; moving it
# to the top import group would break running this file as a plain script.
from src.tokenizer.code_tokenizer import CodeTokenizer
|
|
|
|
def main() -> None:
    """Run the Component 2 tokenizer round-trip verification.

    Trains a tokenizer on the sample JSONL file, saves it, reloads it,
    then encodes and decodes one formatted training sample, printing a
    short report. Exits with status 1 when the sample file is absent.
    """
    sample_file = Path("data/external/component2_tokenizer_sample.jsonl")
    output_dir = Path("artifacts/tokenizer/code_tokenizer_v1")

    # Guard clause: without the sample file there is nothing to verify.
    if not sample_file.exists():
        print("Verification failed.")
        print(f"Missing sample file: {sample_file}")
        print("Fix suggestion: ensure Component 2 sample file exists and run again.")
        raise SystemExit(1)

    # Imported lazily so the existence check above runs before any heavier
    # training-script imports are pulled in.
    from scripts.train_code_tokenizer import stream_jsonl_samples

    trainer = CodeTokenizer()
    trainer.train(stream_jsonl_samples(sample_file, trainer))
    trainer.save(str(output_dir))

    # Reload from disk to prove the saved artifacts are self-contained.
    reloaded = CodeTokenizer.load(str(output_dir))
    formatted = reloaded.format_training_sample(
        prompt="Write Python function that squares a number.",
        code="def square(x):\n return x * x",
        language="python",
    )
    ids = reloaded.encode(formatted)
    round_trip = reloaded.decode(ids)

    print("=== Component 2 Verification ===")
    print(f"Tokenizer saved to: {output_dir}")
    print(f"Encoded token count: {len(ids)}")
    print("First 25 token IDs:", ids[:25])
    print("Decoded preview:")
    print(round_trip[:300])
    print("")
    print("Component 2 tokenizer verification passed.")
|
|
|
|
if __name__ == "__main__":
    try:
        main()
    except Exception as err:  # boundary handler: report, then exit non-zero
        print("Verification failed.")
        print(f"What went wrong: {err}")
        print("Fix suggestion: activate .venv and rerun this script.")
        sys.exit(1)
|
|