"""
Component 2 verification script.

This script:
1) Trains the tokenizer on a tiny sample file.
2) Saves the tokenizer.
3) Loads the tokenizer back.
4) Encodes and decodes a sample.
"""
from __future__ import annotations
import sys
from pathlib import Path
# Import bootstrap: the directory one level above this script's folder is
# treated as the project root and placed at the front of sys.path (only if it
# is not already listed) so that "src" imports resolve when the script is run
# from the project root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
_project_root_entry = str(PROJECT_ROOT)
if _project_root_entry not in sys.path:
    sys.path.insert(0, _project_root_entry)
from src.tokenizer.code_tokenizer import CodeTokenizer
def main() -> None:
    """Train, save, reload, and exercise the code tokenizer on a small sample.

    Steps, in order:
      1. Check that the sample JSONL file exists; if it does not, print
         guidance and exit with status 1.
      2. Train a fresh ``CodeTokenizer`` on the output of
         ``stream_jsonl_samples``.
      3. Save the tokenizer to the output directory and load it back.
      4. Format one prompt/code sample with the loaded tokenizer, encode it,
         decode the resulting IDs, and print a short report.

    Both paths are relative, so they resolve against the current working
    directory.

    NOTE: the decoded text is only printed, not compared with the input; the
    final "passed" line is printed whenever no step raises.

    Raises:
        SystemExit: with code 1 when the sample file is missing.
    """
    corpus_path = Path("data/external/component2_tokenizer_sample.jsonl")
    artifact_dir = Path("artifacts/tokenizer/code_tokenizer_v1")

    # Guard clause: without the sample file there is nothing to train on.
    if not corpus_path.exists():
        for message in (
            "Verification failed.",
            f"Missing sample file: {corpus_path}",
            "Fix suggestion: ensure Component 2 sample file exists and run again.",
        ):
            print(message)
        raise SystemExit(1)

    # Imported locally on purpose (kept out of the module-level imports).
    from scripts.train_code_tokenizer import stream_jsonl_samples

    # Train on the streamed samples, then persist the result.
    fresh_tokenizer = CodeTokenizer()
    training_samples = stream_jsonl_samples(corpus_path, fresh_tokenizer)
    fresh_tokenizer.train(training_samples)

    artifact_location = str(artifact_dir)
    fresh_tokenizer.save(artifact_location)

    # Everything below uses the instance loaded back from disk, not the one
    # that was just trained.
    restored = CodeTokenizer.load(artifact_location)
    formatted = restored.format_training_sample(
        prompt="Write Python function that squares a number.",
        code="def square(x):\n return x * x",
        language="python",
    )
    ids = restored.encode(formatted)
    round_trip_text = restored.decode(ids)

    print("=== Component 2 Verification ===")
    print(f"Tokenizer saved to: {artifact_dir}")
    print(f"Encoded token count: {len(ids)}")
    print("First 25 token IDs:", ids[:25])
    print("Decoded preview:")
    print(round_trip_text[:300])
    print("")
    print("Component 2 tokenizer verification passed.")
if __name__ == "__main__":
    # Script entry point. Any ordinary exception escaping main() is reported
    # on stdout and converted into exit status 1. A SystemExit raised inside
    # main() is not an Exception subclass, so it passes through unchanged.
    try:
        main()
    except Exception as failure:
        print("Verification failed.")
        print(f"What went wrong: {failure}")
        print("Fix suggestion: activate .venv and rerun this script.")
        sys.exit(1)