| |
| |
| |
| |
|
|
| import struct |
| import tiktoken |
| import pytest |
|
|
# Module-level GPT-2 encoder shared by the tokenizer tests below.
enc = tiktoken.get_encoding("gpt2")
|
|
| |
|
|
def test_tokenizer_roundtrip_simple():
    """Encoding then decoding a short single-line sentence returns it unchanged."""
    original = "Hello, how are you?"
    token_ids = enc.encode_ordinary(original)
    assert enc.decode(token_ids) == original
|
|
def test_tokenizer_roundtrip_multiline():
    """Encoding then decoding a two-line User/Assistant exchange returns it unchanged."""
    original = "User: What is AI?\nAssistant: AI is artificial intelligence."
    # The embedded newline must survive the encode/decode round trip.
    decoded = enc.decode(enc.encode_ordinary(original))
    assert decoded == original
|
|
def test_tokenizer_roundtrip_system_format():
    """Encoding then decoding a System/User/Assistant prompt returns it unchanged."""
    prompt = "System: You are a helpful assistant.\nUser: Hi\nAssistant:"
    token_ids = enc.encode_ordinary(prompt)
    restored = enc.decode(token_ids)
    assert restored == prompt
|
|
def test_eot_token():
    """The encoder reports 50256 as its end-of-text token id."""
    expected_eot = 50256
    assert enc.eot_token == expected_eot
|
|
def test_encode_returns_valid_ids():
    """Encoding a sentence yields at least one id, all within [0, 50304)."""
    token_ids = enc.encode_ordinary("What is machine learning?")
    assert len(token_ids) >= 1
    # No id may be negative or reach the 50304 upper bound.
    assert not any(tok < 0 or tok >= 50304 for tok in token_ids)
|
|
def test_empty_string():
    """Encoding the empty string produces an empty id list."""
    assert enc.encode_ordinary("") == []
|
|
| |
|
|
def test_vocab_size_padding():
    """Check the arithmetic relating the padded vocab size 50304 to 50257.

    NOTE(review): 50257 is presumably the unpadded tokenizer vocabulary size
    and 50304 the padded model vocabulary size -- confirm against the exporter.
    """
    padded_vocab = 50304
    base_vocab = 50257
    # The padded size is a multiple of 64.
    assert padded_vocab % 64 == 0
    # Padding never shrinks the vocabulary.
    assert padded_vocab >= base_vocab
    # Exactly 47 extra slots are added.
    assert padded_vocab - base_vocab == 47
|
|
| |
|
|
def test_model_bin_exists():
    """Fail unless ``model.bin`` exists relative to the current working directory."""
    import os

    found = os.path.exists("model.bin")
    assert found, "model.bin not found in /app"
|
|
def test_model_bin_header():
    """Check the five leading integer header fields of ``model.bin``.

    The header is unpacked with the ``'iiiii'`` struct format (five native
    C ints), in this order: n_layer, n_head, n_embd, block_size, vocab_size.
    A file shorter than the header fails with a descriptive assertion rather
    than a bare ``struct.error``.
    """
    header_format = 'iiiii'
    header_size = struct.calcsize(header_format)
    with open("model.bin", "rb") as f:
        raw_header = f.read(header_size)
    # Guard against a truncated file before unpacking.
    assert len(raw_header) == header_size, (
        f"model.bin header truncated: got {len(raw_header)} of {header_size} bytes"
    )
    n_layer, n_head, n_embd, block_size, vocab_size = struct.unpack(
        header_format, raw_header
    )
    assert n_layer == 16, f"Expected 16 layers, got {n_layer}"
    assert n_head == 12, f"Expected 12 heads, got {n_head}"
    assert n_embd == 768, f"Expected 768 embd, got {n_embd}"
    assert block_size == 1024, f"Expected 1024 block_size, got {block_size}"
    assert vocab_size == 50304, f"Expected 50304 vocab, got {vocab_size}"
|
|
def test_model_bin_size():
    """Check that ``model.bin`` is strictly between 700 and 850 MB (1 MB = 1e6 bytes)."""
    import os

    megabytes = os.path.getsize("model.bin") / 1e6
    # Both bounds are exclusive.
    within_bounds = 700 < megabytes < 850
    assert within_bounds, f"model.bin size {megabytes:.1f}MB out of expected range"
|
|
| |
|
|
def test_tokenizer_bin_exists():
    """Fail unless ``tokenizer.bin`` exists relative to the current working directory."""
    import os

    found = os.path.exists("tokenizer.bin")
    assert found, "tokenizer.bin not found in /app"
|
|
def test_tokenizer_bin_vocab_size():
    """Check that ``tokenizer.bin`` starts with a native C int equal to 50304.

    A file shorter than four bytes fails with a descriptive assertion rather
    than a bare ``struct.error``.
    """
    with open("tokenizer.bin", "rb") as f:
        raw = f.read(4)
    # Guard against a truncated file before unpacking.
    assert len(raw) == 4, (
        f"tokenizer.bin too short for vocab_size field: got {len(raw)} of 4 bytes"
    )
    vocab_size = struct.unpack('i', raw)[0]
    assert vocab_size == 50304, f"Expected 50304 vocab, got {vocab_size}"
|
|
def test_tokenizer_bin_readable():
    """Read first 10 token entries and verify they have valid lengths.

    Layout as read here: a native C int ``vocab_size``, then for each entry a
    native C int byte length followed by that many bytes. Each length must be
    in (0, 64]. A non-positive ``vocab_size`` or a truncated file fails with a
    descriptive assertion instead of passing vacuously or raising a bare
    ``struct.error``.
    """

    def read_int(stream, what):
        # Read one 4-byte int, failing clearly if the file ends early.
        raw = stream.read(4)
        assert len(raw) == 4, f"tokenizer.bin truncated while reading {what}"
        return struct.unpack('i', raw)[0]

    with open("tokenizer.bin", "rb") as f:
        vocab_size = read_int(f, "vocab_size")
        # Without this check the loop below would run zero times and the
        # test would pass without inspecting any entry.
        assert vocab_size > 0, f"vocab_size {vocab_size} is not positive"
        for index in range(min(10, vocab_size)):
            length = read_int(f, f"length of token {index}")
            assert 0 < length <= 64, f"Token length {length} out of range"
            token_bytes = f.read(length)
            assert len(token_bytes) == length, (
                f"Token {index} truncated: got {len(token_bytes)} of {length} bytes"
            )
|
|
| |
|
|
def test_head_size_divisible():
    """An embedding width of 768 splits evenly across 12 heads, 64 per head."""
    n_embd, n_head = 768, 12
    head_size, remainder = divmod(n_embd, n_head)
    assert remainder == 0
    assert head_size == 64
|
|
def test_mlp_expansion():
    """Four times the embedding width of 768 equals 3072."""
    n_embd = 768
    expanded_width = 4 * n_embd
    assert expanded_width == 3072