OpenTransformer committed on
Commit
19ed98b
·
verified ·
1 Parent(s): 0ebe638

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +66 -0
  2. bench_fwd.py +70 -0
  3. bench_gen.py +71 -0
  4. bench_prompt.py +86 -0
  5. build.sh +82 -0
  6. concat_unary +0 -0
  7. concat_unary.c +608 -0
  8. convert.py +205 -0
  9. convert_fast.py +226 -0
  10. convert_log_unary.py +159 -0
  11. convert_proper_unary.py +164 -0
  12. convert_proper_unary_v2.py +247 -0
  13. convert_qwen3.py +149 -0
  14. convert_qwen3_v2.py +161 -0
  15. deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales +0 -0
  16. deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16 +0 -0
  17. deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales +0 -0
  18. deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg +0 -0
  19. deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales +3 -0
  20. deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos +0 -0
  21. inference.py +503 -0
  22. log_unary_engine.c +598 -0
  23. logunary_tensor.c +534 -0
  24. packed_convert.py +79 -0
  25. packed_engine.c +408 -0
  26. packed_loader.py +134 -0
  27. proper_unary +0 -0
  28. proper_unary.c +563 -0
  29. pure_unary_engine.c +658 -0
  30. run_convert.py +76 -0
  31. run_log_unary.py +123 -0
  32. run_pure_unary.py +176 -0
  33. run_qwen3_4b.py +221 -0
  34. server.py +107 -0
  35. ternary_kernel.c +265 -0
  36. test_logunary +0 -0
  37. test_logunary.c +153 -0
  38. test_popcount.py +99 -0
  39. true_unary +0 -0
  40. true_unary.c +552 -0
  41. unary_convert.py +189 -0
  42. unary_convert_v2.py +134 -0
  43. unary_engine.c +381 -0
  44. unary_engine_v2.c +629 -0
  45. unary_full.c +742 -0
  46. unary_group_convert.py +192 -0
  47. unary_kernel.c +120 -0
  48. unary_loader.py +202 -0
  49. unary_run.py +203 -0
  50. unary_run16.py +203 -0
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - quantization
5
+ - unary
6
+ - thermometer-encoding
7
+ - inference-engine
8
+ - low-bit
9
+ language:
10
+ - en
11
+ ---
12
+
13
+ # Unary Quantization Research
14
+
15
+ True unary (base-1) quantization for neural network weights. NOT binary.
16
+
17
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+
19
+ ## Overview
20
+
21
+ Unary means magnitude N = N consecutive 1-bits across N bitplanes. Each bitplane contributes value=1, not binary powers. This eliminates multiplication from inference — only addition and popcount.
22
+
23
+ 7-plane unary gives 8 magnitude levels (15 distinct values with sign), achieving 0.97 cosine similarity per layer against FP32 originals.
24
+
25
+ ## Contents
26
+
27
+ ### Converters (Python)
28
+ - `unary_convert.py` / `unary_convert_v2.py` — Base unary thermometer conversion
29
+ - `convert_proper_unary.py` / `convert_proper_unary_v2.py` — Proper unary with group quantization
30
+ - `convert_log_unary.py` — Log-spaced unary variant
31
+ - `convert_fast.py` — Optimised conversion pipeline
32
+ - `packed_convert.py` / `packed_loader.py` — Packed binary format
33
+ - `convert_qwen3.py` / `convert_qwen3_v2.py` — Qwen3-4B specific converters
34
+
35
+ ### C Inference Engines (AVX-512 + POPCNT)
36
+ - `unary_engine.c` / `unary_engine_v2.c` — Core unary inference
37
+ - `pure_unary_engine.c` — Pure unary (no FP in linear layers)
38
+ - `log_unary_engine.c` — Log-unary engine
39
+ - `proper_unary.c` — Proper unary with group scales
40
+ - `true_unary.c` — True base-1 unary engine
41
+ - `concat_unary.c` — Concatenated unary engine
42
+ - `packed_engine.c` — Packed bitplane engine
43
+ - `unary_full.c` — Full forward pass engine
44
+
45
+ ### Converted Models
46
+ - `deepseek-r1-1.5b-*` — DeepSeek-R1-1.5B in multiple unary variants (4-plane, 7-plane, 31-plane, grouped, packed, ternary baseline)
47
+ - `qwen3-4b-*` — Qwen3-4B-Thinking in unary, log-unary, and proper-unary variants
48
+
49
+ ### Benchmarks and Runners
50
+ - `bench_fwd.py` / `bench_gen.py` / `bench_prompt.py` — Performance benchmarks
51
+ - `inference.py` / `server.py` — Python inference and API server
52
+ - Various `run_*.py` — Model-specific runners
53
+
54
+ ## Key Insight
55
+
56
+ Unary quantization trades bits-per-weight for computational simplicity. All multiply-accumulate operations become popcount + addition, making this particularly suited for edge/CPU inference where SIMD popcount is fast.
57
+
58
+ ## Building
59
+
60
+ ```bash
61
+ gcc -O3 -mavx512f -mavx512bw -mpopcnt -o unary_engine unary_engine.c -lm
62
+ ```
63
+
64
+ ## License
65
+
66
+ Apache 2.0
bench_fwd.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark a single forward pass of the unary C inference engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weight files from MODEL_DIR,
hands the raw buffers to unary_engine.so via ctypes, then times
forward_token() over a few runs and reports tokens/second.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

print("Loading model...")
m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Model loaded, benchmarking single forward pass...")
lib.model_reset_cache(m)

# Time single forward pass (token_id=1, pos=0). perf_counter() is
# monotonic and high-resolution — preferred over time.time() for timing.
times = []
for i in range(3):
    lib.model_reset_cache(m)
    t0 = time.perf_counter()
    lib.forward_token(m, 1, 0)
    dt = time.perf_counter() - t0
    times.append(dt)
    print(f"  forward_token run {i}: {dt:.3f}s")

avg = sum(times) / len(times)
print(f"\nAvg: {avg:.3f}s per token = {1/avg:.1f} tok/s")
print(f"OMP threads: {os.environ.get('OMP_NUM_THREADS', 'default')}")
bench_gen.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark token generation with the unary C inference engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weights, runs the C generate()
entry point greedily from a BOS token, and reports tokens/second plus
the decoded text.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Loaded. Testing generate with greedy (temp=0)...")
lib.model_reset_cache(m)

inp = np.array([1], dtype=np.int32)  # just BOS token
out = np.zeros(8, dtype=np.int32)

# perf_counter() is monotonic and high-resolution — preferred over
# time.time() for timing.
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, 1, out.ctypes.data, 8,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), 151643)
dt = time.perf_counter() - t0
print(f"Generated {ng} tokens in {dt:.1f}s = {ng/dt:.1f} tok/s")
print(f"Token IDs: {out[:ng].tolist()}")

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
print(f"Text: {tok.decode(out[:ng].tolist())}")
bench_prompt.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark prompt-conditioned generation with the unary C engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weights, tokenizes a small
prompt with the HF tokenizer, then runs the C generate() entry point
twice (greedy and temperature sampling) and reports tokens/second.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)

# Test with actual prompt
prompt = "What is 2+2? Think step by step."
ids = tok.encode(prompt)
inp = np.array(ids, dtype=np.int32)
out = np.zeros(64, dtype=np.int32)
lib.model_reset_cache(m)

print(f"Prompt: {prompt} ({len(ids)} tokens)")

# Test greedy first.  perf_counter() is monotonic and high-resolution —
# preferred over time.time() for timing.
print("\n--- Greedy ---")
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, len(ids), out.ctypes.data, 64,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), tok.eos_token_id)
dt = time.perf_counter() - t0
text = tok.decode(out[:ng].tolist(), skip_special_tokens=False)
print(f"{ng} tokens, {dt:.1f}s, {ng/dt:.1f} tok/s")
print(f"Output: {text}")

# Test with temperature
print("\n--- Temperature=0.6 ---")
lib.model_reset_cache(m)
out2 = np.zeros(64, dtype=np.int32)
t0 = time.perf_counter()
ng2 = lib.generate(m, inp.ctypes.data, len(ids), out2.ctypes.data, 64,
                   ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
dt2 = time.perf_counter() - t0
text2 = tok.decode(out2[:ng2].tolist(), skip_special_tokens=False)
print(f"{ng2} tokens, {dt2:.1f}s, {ng2/dt2:.1f} tok/s")
print(f"Output: {text2}")
build.sh ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Build and deploy ternary inference engine
# (c) 2026 OpenTransformers Ltd / Scott Bisset
#
# Pipeline: compile the AVX-512 kernel, fetch the HF checkpoint, convert
# it to the ternary on-disk format, then run a quick speed sanity check.
# All path expansions are quoted so the script is safe even if WORKDIR
# is ever moved to a path containing spaces.

set -e

WORKDIR=/root/ternary_engine
MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_HF_DIR=$WORKDIR/deepseek-r1-1.5b-hf
TERNARY_DIR=$WORKDIR/deepseek-r1-1.5b-ternary

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

mkdir -p "$WORKDIR"
cd "$WORKDIR"

# Step 1: Compile C kernel with AVX-512
echo "[1/4] Compiling AVX-512 kernel..."
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
    -shared -fPIC -lm \
    -o ternary_kernel.so ternary_kernel.c
echo "  -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# Step 2: Download model from HuggingFace
echo ""
echo "[2/4] Downloading model weights..."
pip install --break-system-packages -q safetensors tokenizers 2>/dev/null
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download('$MODEL_HF', local_dir='$MODEL_HF_DIR',
                  ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
"

# Step 3: Convert to ternary
echo ""
echo "[3/4] Converting to ternary format..."
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# Step 4: Verify
echo ""
echo "[4/4] Verifying..."
ls -lh "$TERNARY_DIR"/ | head -20
echo ""
du -sh "$TERNARY_DIR"/
echo ""

# Quick test
echo "Running speed test..."
python3 -c "
from inference import TernaryQwen, load_kernel
import time
import os

kernel = load_kernel('$WORKDIR/ternary_kernel.so')
model = TernaryQwen('$TERNARY_DIR', kernel)

# Warm up
import numpy as np
cache_module = __import__('inference')
cache = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
hidden = model.forward_token(9707, cache, 0)  # 'Hello'

# Benchmark single token
times = []
for i in range(5):
    cache2 = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
    t0 = time.time()
    h = model.forward_token(9707, cache2, 0)
    times.append(time.time() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f\"{t*1000:.1f}ms\" for t in times]}')
"

echo ""
echo "=== Build complete ==="
echo "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"
concat_unary ADDED
Binary file (26.1 kB). View file
 
concat_unary.c ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * CONCATENATIVE UNARY ENGINE
3
+ *
4
+ * In base-1, the value IS the count of ones.
5
+ * Addition = concatenation of bitstreams.
6
+ * Multiplication = AND + count.
7
+ *
8
+ * REPRESENTATION:
9
+ * Each element of a vector has:
10
+ * - A sign bit (positive/negative)
11
+ * - A magnitude = number of 1-bits across K "slots"
12
+ *
13
+ * But crucially, when we ADD two unary vectors (residual connection),
14
+ * we DON'T dequantize-add-requantize. We CONCATENATE the slots.
15
+ *
16
+ * If vector A has K_a slots and vector B has K_b slots,
17
+ * A + B has K_a + K_b slots. The magnitude of element j is
18
+ * just the total count of 1-bits at position j across ALL slots.
19
+ *
20
+ * This means the residual stream GROWS through the network:
21
+ * After embed: K_0 slots
22
+ * After layer 1: K_0 + K_attn + K_mlp slots
23
+ * After layer L: K_0 + L*(K_attn + K_mlp) slots
24
+ *
25
+ * No information is ever destroyed by requantization.
26
+ *
27
+ * MATMUL:
28
+ * y = W @ x where W has K_w slots and x has K_x slots.
29
+ * For each output element y[i]:
30
+ * For each slot pair (p from W, q from x):
31
+ * count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
32
+ * - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
33
+ * Output gets K_out = some fixed number of slots (requantized)
34
+ * because matmul output magnitude is in a different scale.
35
+ *
36
+ * SAME-SIGN ADD (residual):
37
+ * Just append slots. Zero compute.
38
+ * For different signs: need cancellation.
39
+ * In practice residual connections are same-sign-dominant,
40
+ * so we track sign separately and concat magnitudes,
41
+ * deferring cancellation to the next norm.
42
+ *
43
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
44
+ */
45
+
46
+ #define _POSIX_C_SOURCE 199309L
47
+ #include <immintrin.h>
48
+ #include <omp.h>
49
+ #include <stdint.h>
50
+ #include <stdlib.h>
51
+ #include <string.h>
52
+ #include <math.h>
53
+ #include <stdio.h>
54
+ #include <time.h>
55
+
56
+ /* ============================================================
57
+ * GROWABLE UNARY VECTOR
58
+ *
59
+ * The key data structure. Slots can be appended (concat = add).
60
+ * Each slot is a bitplane of dim bits packed into uint64 chunks.
61
+ *
62
+ * sign: uint64[chunks] — per-element sign
63
+ * slots: uint64[n_slots * chunks] — each slot is chunks uint64s
64
+ * n_slots: current number of slots (grows via concat)
65
+ * max_slots: allocated capacity
66
+ *
67
+ * For element j:
68
+ * magnitude = number of slots where bit j is set
69
+ * value = sign * magnitude * scale
70
+ *
71
+ * ============================================================ */
72
/* Growable sign-magnitude unary vector.
 *
 * Element j's magnitude is the number of slots whose bit j is set;
 * its value is sign(j) * magnitude * scale. Appending slots
 * (see gv_concat) implements addition without requantization. */
typedef struct {
    uint64_t *sign;   /* [chunks] per-element sign bits (set = negative) */
    uint64_t *slots;  /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
    float scale;      /* per-vector scale factor */
    int dim;          /* number of elements */
    int chunks;       /* (dim+63)/64 — uint64 words per slot */
    int n_slots;      /* current slot count */
    int max_slots;    /* allocated capacity */
} GrowVec;
81
+
82
+ /* Fixed-size unary matrix (weights don't grow) */
83
/* Fixed-size unary weight matrix with K bitplanes (does not grow).
 * Slot-major layout: bit (row r, chunk c) of slot s lives at
 * slots[((size_t)s * rows + r) * chunks + c] — see fm_from_float. */
typedef struct {
    uint64_t *sign;   /* [rows * chunks] per-weight sign bits */
    uint64_t *slots;  /* [K * rows * chunks] thermometer bitplanes */
    float *scales;    /* [rows] per-row scale */
    int rows, cols, chunks, K;  /* chunks = (cols+63)/64 */
} FixedMat;
89
+
90
+ /* ============================================================
91
+ * ALLOCATION
92
+ * ============================================================ */
93
+ GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
94
+ GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec));
95
+ v->dim = dim;
96
+ v->chunks = (dim + 63) / 64;
97
+ v->n_slots = 0;
98
+ v->max_slots = max_slots;
99
+ v->scale = 1.0f;
100
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
101
+ v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t));
102
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
103
+ memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
104
+ return v;
105
+ }
106
+
107
+ void gv_free(GrowVec *v) {
108
+ if (v) { free(v->sign); free(v->slots); free(v); }
109
+ }
110
+
111
+ FixedMat* fm_alloc(int rows, int cols, int K) {
112
+ FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat));
113
+ m->rows = rows; m->cols = cols; m->K = K;
114
+ m->chunks = (cols + 63) / 64;
115
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
116
+ m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
117
+ m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
118
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
119
+ memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
120
+ return m;
121
+ }
122
+
123
+ void fm_free(FixedMat *m) {
124
+ if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); }
125
+ }
126
+
127
+ /* ============================================================
128
+ * FLOAT → UNARY CONVERSION (only at boundaries)
129
+ * ============================================================ */
130
+ void gv_from_float(GrowVec *v, const float *x, int K) {
131
+ int dim = v->dim, chunks = v->chunks;
132
+
133
+ v->n_slots = K;
134
+ memset(v->sign, 0, chunks * sizeof(uint64_t));
135
+ memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));
136
+
137
+ float amax = 0.0f;
138
+ for (int i = 0; i < dim; i++) {
139
+ float a = fabsf(x[i]);
140
+ if (a > amax) amax = a;
141
+ }
142
+ if (amax == 0.0f) { v->scale = 1.0f; return; }
143
+ v->scale = amax / K;
144
+ float inv = K / amax;
145
+
146
+ for (int i = 0; i < dim; i++) {
147
+ int c = i / 64;
148
+ uint64_t bit = 1ULL << (i % 64);
149
+
150
+ if (x[i] < 0.0f) v->sign[c] |= bit;
151
+
152
+ int mag = (int)(fabsf(x[i]) * inv + 0.5f);
153
+ if (mag > K) mag = K;
154
+ for (int s = 0; s < mag; s++)
155
+ v->slots[(size_t)s * chunks + c] |= bit;
156
+ }
157
+ }
158
+
159
+ void gv_to_float(const GrowVec *v, float *out) {
160
+ int dim = v->dim, chunks = v->chunks;
161
+
162
+ for (int i = 0; i < dim; i++) {
163
+ int c = i / 64;
164
+ uint64_t bit = 1ULL << (i % 64);
165
+
166
+ int mag = 0;
167
+ for (int s = 0; s < v->n_slots; s++) {
168
+ if (v->slots[(size_t)s * chunks + c] & bit)
169
+ mag++;
170
+ }
171
+
172
+ float val = (float)mag * v->scale;
173
+ out[i] = (v->sign[c] & bit) ? -val : val;
174
+ }
175
+ }
176
+
177
/* Quantize a row-major float matrix into K thermometer-coded bitplanes.
 *
 * Each row gets an independent scale (per-row absmax / K). Storage is
 * slot-major: bit (r, c) of slot s lives at
 * slots[((size_t)s * rows + r) * chunks + c]. A row that is entirely
 * zero gets scale 1.0 and no set bits. */
void fm_from_float(FixedMat *m, const float *data) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        /* Per-row absmax fixes this row's quantization step. */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
        m->scales[r] = amax / K;
        float inv = K / amax;

        uint64_t *rs = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* Round magnitude to nearest of K levels, clamp, then
             * set bits in slots 0..mag-1 (thermometer encoding). */
            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
207
+
208
+ /* ============================================================
209
+ * CONCATENATION = ADDITION
210
+ *
211
+ * gv_concat(dst, src):
212
+ * Appends src's slots to dst.
213
+ * Same-sign: just append.
214
+ * Different-sign: cancel bits (remove from both).
215
+ *
216
+ * For efficiency with residual connections where scales differ:
217
+ * We track a "slot_scales" or use a single scale with normalization.
218
+ *
219
+ * SIMPLE VERSION: assumes same scale (works after norm).
220
+ * ============================================================ */
221
+
222
+ /* Simple concat: append src slots to dst. Handles sign cancellation. */
223
/* Unary addition via concatenation: dst += src.
 *
 * Appends each of src's slots to dst as one new slot. Where the
 * per-element signs agree, a set bit is simply copied into the new
 * slot; where they disagree, one unit of magnitude cancels, so a set
 * bit is cleared from an existing dst slot instead. If src's magnitude
 * exceeds dst's for some element, the leftover cancellation flips that
 * element's sign bit in dst and the remainder is appended.
 *
 * Assumes dst and src share the same scale (the comment block above
 * calls this the "SIMPLE VERSION" — valid after a norm). On capacity
 * exhaustion a warning is printed and the remaining src slots are
 * dropped. */
void gv_concat(GrowVec *dst, const GrowVec *src) {
    int chunks = dst->chunks;

    /* For each source slot, process element-wise:
     * Where signs agree: copy bit to new dst slot
     * Where signs differ: cancel - find a dst slot with that bit set and clear it
     *
     * Optimization: for most transformer residuals, signs mostly agree.
     * So we do the simple thing: compute per-element sign agreement,
     * then for agreeing elements just append, for disagreeing elements cancel.
     */

    /* Sign agreement mask */
    /* agree[c] = ~(dst_sign[c] ^ src_sign[c]) — bits where signs match */

    for (int s = 0; s < src->n_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

        /* Split into agree and disagree portions */
        int new_slot = dst->n_slots;
        if (new_slot >= dst->max_slots) {
            /* Out of room — would need realloc in production */
            printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
            return;
        }
        uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t src_bits = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: just append to new slot */
            uint64_t to_add = src_bits & agree;

            /* Different sign: cancel from existing dst slots */
            uint64_t to_cancel = src_bits & disagree;

            /* Cancel by walking backwards through dst slots; each
             * cleared bit removes one unit of dst magnitude. */
            for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
                uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *dslot & to_cancel;
                *dslot &= ~overlap; /* clear cancelled bits in dst */
                to_cancel &= ~overlap; /* mark as cancelled */
            }

            /* Any remaining to_cancel means src > dst for those elements
             * — flip the sign and add to new slot */
            if (to_cancel) {
                dst->sign[c] ^= to_cancel; /* flip sign for these elements */
                to_add |= to_cancel;
            }

            dst_new[c] = to_add;
        }

        /* Only increment if new slot is non-empty — keeps n_slots (and
         * therefore downstream matmul cost) from growing needlessly. */
        int non_empty = 0;
        for (int c = 0; c < chunks && !non_empty; c++)
            if (dst_new[c]) non_empty = 1;
        if (non_empty)
            dst->n_slots++;
    }
}
287
+
288
+ /* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */
289
+ void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
290
+ int chunks = dst->chunks;
291
+ int src_slots = src->n_slots;
292
+
293
+ if (dst->n_slots + src_slots > dst->max_slots) {
294
+ printf("WARNING: GrowVec overflow\n");
295
+ src_slots = dst->max_slots - dst->n_slots;
296
+ }
297
+
298
+ /* Just memcpy the slots — handles same-sign correctly,
299
+ * defers opposite-sign cancellation to next norm */
300
+ memcpy(dst->slots + (size_t)dst->n_slots * chunks,
301
+ src->slots,
302
+ (size_t)src_slots * chunks * sizeof(uint64_t));
303
+ dst->n_slots += src_slots;
304
+ }
305
+
306
+ /* ============================================================
307
+ * MATMUL: y = M @ x
308
+ *
309
+ * M is fixed (K_w slots), x is growable (n_slots slots).
310
+ * Output is a NEW GrowVec with K_out slots.
311
+ *
312
+ * Core: for each output element i, accumulate:
313
+ * acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
314
+ * - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
315
+ *
316
+ * Then quantize acc to K_out unary slots.
317
+ * ============================================================ */
318
/* y = M @ x over bitplane-decomposed operands.
 *
 * Per output row the dot product is accumulated as signed popcounts over
 * every (weight plane p, activation plane q) pair:
 *   acc += popcount(w_p[i] & x_q & same_sign)
 *        - popcount(w_p[i] & x_q & diff_sign)
 * Cost is O(out_dim * chunks * wK * xK); rows are OpenMP-parallel.
 * The float intermediate is requantized into y with K_out slots. */
void gv_matmul(
    const FixedMat *M,
    const GrowVec *x,
    GrowVec *y,       /* output — gets filled with K_out slots */
    int K_out         /* how many output slots */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    /* NOTE(review): C11 aligned_alloc requires the size to be a multiple of
     * the alignment; out_dim * 4 need not be a multiple of 64. glibc accepts
     * it, strict implementations may return NULL — confirm target libc. */
    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);  /* product positive where signs match */
            uint64_t diff = ws ^ xs;     /* product negative where signs differ */

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    acc += __builtin_popcountll(active & same)
                         - __builtin_popcountll(active & diff);
                }
            }
        }

        /* Combine the integer count with the per-row weight scale and the
         * activation scale to recover a float dot product. */
        y_float[i] = (float)acc * M->scales[i] * x->scale;
    }

    /* Quantize to K_out slots */
    gv_from_float(y, y_float, K_out);
    free(y_float);
}
361
+
362
+ /* ============================================================
363
+ * NORM: GrowVec → GrowVec with controlled slot count
364
+ *
365
+ * RMSNorm dequantizes (counting), normalizes (float),
366
+ * then requantizes to a fixed K.
367
+ * This is where slot count gets reset.
368
+ * ============================================================ */
369
+ void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
370
+ int dim = x->dim;
371
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
372
+ gv_to_float(x, xf);
373
+
374
+ float ss = 0.0f;
375
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
376
+ float rms = 1.0f / sqrtf(ss / dim + eps);
377
+ for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];
378
+
379
+ gv_from_float(out, xf, K_out);
380
+ free(xf);
381
+ }
382
+
383
+ /* ============================================================
384
+ * SILU_MUL: out = SiLU(gate) * up
385
+ * Dequant, compute, requant. O(dim).
386
+ * ============================================================ */
387
+ void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
388
+ int dim = gate->dim;
389
+ float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
390
+ float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
391
+ gv_to_float(gate, gf);
392
+ gv_to_float(up, uf);
393
+
394
+ for (int i = 0; i < dim; i++)
395
+ gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
396
+
397
+ gv_from_float(out, gf, K_out);
398
+ free(gf); free(uf);
399
+ }
400
+
401
+ /* ============================================================
402
+ * TEST: demonstrate growing residual stream
403
+ * ============================================================ */
404
/* Demonstrates that concatenating two quantized vectors' slots recovers
 * their elementwise sum (when scales match), printing float reference vs
 * the concat result for the first 8 of 16 elements. */
void test_concat_add() {
    printf("=== CONCATENATION = ADDITION TEST ===\n\n");

    int dim = 16;

    /* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */
    float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
    float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};

    GrowVec *a = gv_alloc(dim, 8, 64);
    GrowVec *b = gv_alloc(dim, 8, 64);
    gv_from_float(a, a_vals, 8);
    gv_from_float(b, b_vals, 8);

    printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
    float af[16], bf[16];
    gv_to_float(a, af);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n");

    printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
    gv_to_float(b, bf);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n");

    /* Concatenate (= add) */
    printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);

    /* Need same scale for concat to work correctly */
    /* In a real network, both come from norm so they have comparable scale */
    /* For this test, use fast concat (no cancellation) */
    /* NOTE(review): the comment above says "fast concat" but the cancelling
     * gv_concat is what is actually called — confirm which was intended. */
    gv_concat(a, b);
    printf(" -> %d):\n", a->n_slots);

    float result[16], ref[16];
    gv_to_float(a, result);
    for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i];

    /* NOTE: concat addition only works correctly when scales match.
     * When scales differ, we'd need to adjust. In a transformer,
     * the norm before each sublayer ensures comparable scales. */

    printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n");
    printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n");

    gv_free(a); gv_free(b);
}
451
+
452
+ void test_growing_residual() {
453
+ printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
454
+ printf("Simulating 6 transformer layers with concat residuals\n\n");
455
+
456
+ int dim = 2560;
457
+ int K_embed = 16; /* initial embedding quantization */
458
+ int K_sublayer = 8; /* each sublayer output */
459
+ int n_layers = 6;
460
+
461
+ /* Create random embedding */
462
+ float *embed = (float *)malloc(dim * sizeof(float));
463
+ srand(42);
464
+ for (int i = 0; i < dim; i++) {
465
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
466
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
467
+ embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
468
+ }
469
+
470
+ /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */
471
+ int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;
472
+ GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
473
+ gv_from_float(residual, embed, K_embed);
474
+
475
+ printf("After embedding: %d slots (%.1f KB)\n",
476
+ residual->n_slots,
477
+ (float)residual->n_slots * residual->chunks * 8 / 1024);
478
+
479
+ for (int l = 0; l < n_layers; l++) {
480
+ /* Simulate attention output */
481
+ GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer);
482
+ float *fake_attn = (float *)malloc(dim * sizeof(float));
483
+ for (int i = 0; i < dim; i++) {
484
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
485
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
486
+ fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
487
+ }
488
+ gv_from_float(attn_out, fake_attn, K_sublayer);
489
+ /* Scale must match for concat to work — in real net, norm handles this */
490
+ attn_out->scale = residual->scale;
491
+
492
+ /* RESIDUAL ADD = CONCATENATION */
493
+ gv_concat_fast(residual, attn_out);
494
+
495
+ /* Simulate MLP output */
496
+ GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer);
497
+ float *fake_mlp = (float *)malloc(dim * sizeof(float));
498
+ for (int i = 0; i < dim; i++) {
499
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
500
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
501
+ fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
502
+ }
503
+ gv_from_float(mlp_out, fake_mlp, K_sublayer);
504
+ mlp_out->scale = residual->scale;
505
+
506
+ /* RESIDUAL ADD = CONCATENATION */
507
+ gv_concat_fast(residual, mlp_out);
508
+
509
+ printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
510
+ l + 1, residual->n_slots,
511
+ (float)residual->n_slots * residual->chunks * 8 / 1024,
512
+ K_sublayer, K_sublayer);
513
+
514
+ gv_free(attn_out); gv_free(mlp_out);
515
+ free(fake_attn); free(fake_mlp);
516
+ }
517
+
518
+ printf("\nResidual grew from %d to %d slots through %d layers\n",
519
+ K_embed, residual->n_slots, n_layers);
520
+ printf("Information accumulated, never lost to requantization\n");
521
+
522
+ gv_free(residual);
523
+ free(embed);
524
+ }
525
+
526
+ void test_matmul_accuracy() {
527
+ printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");
528
+
529
+ int rows = 512, cols = 2560;
530
+ int wK = 32;
531
+
532
+ printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
533
+ printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");
534
+
535
+ srand(42);
536
+ float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
537
+ float *xf = (float *)malloc(cols * sizeof(float));
538
+ float *y_ref = (float *)calloc(rows, sizeof(float));
539
+
540
+ for (size_t i = 0; i < (size_t)rows * cols; i++) {
541
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
542
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
543
+ Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
544
+ }
545
+ for (int i = 0; i < cols; i++) {
546
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
547
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
548
+ xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
549
+ }
550
+ for (int i = 0; i < rows; i++)
551
+ for (int j = 0; j < cols; j++)
552
+ y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
553
+
554
+ FixedMat *M = fm_alloc(rows, cols, wK);
555
+ fm_from_float(M, Mf);
556
+
557
+ /* Test with different x slot counts (simulating growing residual) */
558
+ int x_slots[] = {8, 16, 32, 48, 64, 96};
559
+ for (int t = 0; t < 6; t++) {
560
+ int xK = x_slots[t];
561
+ GrowVec *x = gv_alloc(cols, xK, xK);
562
+ GrowVec *y = gv_alloc(rows, xK, xK);
563
+ gv_from_float(x, xf, xK);
564
+
565
+ struct timespec t0, t1;
566
+ float *yf = (float *)malloc(rows * sizeof(float));
567
+
568
+ clock_gettime(CLOCK_MONOTONIC, &t0);
569
+ gv_matmul(M, x, y, xK);
570
+ clock_gettime(CLOCK_MONOTONIC, &t1);
571
+ double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;
572
+
573
+ gv_to_float(y, yf);
574
+
575
+ float dot = 0, na = 0, nb = 0, noise = 0;
576
+ for (int i = 0; i < rows; i++) {
577
+ dot += y_ref[i] * yf[i];
578
+ na += y_ref[i] * y_ref[i];
579
+ nb += yf[i] * yf[i];
580
+ float e = y_ref[i] - yf[i];
581
+ noise += e * e;
582
+ }
583
+ float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
584
+ float snr = 10.0f * log10f(na / (noise + 1e-10f));
585
+
586
+ printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);
587
+
588
+ gv_free(x); gv_free(y); free(yf);
589
+ }
590
+
591
+ fm_free(M);
592
+ free(Mf); free(xf); free(y_ref);
593
+ }
594
+
595
/* Entry point: runs the three demonstrations in sequence —
 * concat-as-addition, growing residual stream, and matmul accuracy. */
int main() {
    printf("========================================\n");
    printf(" CONCATENATIVE UNARY ENGINE TESTS\n");
    printf(" Addition = Concatenation\n");
    printf(" Value = Count of Ones\n");
    printf("========================================\n");

    test_concat_add();
    test_growing_residual();
    test_matmul_accuracy();

    printf("\n=== ALL TESTS DONE ===\n");
    return 0;
}
convert.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert DeepSeek-R1-Distill-Qwen-1.5B to ternary format.
4
+
5
+ Stores linear weights as bitplanes (pos_mask, neg_mask) + per-row scale.
6
+ Embeddings and layernorms stay FP16. LM head stays FP16.
7
+
8
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
9
+ """
10
+
11
+ import os
12
+ import json
13
+ import struct
14
+ import numpy as np
15
+ from pathlib import Path
16
+ import time
17
+
18
def load_safetensors(model_dir):
    """Collect every tensor from all *.safetensors shards under model_dir.

    Returns a dict mapping tensor name -> float32 numpy array.
    """
    import torch; from safetensors.torch import load_file

    out = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard.name}...")
        for name, tensor in load_file(str(shard)).items():
            out[name] = tensor.float().numpy()
    return out
29
+
30
def quantize_row_ternary(row, alpha=0.7):
    """Quantize a single row to ternary {-1, 0, +1}. Vectorized bitpacking.

    A weight maps to +1/-1 when |w| >= alpha * mean(|row|), else to 0.

    Returns (pos_bits, neg_bits, scale): two little-endian uint64 bitplane
    arrays of ceil(len(row)/64) chunks (bit j of chunk c = column c*64+j)
    and the float32 reconstruction scale (mean |w| over surviving weights,
    1.0 when the whole row quantizes to zero).

    Fix: the previous version had a duplicated, unreachable return statement.
    """
    row = row.astype(np.float32)
    mean_abs = np.mean(np.abs(row))
    threshold = alpha * mean_abs

    pos = row >= threshold
    neg = row <= -threshold

    nz_mask = pos | neg
    scale = np.mean(np.abs(row[nz_mask])) if nz_mask.any() else np.float32(1.0)

    # Pad to multiple of 64 so the row packs into whole uint64 chunks
    in_dim = len(row)
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros(pad, dtype=bool)])
        neg = np.concatenate([neg, np.zeros(pad, dtype=bool)])

    # Vectorized bitpack: reshape to [chunks, 64], multiply by bit positions, sum
    pos_r = pos.reshape(-1, 64).astype(np.uint64)
    neg_r = neg.reshape(-1, 64).astype(np.uint64)
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))
    pos_bits = np.bitwise_or.reduce(pos_r * bit_positions, axis=1)
    neg_bits = np.bitwise_or.reduce(neg_r * bit_positions, axis=1)

    return pos_bits, neg_bits, np.float32(scale)
59
+
60
def quantize_weight_matrix(weight, alpha=0.7):
    """Quantize entire weight matrix [out_dim, in_dim] to ternary. Fully vectorized.

    A weight survives as +1/-1 when |w| >= alpha * mean(|row|), else 0.

    Returns (pos_bits, neg_bits, scales, sparsity):
      pos_bits/neg_bits: [out_dim, ceil(in_dim/64)] uint64 bitplanes
        (bit j of chunk c = column c*64+j),
      scales: [out_dim] float32 per-row reconstruction scale,
      sparsity: fraction of weights quantized to zero.

    Fix: the per-row scale computation was a Python loop over out_dim; it is
    now vectorized (sum/count over the nonzero mask) with identical values.
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape

    # Per-row thresholds
    row_means = np.mean(np.abs(w), axis=1, keepdims=True)
    thresholds = alpha * row_means

    pos = w >= thresholds  # [out_dim, in_dim]
    neg = w <= -thresholds

    # Per-row scales: mean |w| over surviving weights, vectorized.
    # Rows with no survivors fall back to 1.0 (counts clamp avoids /0).
    nz = pos | neg
    counts = nz.sum(axis=1)
    sums = np.where(nz, np.abs(w), 0.0).sum(axis=1)
    scales = np.where(counts > 0, sums / np.maximum(counts, 1), 1.0).astype(np.float32)

    # Sparsity
    total = out_dim * in_dim
    sparsity = 1.0 - np.sum(nz) / total

    # Pad to multiple of 64
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros((out_dim, pad), dtype=bool)], axis=1)
        neg = np.concatenate([neg, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    padded_dim = pos.shape[1]
    chunks = padded_dim // 64

    # Vectorized bitpacking for entire matrix at once
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))  # [64]

    pos_r = pos.reshape(out_dim, chunks, 64).astype(np.uint64)  # [out, chunks, 64]
    neg_r = neg.reshape(out_dim, chunks, 64).astype(np.uint64)

    all_pos = np.bitwise_or.reduce(pos_r * bit_positions, axis=2)  # [out, chunks]
    all_neg = np.bitwise_or.reduce(neg_r * bit_positions, axis=2)

    return all_pos, all_neg, scales, sparsity
105
+
106
def save_ternary_model(tensors, output_dir, alpha=0.7):
    """Convert and save full model to ternary format.

    Linear projection weights are ternarized into .pos/.neg bitplanes plus
    per-row .scales; everything else (embeddings, norms, biases, lm_head)
    is written as raw .fp16 files.  A config.json and manifest.json are
    emitted alongside so the inference engine can reload the layout.

    NOTE(review): the config below is hard-coded for
    DeepSeek-R1-Distill-Qwen-1.5B — confirm before reusing on other models.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "alpha": alpha,
    }

    # Identify which tensors to ternarize vs keep as-is
    ternary_keys = []  # Linear weights to ternarize
    keep_keys = []     # Embeddings, norms, biases to keep as FP16

    for key in tensors:
        if any(p in key for p in ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                                  'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                                  'down_proj.weight']):
            ternary_keys.append(key)
        else:
            keep_keys.append(key)

    print(f"\nTernary layers: {len(ternary_keys)}")
    print(f"FP16 layers: {len(keep_keys)}")

    # Save config
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    # Save ternary weights, tracking byte totals for the summary
    total_ternary_bytes = 0
    total_original_bytes = 0

    for key in ternary_keys:
        w = tensors[key].astype(np.float32)
        out_dim, in_dim = w.shape
        total_original_bytes += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        # Save as binary (dots in tensor names become underscores on disk)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        ternary_bytes = pos.nbytes + neg.nbytes + scales.nbytes
        total_ternary_bytes += ternary_bytes
        ratio = w.nbytes / ternary_bytes

        print(f" {key}: {w.shape} -> ternary ({ternary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compression, {sparsity:.1%} sparse, {dt:.1f}s)")

    # Save FP16 weights
    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Save tensor manifest (shapes per storage class, for the loader)
    manifest = {
        "ternary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_ternary_bytes + total_fp16_bytes
    orig_bytes = total_original_bytes + total_fp16_bytes
    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Ternary linear weights: {total_ternary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Compression vs FP32: {orig_bytes/total_bytes:.1f}x")
193
+
194
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [alpha]
    import sys
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = argv[2] if len(argv) > 2 else "deepseek-r1-1.5b-ternary"
    alpha = float(argv[3]) if len(argv) > 3 else 0.7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to ternary (alpha={alpha})...")
    save_ternary_model(tensors, output_dir, alpha)
    print("Done!")
convert_fast.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ FAST proper unary converter — vectorized bitpacking via numpy.
4
+
5
+ Instead of iterating columns one at a time, processes plane-by-plane
6
+ with vectorized comparisons, then packs to uint64 using np.packbits.
7
+
8
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
9
+ """
10
+
11
+ import torch, json, os, sys, gc, shutil
12
+ from safetensors import safe_open
13
+ import numpy as np
14
+
15
+
16
def pack_bits_to_uint64(bool_matrix):
    """
    Pack [rows, cols] boolean → [rows, chunks] uint64
    where chunks = ceil(cols/64).

    Bit j of element (r, c) corresponds to column c*64+j.
    Uses little-endian bit ordering within each uint64.

    Implementation: np.packbits(bitorder='little') followed by a uint64
    reinterpretation — one C-level pass instead of the previous
    64-iteration Python shift loop.  Assumes a little-endian host (the
    same assumption the log-unary converter already makes).
    """
    rows, cols = bool_matrix.shape
    chunks = (cols + 63) // 64

    # Pad cols to a multiple of 64 so each row is a whole number of chunks
    if cols % 64:
        padded = np.zeros((rows, chunks * 64), dtype=np.uint8)
        padded[:, :cols] = bool_matrix.astype(np.uint8)
    else:
        padded = np.ascontiguousarray(bool_matrix, dtype=np.uint8)

    # [rows, chunks*8] uint8 bytes, LSB-first → view as [rows, chunks] uint64
    packed = np.packbits(padded, axis=1, bitorder="little")
    return packed.view(np.uint64)
44
+
45
+
46
def encode_fast(weight_f32_np, quantum, K):
    """
    Fast vectorized proper unary (thermometer) encoding.

    weight_f32_np: [rows, cols] numpy float32
    Returns: sign [rows, chunks] uint64, slots [K, rows, chunks] uint64, clip_count
    """
    rows, cols = weight_f32_np.shape
    n_chunks = (cols + 63) // 64

    # Integer magnitude per weight, counted then clipped to K levels.
    inv_q = 1.0 / quantum
    mags = np.round(np.abs(weight_f32_np) * inv_q).astype(np.int32)
    clip_count = int(np.sum(mags > K))
    mags = np.clip(mags, 0, K)

    # One sign bitplane: bit set where the weight is negative.
    sign_packed = pack_bits_to_uint64(weight_f32_np < 0)

    # Thermometer planes: plane p is set wherever magnitude exceeds p.
    slots_packed = np.zeros((K, rows, n_chunks), dtype=np.uint64)
    for p in range(K):
        slots_packed[p] = pack_bits_to_uint64(mags > p)
        if (p + 1) % 8 == 0 or p == K - 1:
            print(f" plane {p+1}/{K}", end="\r", flush=True)

    print(f" {K}/{K} planes done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
77
+
78
+
79
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert a safetensors checkpoint to the proper-unary on-disk format.

    Two passes over the shards: first a sampling scan to pick a global
    quantum (clip_pct percentile of sampled |w| divided by K), then the
    actual encode — 2D non-norm/non-embed tensors become .usign/.uslots
    bitplane files, everything else raw .fp16.  Resumable: existing output
    files are skipped.  Writes manifest.json and copies tokenizer/config.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('model_type', '?')}")
    print(f" Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Index: sharded checkpoints carry a weight_map; single-file ones don't
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shards = sorted(set(index["weight_map"].values()))
        weight_map = index["weight_map"]
    else:
        shards = ["model.safetensors"]
        weight_map = None

    # Scan for quantum: sample 2000 |w| values per linear tensor
    print("\nScanning weights...")
    all_abs = []
    linear_names = []
    global_max = 0.0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        print(f" {shard}...")
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                t = f.get_tensor(name).float()
                # heuristic: 2D and neither a norm nor an embedding = linear
                if t.dim() == 2 and "norm" not in name and "embed" not in name:
                    linear_names.append(name)
                    am = t.abs().max().item()
                    if am > global_max: global_max = am
                    idx = torch.randint(0, t.numel(), (2000,))
                    all_abs.append(t.flatten()[idx].abs())

    # quantum = step size so that the clip_pct percentile maps to level K
    all_abs_t = torch.cat(all_abs)
    clip_val = torch.quantile(all_abs_t, clip_pct / 100.0).item()
    quantum = clip_val / K

    print(f"\n Absmax={global_max:.6f} P{clip_pct}={clip_val:.6f}")
    print(f" K={K} quantum={quantum:.8f}")

    # Report the magnitude distribution the chosen quantum induces
    mags = (all_abs_t / quantum).round().clamp(0, K)
    print(f" Mean mag={mags.mean():.1f} Median={mags.median():.1f} Zero={100*(mags==0).float().mean():.1f}% Clipped={100*(mags==K).float().mean():.1f}%")

    del all_abs, all_abs_t, mags
    gc.collect()

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    total_unary = 0
    total_fp16 = 0
    total_clip = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)

        # Get linear names in this shard
        shard_lins = [n for n in linear_names if (weight_map or {}).get(n, "model.safetensors") == shard]
        print(f"\n{shard}: {len(shard_lins)} linear layers")

        with safe_open(path, framework="pt") as f:
            # Non-linear → FP16
            for name in f.keys():
                if name in linear_names:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                # NOTE(review): on resume, pre-existing .fp16 files are not
                # re-registered in the manifest/byte totals — confirm intended.
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half().numpy()
                    t.view(np.uint16).tofile(out_path)
                    total_fp16 += os.path.getsize(out_path)
                    manifest["fp16"].append(name)
                    print(f" FP16: {name} {t.shape}")

            # Linear → proper unary
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                # Resume support: reuse files already written by a prior run
                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    t_shape = list(f.get_tensor(name).shape)
                    manifest["unary"][name] = t_shape
                    total_unary += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f" Skip: {name}")
                    continue

                t = f.get_tensor(name).float().numpy()
                rows, cols = t.shape
                print(f" {name} [{rows}x{cols}]", flush=True)

                sign_p, slots_p, clip_c = encode_fast(t, quantum, K)
                total_clip += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                mb = (s_sz + sl_sz) / 1e6
                print(f" → {mb:.1f} MB ({s_sz//1024}KB sign + {sl_sz//1024}KB slots)")

                # large ndarrays: drop references before the next tensor
                del t, sign_p, slots_p
                gc.collect()

    # Copy tokenizer/config files so the output dir is self-contained
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    total = total_unary + total_fp16
    print(f"\n{'='*60}")
    print(f"DONE: {done} layers, quantum={quantum:.8f}, K={K}")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e6:.1f} MB")
    print(f" Total: {total/1e9:.2f} GB (vs ~7.6 GB BF16 = {total/7.6e9:.1f}x)")
    print(f" Clipped: {total_clip} values")
    print(f"{'='*60}")
+
221
if __name__ == "__main__":
    # CLI: convert_fast.py [model_dir] [output_dir] [K] [clip_pct]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    clip = float(argv[4]) if len(argv) > 4 else 99.9
    convert(model_dir, output_dir, K=K, clip_pct=clip)
convert_log_unary.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Log-unary converter.
4
+ Instead of thermometer (plane p = mag > p), uses binary decomposition
5
+ (plane p = bit p of magnitude). Fewer planes, wider dynamic range.
6
+
7
+ 3 log-planes: 9 levels (-4 to +4), storage = 3 bitplanes
8
+ vs 7 linear planes: 15 levels (-7 to +7), storage = 7 bitplanes
9
+
10
+ 4 log-planes: 17 levels (-8 to +8), storage = 4 bitplanes <-- sweet spot
11
+ 5 log-planes: 33 levels (-16 to +16), storage = 5 bitplanes
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+ import numpy as np
16
+ import os, sys, json, time, gc
17
+
18
def quantize_log_unary(w_fp32, n_planes):
    """Quantize weight matrix to log-unary format (binary magnitude planes).

    Each row is scaled so its abs-max maps to 2**n_planes - 1; plane p then
    stores bit p of the integer magnitude, and a separate bitplane holds
    the signs.  Returns (sign [rows, chunks] uint64,
    planes [n_planes, rows, chunks] uint64, scales [rows] float32).
    """
    out_dim, in_dim = w_fp32.shape
    top = (1 << n_planes) - 1  # largest representable magnitude

    # Per-row scale from the row's absolute maximum (1.0 for all-zero rows)
    row_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.flatten() / top).astype(np.float32)

    # Signed integer levels in [-top, top]
    q = np.clip(np.round(w_fp32 / row_max * top), -top, top).astype(np.int32)
    neg = q < 0
    mag = np.abs(q)

    # Pad columns out to whole 64-bit chunks
    chunks = (in_dim + 63) // 64
    width = chunks * 64
    if width > in_dim:
        neg = np.pad(neg, ((0, 0), (0, width - in_dim)), constant_values=False)
        mag = np.pad(mag, ((0, 0), (0, width - in_dim)), constant_values=0)

    def _pack(bits):
        # LSB-first byte packing reinterpreted as little-endian uint64
        b = np.packbits(bits.astype(np.uint8), axis=1, bitorder='little')
        return b.view(np.uint64)[:, :chunks]

    sign_u64 = _pack(neg)

    planes = np.empty((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        planes[p] = _pack((mag >> p) & 1)  # bit p of every magnitude

    return sign_u64, planes, scales
54
+
55
def convert_model(model_dir, output_dir, n_planes=4):
    """Convert a safetensors checkpoint to log-unary on-disk format.

    2D "proj" weight tensors are encoded via quantize_log_unary into
    .sign/.planes/.scales files; all other tensors are written raw as
    .fp16.  Resumable (existing outputs are skipped), writes manifest.json
    and copies tokenizer/config files into output_dir.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    max_level = (1 << n_planes) - 1  # largest magnitude n_planes can encode

    # Sharded checkpoints list their shards in an index file
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"LOG-UNARY CONVERSION")
    print(f" Model: {n_layers} layers, hidden={hidden}")
    print(f" Log-planes: {n_planes} -> {2*max_level+1} levels (range -{max_level}..+{max_level})")
    print(f" Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers,
                "encoding": "log_unary", "config": config}

    # Progress denominator; 0 when no index is present (weight_map is None)
    total_linear = sum(1 for k in (weight_map or {}) if k.endswith(".weight") and "proj" in k)
    converted = 0

    import torch
    from safetensors import safe_open

    for si, shard in enumerate(shards):
        path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {si+1}/{len(shards)}: {shard} ===")

        with safe_open(path, framework="pt") as f:
            for key in sorted(f.keys()):
                fname = key.replace(".", "_")
                # heuristic: 2D ".weight" tensors under a "proj" module = linear
                is_linear = key.endswith(".weight") and "proj" in key and f.get_tensor(key).dim() == 2

                if is_linear:
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        # resume: keep manifest entry, skip re-encoding
                        manifest["unary"][key] = list(f.get_tensor(key).shape)
                        converted += 1
                        print(f" [SKIP] {key}")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_log_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    manifest["unary"][key] = list(w.shape)
                    converted += 1
                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    print(f" [{converted}/{total_linear}] {key}: {list(w.shape)} "
                          f"-> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")
                    del w, sign, planes, scales
                else:
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        manifest["fp16"][key] = list(f.get_tensor(key).shape)
                        print(f" [SKIP] {key}")
                        continue

                    # write raw fp16 bits (viewed as uint16 for tofile)
                    w = f.get_tensor(key).float().numpy()
                    w_fp16 = w.astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(w.shape)
                    print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del w, w_fp16

        gc.collect()

    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy tokenizer/config so output_dir is self-contained
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir) if f.endswith((".sign",".planes",".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir) if f.endswith(".fp16"))

    print(f"\n=== LOG-UNARY CONVERSION COMPLETE ===")
    print(f" Encoding: {n_planes} log-planes (binary magnitude)")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e9:.2f} GB")
    print(f" Total: {(total_unary+total_fp16)/1e9:.2f} GB")
+ print(f" Total: {(total_unary+total_fp16)/1e9:.2f} GB")
154
+
155
+ if __name__ == "__main__":
156
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-hf"
157
+ output_dir = sys.argv[2] if len(sys.argv) > 2 else "qwen3-4b-log-unary"
158
+ n_planes = int(sys.argv[3]) if len(sys.argv) > 3 else 4
159
+ convert_model(model_dir, output_dir, n_planes)
convert_proper_unary.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Qwen3-4B BF16 safetensors → Proper Unary.
4
+ Reads safetensors raw bytes (no framework dependency for BF16).
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import json, os, sys, gc, shutil, struct, time
9
+
10
class SafeTensorReader:
    """Read safetensors one tensor at a time (memory efficient).

    Parses the 8-byte little-endian header length and the JSON header up
    front, then seeks to each tensor's byte range only when requested.
    Supported payload dtypes: BF16, F16, F32 — all returned as float32.

    Also usable as a context manager so the file handle is released even
    when conversion code raises mid-shard.
    """

    def __init__(self, path):
        self.f = open(path, "rb")
        header_size = struct.unpack("<Q", self.f.read(8))[0]
        self.header = json.loads(self.f.read(header_size).decode("utf-8"))
        self.data_start = 8 + header_size
        # "__metadata__" is file-level bookkeeping, not a tensor entry.
        self._meta = {k: v for k, v in self.header.items() if k != "__metadata__"}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def keys(self):
        """Return the list of tensor names stored in this shard."""
        return list(self._meta.keys())

    def get(self, name):
        """Load one tensor by name and return it as a float32 ndarray.

        Raises KeyError for unknown names and ValueError for dtypes other
        than BF16/F16/F32.
        """
        meta = self._meta[name]
        dtype = meta["dtype"]
        shape = tuple(meta["shape"])
        start, end = meta["data_offsets"]
        self.f.seek(self.data_start + start)
        raw = self.f.read(end - start)

        if dtype == "BF16":
            # BF16 is the top 16 bits of an IEEE-754 float32: widen by
            # shifting into the high half of a uint32 and reinterpret.
            u16 = np.frombuffer(raw, dtype=np.uint16)
            u32 = u16.astype(np.uint32) << 16
            return u32.view(np.float32).reshape(shape)
        elif dtype == "F16":
            return np.frombuffer(raw, dtype=np.float16).reshape(shape).astype(np.float32)
        elif dtype == "F32":
            # copy() detaches from the read-only frombuffer view.
            return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
        else:
            raise ValueError(f"Unknown dtype {dtype}")

    def close(self):
        """Release the underlying file handle."""
        self.f.close()
43
+
44
def encode_proper_unary(weight_f32, K):
    """Encode a 2D float32 matrix to proper unary bitplanes.

    Each row is scaled so its abs-max maps to K; every weight becomes an
    integer magnitude in [0, K] plus a sign bit. Plane s has bit j set
    when |w[row, j]| quantizes to more than s slots.

    Returns:
        sign_bits:   uint64 [rows, chunks]    sign bit per weight (1 = negative)
        slot_planes: uint64 [K, rows, chunks] thermometer-coded magnitude planes
        row_scales:  float32 [rows]           per-row dequantization scale

    Packing is vectorized with np.packbits (little-endian bit order, so
    bit j of chunk c maps to column c*64 + j) instead of a per-column
    Python loop — identical bit layout, orders of magnitude faster.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    pad = chunks * 64 - cols

    row_absmax = np.abs(weight_f32).max(axis=1).astype(np.float32)
    row_absmax = np.maximum(row_absmax, 1e-10)  # guard all-zero rows
    row_scales = (row_absmax / K).astype(np.float32)

    inv_scales = K / row_absmax
    magnitudes = np.clip(
        np.round(np.abs(weight_f32) * inv_scales[:, None]).astype(np.int32), 0, K)

    def _pack(mask):
        # Pack a boolean [rows, cols] mask into little-endian uint64 words.
        if pad:
            mask = np.pad(mask, ((0, 0), (0, pad)))
        bits = np.packbits(mask.astype(np.uint8), axis=1, bitorder='little')
        return np.ascontiguousarray(bits).view(np.uint64)

    # Sign follows the raw float sign, even for weights that quantize to 0.
    sign_bits = _pack(weight_f32 < 0)

    slot_planes = np.zeros((K, rows, chunks), dtype=np.uint64)
    for s in range(K):
        active = magnitudes > s
        if not active.any():
            break  # magnitudes are monotone: no higher plane can be set
        slot_planes[s] = _pack(active)

    return sign_bits, slot_planes, row_scales
79
+
80
def convert_model(model_dir, output_dir, K=32):
    """Convert a safetensors model directory to proper-unary format on disk.

    Linear projection weights (q/k/v/o/gate/up/down) are quantized with
    encode_proper_unary and written as .sign/.slots/.scales files; every
    other tensor is dumped raw as FP16. manifest.json records which keys
    went where.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = json.load(open(os.path.join(model_dir, "config.json")))  # NOTE(review): loaded but never used below

    # Copy tokenizer/config files alongside the converted weights.
    for f in ["config.json", "tokenizer.json", "tokenizer_config.json",
              "special_tokens_map.json", "generation_config.json"]:
        src = os.path.join(model_dir, f)
        if os.path.exists(src):
            shutil.copy2(src, output_dir)

    # Multi-shard models carry an index file mapping keys to shards.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shard_files = sorted(set(index["weight_map"].values()))
    else:
        shard_files = ["model.safetensors"]

    # Substrings identifying the linear projections we quantize.
    linear_names = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]

    manifest = {"K": K, "format": "proper_unary", "unary": {}, "fp16": []}
    total_linear = 0
    total_size = 0

    for shard_name in shard_files:
        shard_path = os.path.join(model_dir, shard_name)
        print(f"\n=== {shard_name} ===", flush=True)

        reader = SafeTensorReader(shard_path)
        print(f" {len(reader.keys())} tensors", flush=True)

        for key in sorted(reader.keys()):
            tensor = reader.get(key)
            fname = key.replace(".", "_")  # flatten dotted key for filesystem

            is_linear = any(ln + ".weight" in key for ln in linear_names)

            if is_linear and tensor.ndim == 2:
                rows, cols = tensor.shape
                t0 = time.time()
                print(f" {key}: {rows}x{cols} K={K}...", end="", flush=True)

                sign_bits, slot_planes, row_scales = encode_proper_unary(tensor, K)
                dt = time.time() - t0

                sign_bits.tofile(os.path.join(output_dir, fname + ".sign"))
                slot_planes.tofile(os.path.join(output_dir, fname + ".slots"))
                row_scales.tofile(os.path.join(output_dir, fname + ".scales"))

                manifest["unary"][key] = [rows, cols]
                sz = sign_bits.nbytes + slot_planes.nbytes + row_scales.nbytes
                total_size += sz
                total_linear += 1

                # ratio vs the original 2-byte (BF16) element size
                ratio = sz / (rows * cols * 2)
                print(f" {sz/1e6:.1f}MB ({ratio:.1f}x) [{dt:.0f}s]", flush=True)

                del sign_bits, slot_planes, row_scales
            else:
                # FP16 passthrough for norms, embeddings, biases, etc.
                t_f16 = tensor.astype(np.float16)
                out_data = t_f16.view(np.uint16)
                out_data.tofile(os.path.join(output_dir, fname + ".fp16"))
                manifest["fp16"].append(key)
                sz = out_data.nbytes
                total_size += sz
                print(f" {key}: {tensor.shape} -> FP16 ({sz/1e6:.1f}MB)", flush=True)
                del t_f16, out_data

            del tensor

        reader.close()
        gc.collect()  # release shard buffers before the next shard

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    print(f"\n{'='*50}", flush=True)
    print(f"DONE: {total_linear} layers, K={K}", flush=True)
    # "~7.6 GB" is the hard-coded size of the Qwen3-4B BF16 checkpoint.
    print(f"Total: {total_size/1e9:.2f} GB (orig ~7.6 GB, ratio {total_size/7.6e9:.1f}x)", flush=True)
159
+
160
if __name__ == "__main__":
    # CLI: convert_proper_unary.py [model_dir] [output_dir] [K]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "/root/ternary_engine/qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "/root/ternary_engine/qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    convert_model(model_dir, output_dir, K)
convert_proper_unary_v2.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PROPER UNARY CONVERTER — Global quantum, torch-based, BF16 support
4
+
5
+ Clips at P99.9 of |weights| instead of absmax to avoid wasting
6
+ quantization range on rare outliers. Values above clip point
7
+ saturate at K (still represented, just capped).
8
+
9
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
10
+ """
11
+
12
+ import torch, json, os, sys, gc, shutil
13
+ from safetensors import safe_open
14
+ import numpy as np
15
+
16
def scan_all_linears(model_dir):
    """Scan linear layers, return global stats.

    Walks every shard, collects the names of 2D non-norm/non-embedding
    tensors, tracks the global abs-max and a random 2000-value |weight|
    sample per layer for percentile estimation.

    Returns (global_max, all_abs_samples_tensor, linear_names, shard_list).
    """
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shards = sorted(set(index["weight_map"].values()))
    else:
        shards = ["model.safetensors"]

    samples = []
    linear_names = []
    global_max = 0.0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        print(f" Scanning {shard}...")
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                t = f.get_tensor(name).float()
                # Only 2D projection matrices; skip norms and embeddings.
                if t.dim() != 2 or "norm" in name or "embed" in name:
                    continue
                linear_names.append(name)
                am = t.abs().max().item()
                global_max = am if am > global_max else global_max
                # Sample 2000 values for distribution
                idx = torch.randint(0, t.numel(), (2000,))
                samples.append(t.flatten()[idx].abs())

    return global_max, torch.cat(samples), linear_names, shards
46
+
47
+
48
def encode_to_proper_unary_torch(weight_f32, quantum, K):
    """
    Encode a [rows, cols] float32 tensor to proper unary.

    Magnitudes are |w| / quantum, rounded and clamped to [0, K]; plane p
    has bit j set when the magnitude of column j exceeds p (thermometer
    code). Sign bits follow the raw float sign.

    Returns:
        sign_packed:  uint64 [rows, chunks]
        slots_packed: uint64 [K, rows, chunks]
        clip_count:   number of pre-clamp values that exceeded K

    Packing is vectorized with np.packbits (little-endian, so bit j of
    chunk c is column c*64 + j) — identical bit layout to the previous
    per-column Python loop, but vastly faster on real layer sizes.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    pad = chunks * 64 - cols

    inv_q = 1.0 / quantum
    magnitudes = (weight_f32.abs() * inv_q).round().long().clamp(0, K)
    signs = weight_f32 < 0
    # Count pre-clamp overflows so callers can report saturation.
    clip_count = int((weight_f32.abs() * inv_q > K).sum().item())

    mags_np = magnitudes.numpy()
    signs_np = signs.numpy()

    def _pack(mask):
        # Pack a boolean [rows, cols] mask into little-endian uint64 words.
        if pad:
            mask = np.pad(mask, ((0, 0), (0, pad)))
        bits = np.packbits(mask.astype(np.uint8), axis=1, bitorder='little')
        return np.ascontiguousarray(bits).view(np.uint64)

    sign_packed = _pack(signs_np)

    slots_packed = np.zeros((K, rows, chunks), dtype=np.uint64)
    for p in range(K):
        active = mags_np > p
        if not active.any():
            break  # magnitudes are monotone: higher planes stay zero
        slots_packed[p] = _pack(active)

    print(f" {cols}/{cols} done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
87
+
88
+
89
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert a model to proper-unary format using one GLOBAL quantum.

    Unlike the per-row variant, the quantization step size is derived from
    the P{clip_pct} percentile of sampled |weights| across ALL linear
    layers; values above the clip point saturate at K. Linear layers are
    written as .usign/.uslots pairs, everything else as raw FP16.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('_name_or_path', config.get('model_type', '?'))}")
    print(f" Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Scan
    print("\nScanning weights...")
    global_max, all_abs, linear_names, shards = scan_all_linears(model_dir)

    # Pick quantum from clip percentile (percentile of the SAMPLED values,
    # not the full weight population).
    clip_val = torch.quantile(all_abs, clip_pct / 100.0).item()
    quantum = clip_val / K

    print(f"\n Global absmax: {global_max:.6f}")
    print(f" P{clip_pct} clip: {clip_val:.6f}")
    print(f" K = {K}")
    print(f" Quantum = {quantum:.8f}")
    print(f" Values > clip ({clip_pct}%): saturate at K={K}")

    # Distribution with chosen quantum (diagnostics only)
    mags = (all_abs / quantum).round().clamp(0, K)
    print(f"\n Mean magnitude: {mags.mean():.1f} slots")
    print(f" Median: {mags.median():.1f} slots")
    print(f" Zero fraction: {100*(mags==0).float().mean():.1f}%")
    print(f" At K (clipped): {100*(mags==K).float().mean():.1f}%")
    print(f" Unique levels: {len(mags.unique())} / {K+1}")

    # Memory estimate
    # Per linear: sign=rows*chunks*8 bytes, slots=K*rows*chunks*8 bytes
    # Approx: (K+1) bits per element vs 16 bits BF16
    bits_per_elem = K + 1  # K slot bits + 1 sign bit (stored in uint64 chunks)
    ratio = bits_per_elem / 16.0
    print(f"\n Bits per weight: {bits_per_elem}")
    print(f" vs BF16 (16 bit): {ratio:.1f}x")
    print(f" Original: ~7.6 GB → Estimated: ~{7.6 * ratio:.1f} GB")

    # Build weight map (key -> shard filename) when the model is sharded.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        weight_map = json.load(open(index_path))["weight_map"]
    else:
        weight_map = None

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    # Group linears by shard
    shard_linears = {}
    for name in linear_names:
        shard = weight_map[name] if weight_map else "model.safetensors"
        shard_linears.setdefault(shard, []).append(name)

    total_unary_bytes = 0
    total_fp16_bytes = 0
    total_clipped = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        shard_lins = shard_linears.get(shard, [])
        print(f"\nProcessing {shard} ({len(shard_lins)} linear layers)...")

        with safe_open(path, framework="pt") as f:
            all_keys = list(f.keys())

            # Non-linear weights → FP16
            for name in all_keys:
                if name in linear_names:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                # NOTE(review): pre-existing .fp16 files are skipped AND
                # excluded from the totals/manifest below — confirm this
                # resume behavior is intended.
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half()
                    t.numpy().view(np.uint16).tofile(out_path)
                    sz = os.path.getsize(out_path)
                    total_fp16_bytes += sz
                    manifest["fp16"].append(name)
                    print(f" FP16: {name} {list(t.shape)} ({sz//1024}KB)")

            # Linear weights → proper unary
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                # Resume support: re-use files from a previous partial run.
                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    t = f.get_tensor(name)
                    manifest["unary"][name] = list(t.shape)
                    total_unary_bytes += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f" Skip: {name}")
                    continue

                t = f.get_tensor(name).float()
                rows, cols = t.shape
                print(f" Converting: {name} [{rows}x{cols}]...", flush=True)

                sign_p, slots_p, clip_c = encode_to_proper_unary_torch(t, quantum, K)
                total_clipped += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary_bytes += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                print(f" sign={s_sz//1024}KB slots={sl_sz//1024}KB total={( s_sz+sl_sz)//1024//1024}MB")

                # Release the big buffers before the next layer.
                del t, sign_p, slots_p
                gc.collect()

    # Copy config and tokenizer
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    manifest_path = os.path.join(output_dir, "manifest.json")
    json.dump(manifest, open(manifest_path, "w"), indent=2)

    total = total_unary_bytes + total_fp16_bytes
    print(f"\n{'='*60}")
    print(f"PROPER UNARY CONVERSION COMPLETE")
    print(f"{'='*60}")
    print(f" Quantum: {quantum:.8f}")
    print(f" K: {K}")
    print(f" Clip at P{clip_pct}: {clip_val:.6f}")
    print(f" Linear layers: {done}")
    print(f" Clipped vals: {total_clipped}")
    print(f" Unary: {total_unary_bytes/1e9:.2f} GB")
    print(f" FP16 (norms): {total_fp16_bytes/1e6:.1f} MB")
    print(f" Total: {total/1e9:.2f} GB")
    print(f" Original BF16: ~7.6 GB")
    print(f" Ratio: {total/7.6e9:.1f}x")
    print(f" Output dir: {output_dir}")
239
+
240
+
241
if __name__ == "__main__":
    # CLI: convert_proper_unary_v2.py [model_dir] [output_dir] [K] [clip_pct]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    clip = float(argv[4]) if len(argv) > 4 else 99.9

    convert(model_dir, output_dir, K=K, clip_pct=clip)
convert_qwen3.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unary converter for Qwen3 models.
4
+ Converts safetensors to unary bitplane format.
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import os, sys, json, time
9
+
10
def load_safetensors_torch(model_dir):
    """Load all safetensors shards using torch backend"""
    import torch
    from safetensors import safe_open

    shard_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.safetensors'))
    print(f"Loading {len(shard_files)} shard(s)...")

    weights = {}
    for sf in shard_files:
        print(f" {sf}...")
        with safe_open(os.path.join(model_dir, sf), framework="pt") as f:
            for key in f.keys():
                # BF16/F16 tensors are widened to FP32 numpy arrays.
                weights[key] = f.get_tensor(key).float().numpy()
    return weights
27
+
28
def quantize_unary_vectorized(w_fp32, n_planes):
    """Quantize a weight matrix to unary bitplane format using vectorized numpy"""
    out_dim, in_dim = w_fp32.shape
    max_val = n_planes  # integer levels span [-n_planes, +n_planes]

    # Per-row scaling so the largest |weight| in a row maps to max_val.
    abs_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    abs_max = np.where(abs_max == 0, 1.0, abs_max)
    rounded = np.clip(np.round(w_fp32 / abs_max * max_val), -max_val, max_val).astype(np.int32)
    scales = (abs_max.flatten() / max_val).astype(np.float32)

    signs = rounded < 0  # True = negative
    magnitudes = np.abs(rounded)  # 0 .. n_planes

    chunks = (in_dim + 63) // 64
    padded = chunks * 64
    pad = padded - in_dim
    if pad:
        signs = np.pad(signs, ((0, 0), (0, pad)), constant_values=False)
        magnitudes = np.pad(magnitudes, ((0, 0), (0, pad)), constant_values=0)

    # Little-endian bit packing: bit j of chunk c is input column c*64 + j.
    sign_u64 = np.packbits(signs.astype(np.uint8), axis=1,
                           bitorder='little').view(np.uint64)[:, :chunks]

    # Thermometer planes: plane p has the bit set wherever magnitude > p.
    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        packed = np.packbits((magnitudes > p).astype(np.uint8), axis=1, bitorder='little')
        plane_bits[p] = packed.view(np.uint64)[:, :chunks]

    return sign_u64, plane_bits, scales
67
+
68
def convert_model(model_dir, output_dir, n_planes=7):
    """Convert a Qwen3 model to unary format.

    Loads the whole checkpoint into RAM (see convert_qwen3_v2 for the
    shard-at-a-time variant), quantizes every 2D *proj* weight with
    quantize_unary_vectorized into .sign/.planes/.scales files, and stores
    everything else as FP16. Writes manifest.json and copies config.json.

    Fixes vs the previous revision: removed a dead, expensive per-layer
    `sparsity` computation (its result was never used and its re-scaling
    could emit divide warnings), and linear-key membership now uses a set
    instead of an O(n) list scan.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load config
    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")

    # Load weights
    weights = load_safetensors_torch(model_dir)
    print(f"Loaded {len(weights)} tensors")

    # Identify linear layers (2D weight matrices in attn/mlp)
    linear_keys = [k for k in weights if k.endswith(".weight") and weights[k].ndim == 2
                   and ("proj" in k)]
    linear_set = set(linear_keys)  # O(1) membership checks below

    manifest = {"unary": {}, "fp16": {}}

    # Convert linear layers to unary
    total = len(linear_keys)
    for idx, key in enumerate(sorted(linear_keys)):
        w = weights[key]
        t0 = time.time()
        sign, planes, scales = quantize_unary_vectorized(w, n_planes)
        dt = time.time() - t0

        # Flatten name for filesystem
        fname = key.replace(".", "_")
        np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
        np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
        np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

        manifest["unary"][key] = list(w.shape)
        orig_mb = w.nbytes / 1e6
        comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
        print(f" [{idx+1}/{total}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

    # Save FP16 weights (norms, embeddings, QK-norms)
    fp16_keys = [k for k in weights if k not in linear_set]
    for key in sorted(fp16_keys):
        w = weights[key]
        fname = key.replace(".", "_")
        w_fp16 = w.astype(np.float16)
        w_fp16.view(np.uint16).tofile(os.path.join(output_dir, f"{fname}.fp16"))
        manifest["fp16"][key] = list(w.shape)
        print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")

    # Save manifest and config
    manifest["n_planes"] = n_planes
    manifest["n_layers"] = n_layers
    manifest["config"] = config
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy config
    import shutil
    shutil.copy(os.path.join(model_dir, "config.json"), os.path.join(output_dir, "config.json"))

    # Size summary
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))
    orig_total = sum(w.nbytes for w in weights.values())

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Original FP32: {orig_total/1e9:.2f} GB")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
    print(f"Compression: {orig_total/(total_unary+total_fp16):.1f}x")
144
+
145
if __name__ == "__main__":
    # CLI: convert_qwen3.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-thinking-unary"
    n_planes = int(argv[3]) if len(argv) > 3 else 7
    convert_model(model_dir, output_dir, n_planes)
convert_qwen3_v2.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Memory-efficient unary converter for Qwen3.
4
+ Processes one safetensors shard at a time to avoid OOM.
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import os, sys, json, time, gc
9
+
10
def quantize_unary(w_fp32, n_planes):
    """Quantize weight matrix to unary bitplane format"""
    out_dim, in_dim = w_fp32.shape
    max_val = n_planes

    # Per-row abs-max scaling; zero rows get a dummy scale of 1 to avoid /0.
    abs_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    abs_max = np.where(abs_max == 0, 1.0, abs_max)
    rounded = np.clip(np.round(w_fp32 / abs_max * max_val),
                      -max_val, max_val).astype(np.int32)

    scales = (abs_max.flatten() / max_val).astype(np.float32)
    signs = rounded < 0
    magnitudes = np.abs(rounded)

    chunks = (in_dim + 63) // 64
    padded = chunks * 64
    extra = padded - in_dim
    if extra:
        signs = np.pad(signs, ((0, 0), (0, extra)), constant_values=False)
        magnitudes = np.pad(magnitudes, ((0, 0), (0, extra)), constant_values=0)

    # Little-endian bit order: bit j of chunk c maps to column c*64 + j.
    sign_u64 = np.packbits(signs.astype(np.uint8), axis=1,
                           bitorder='little').view(np.uint64)[:, :chunks]

    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        packed = np.packbits((magnitudes > p).astype(np.uint8),
                             axis=1, bitorder='little')
        plane_bits[p] = packed.view(np.uint64)[:, :chunks]

    return sign_u64, plane_bits, scales
41
+
42
def convert_model(model_dir, output_dir, n_planes=7):
    """Shard-at-a-time unary conversion (memory-efficient variant).

    Iterates safetensors shards one by one, quantizing *proj* weights to
    unary bitplanes and storing everything else as FP16. Already-converted
    files on disk are skipped, so interrupted runs can resume.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]

    # Load index to know which keys are in which shard
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        # Single shard
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")
    print(f"Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers, "config": config}
    total_converted = 0
    total_linear = 0

    # Count total linear layers (only possible when an index file exists;
    # otherwise the progress denominator stays 0)
    if weight_map:
        total_linear = sum(1 for k in weight_map if k.endswith(".weight") and "proj" in k)
    print(f"Total linear layers to convert: {total_linear}")

    import torch
    from safetensors import safe_open

    for shard_idx, shard in enumerate(shards):
        shard_path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {shard_idx+1}/{len(shards)}: {shard} ===")

        with safe_open(shard_path, framework="pt") as f:
            keys = list(f.keys())
            print(f" {len(keys)} tensors in shard")

            for key in sorted(keys):
                fname = key.replace(".", "_")
                # NOTE(review): get_tensor() here materializes the full
                # tensor just to check its rank — a slice API would avoid
                # the extra load.
                is_linear = key.endswith(".weight") and "proj" in key and f.get_tensor(key).dim() == 2

                if is_linear:
                    # Check if already converted (resume support)
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        w = f.get_tensor(key)
                        manifest["unary"][key] = list(w.shape)
                        total_converted += 1
                        print(f" [SKIP] {key} already converted")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    total_converted += 1
                    manifest["unary"][key] = list(w.shape)
                    print(f" [{total_converted}/{total_linear}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

                    del w, sign, planes, scales
                else:
                    # FP16 weight (norms, embeddings, etc)
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        w = f.get_tensor(key)
                        manifest["fp16"][key] = list(w.shape)
                        print(f" [SKIP] {key} already saved")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    w_fp16 = w.astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(w.shape)
                    print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del w, w_fp16

        # Force GC between shards
        gc.collect()
        print(f" Shard done, memory freed")

    # Save manifest
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy config
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    # Summary
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
156
+
157
if __name__ == "__main__":
    # CLI: convert_qwen3_v2.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-thinking-unary"
    n_planes = int(argv[3]) if len(argv) > 3 else 7
    convert_model(model_dir, output_dir, n_planes)
deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales ADDED
Binary file (35.8 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16 ADDED
Binary file (3.07 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales ADDED
Binary file (1.02 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg ADDED
Binary file (49.2 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ E�=�r�=ě�=�6�=e��=)�= ڍ=Q�=룋=�d�=n��=L��=q2�=
2
+ �=D�=��=�=S�=���=�A�=�=\%�=�F�= J�=��=6v�=bՃ=a
3
+ �=���=ƞn=�P�=~j�=��=�*�=ő=�(�=���=J6�=N+�=Cs�=�S�=Uu�=Vԋ=�і=� �=�ɓ=Zړ=^;�=�1�=s�=޷�=Uj�= �v=ڱ�=���=b�=c@�=W��=C)�=�܋=�,�=ᵓ=Z~�=�`�=��=i��=0W�=�.�=���=�َ=S��=�H�=0�=���=�0�=6'�=��=�<�=沕=��=���=�Љ=ޏ=H5�=f�~=z;�=�u�=���=�"�=��=�i�=+~�<�-�=�g�=�)�=�u�=��=��=ܨ�=�k�=~[�=�=�l�=���=G(�=��=�/�=��=6Y�=1 �=�= Ջ=2�=��=蓏=)G�=.l�=N�=:��=.��=���=���=WA�=�>�=ѡ�=�ӏ=��=U�=㟙=q:�=F�=T��=���=o�y=�Ѝ=+
deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos ADDED
Binary file (49.2 kB). View file
 
inference.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Ternary Transformer Inference Engine
4
+
5
+ Full Qwen2 architecture inference using ternary (1.58-bit) linear layers
6
+ with AVX-512 optimized kernels. Zero multiplications in linear layers.
7
+
8
+ Architecture: DeepSeek-R1-Distill-Qwen-1.5B
9
+ - 28 layers, hidden=1536, intermediate=8960
10
+ - GQA: 12 heads, 2 KV heads, head_dim=128
11
+ - SwiGLU MLP, RoPE, RMSNorm
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import ctypes
19
+ import numpy as np
20
+ from pathlib import Path
21
+ import time
22
+
23
+ # ============================================================
24
+ # Load C kernel
25
+ # ============================================================
26
def load_kernel(so_path="ternary_kernel.so"):
    """Load the compiled AVX-512 kernel shared library and declare the
    ctypes signatures for every exported function.

    Args:
        so_path: path to the compiled shared object.

    Returns:
        The ctypes.CDLL handle with restype/argtypes configured for
        ternary_matvec_avx512, rmsnorm_avx512, silu_avx512,
        elemwise_mul_avx512, softmax and apply_rope.
    """
    lib = ctypes.CDLL(so_path)

    # ternary_matvec_avx512: y = W @ x with bit-packed ternary weights
    lib.ternary_matvec_avx512.restype = None
    lib.ternary_matvec_avx512.argtypes = [
        ctypes.c_void_p,  # pos_bits
        ctypes.c_void_p,  # neg_bits
        ctypes.c_void_p,  # scales
        ctypes.c_void_p,  # x
        ctypes.c_void_p,  # y
        ctypes.c_int,     # out_dim
        ctypes.c_int,     # in_dim
    ]

    # rmsnorm
    lib.rmsnorm_avx512.restype = None
    lib.rmsnorm_avx512.argtypes = [
        ctypes.c_void_p,  # x
        ctypes.c_void_p,  # weight
        ctypes.c_void_p,  # y
        ctypes.c_int,     # dim
        ctypes.c_float,   # eps
    ]

    # silu (in-place over a float32 buffer)
    lib.silu_avx512.restype = None
    lib.silu_avx512.argtypes = [ctypes.c_void_p, ctypes.c_int]

    # elemwise_mul
    lib.elemwise_mul_avx512.restype = None
    lib.elemwise_mul_avx512.argtypes = [
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int
    ]

    # softmax (in-place)
    lib.softmax.restype = None
    lib.softmax.argtypes = [ctypes.c_void_p, ctypes.c_int]

    # rope
    lib.apply_rope.restype = None
    lib.apply_rope.argtypes = [
        ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_int, ctypes.c_float
    ]

    return lib
74
+
75
+ # ============================================================
76
+ # Ternary Linear Layer
77
+ # ============================================================
78
class TernaryLinear:
    """A ternary (1.58-bit) linear layer backed by the C AVX-512 kernel.

    Weights live in two packed uint64 bitplanes (a +1 mask and a -1 mask,
    64 input columns per word) plus one float32 scale per output row.
    """

    def __init__(self, pos_bits, neg_bits, scales, out_dim, in_dim, kernel):
        self.pos = pos_bits      # uint64 bitplane of +1 weights, contiguous
        self.neg = neg_bits      # uint64 bitplane of -1 weights, contiguous
        self.scales = scales     # float32[out_dim] per-row scales
        self.out_dim = out_dim
        self.in_dim = in_dim
        self.kernel = kernel     # ctypes handle to the compiled kernel

    def forward(self, x):
        """Compute y = W @ x. x: float32[in_dim] -> float32[out_dim]."""
        result = np.zeros(self.out_dim, dtype=np.float32)
        # The C kernel writes the output in place through raw pointers.
        self.kernel.ternary_matvec_avx512(
            self.pos.ctypes.data,
            self.neg.ctypes.data,
            self.scales.ctypes.data,
            x.ctypes.data,
            result.ctypes.data,
            self.out_dim,
            self.in_dim,
        )
        return result
100
+
101
+ # ============================================================
102
+ # KV Cache
103
+ # ============================================================
104
class KVCache:
    """Pre-allocated per-layer key/value cache for autoregressive decoding.

    `seq_len` counts *committed* positions; `append` writes at the current
    (uncommitted) position and `advance` commits it.
    """

    def __init__(self, n_layers, n_kv_heads, head_dim, max_seq=4096):
        self.n_layers = n_layers
        self.max_seq = max_seq
        shape = (max_seq, n_kv_heads, head_dim)
        # One full-size buffer per layer, allocated up front.
        self.k = [np.zeros(shape, dtype=np.float32) for _ in range(n_layers)]
        self.v = [np.zeros(shape, dtype=np.float32) for _ in range(n_layers)]
        self.seq_len = 0

    def append(self, layer, k, v):
        """Store k, v ([n_kv_heads, head_dim]) at the current position."""
        slot = self.seq_len
        self.k[layer][slot] = k
        self.v[layer][slot] = v

    def get(self, layer):
        """Return K, V history including the just-appended position.

        Shapes: [seq_len + 1, n_kv_heads, head_dim].
        """
        end = self.seq_len + 1
        return self.k[layer][:end], self.v[layer][:end]

    def advance(self):
        """Commit the current position."""
        self.seq_len += 1
125
+
126
+ # ============================================================
127
+ # Model
128
+ # ============================================================
129
+ class TernaryQwen:
130
    def __init__(self, model_dir, kernel):
        """Load config.json + manifest.json from `model_dir`, then all weights.

        model_dir: directory produced by the conversion script (contains
                   config.json, manifest.json and the .pos/.neg/.scales/.fp16
                   tensor files).
        kernel:    ctypes handle returned by load_kernel().
        """
        self.kernel = kernel
        self.model_dir = model_dir

        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)
        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)

        # Architecture hyper-parameters (comments show the 1.5B defaults).
        self.hidden = self.config["hidden_size"]  # 1536
        self.inter = self.config["intermediate_size"]  # 8960
        self.n_heads = self.config["num_attention_heads"]  # 12
        self.n_kv = self.config["num_key_value_heads"]  # 2
        self.head_dim = self.config["head_dim"]  # 128
        self.n_layers = self.config["num_hidden_layers"]  # 28
        self.vocab = self.config["vocab_size"]  # 151936
        self.rope_theta = self.config["rope_theta"]
        self.eps = self.config["rms_norm_eps"]

        print(f"Loading ternary model: {self.n_layers} layers, "
              f"hidden={self.hidden}, heads={self.n_heads}/{self.n_kv}")

        t0 = time.time()
        self._load_weights()
        print(f"Model loaded in {time.time()-t0:.1f}s")

        # Print a memory breakdown once everything is resident.
        self._compute_memory()
157
+
158
    def _load_ternary(self, key):
        """Load a ternary linear layer.

        Reads <model_dir>/<key with '.'->'_'>.{pos,neg,scales}; the shape
        [out_dim, in_dim] comes from manifest["ternary"][key]. pos/neg are
        uint64 bitplanes packing 64 input columns per word; scales is one
        float32 per output row.
        """
        prefix = os.path.join(self.model_dir, key.replace(".", "_"))
        shape = self.manifest["ternary"][key]
        out_dim, in_dim = shape
        chunks = (in_dim + 63) // 64  # 64 input columns per uint64 word

        pos = np.fromfile(prefix + ".pos", dtype=np.uint64).reshape(out_dim, chunks)
        neg = np.fromfile(prefix + ".neg", dtype=np.uint64).reshape(out_dim, chunks)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)

        # Make contiguous — the C kernel walks these with raw pointers.
        pos = np.ascontiguousarray(pos)
        neg = np.ascontiguousarray(neg)

        return TernaryLinear(pos, neg, scales, out_dim, in_dim, self.kernel)
174
+
175
+ def _load_fp16(self, key):
176
+ """Load an FP16 tensor."""
177
+ prefix = os.path.join(self.model_dir, key.replace(".", "_"))
178
+ shape = self.manifest["fp16"][key]
179
+ return np.fromfile(prefix + ".fp16", dtype=np.float16).reshape(shape).astype(np.float32)
180
+
181
    def _load_weights(self):
        """Load all weights: embedding, final norm, lm_head, and per-layer tensors.

        Linear projections are ternary; norms/biases/embeddings stay FP16 on
        disk and are widened to FP32 here.
        """
        # Embedding (FP16)
        self.embed = self._load_fp16("model.embed_tokens.weight")  # [vocab, hidden]

        # Final norm
        self.final_norm = self._load_fp16("model.norm.weight")  # [hidden]

        # LM head — check if it exists as ternary or fp16
        if "lm_head.weight" in self.manifest.get("ternary", {}):
            self.lm_head = self._load_ternary("lm_head.weight")
            self.lm_head_ternary = True
        elif "lm_head.weight" in self.manifest.get("fp16", {}):
            self.lm_head_w = self._load_fp16("lm_head.weight")
            self.lm_head_ternary = False
        else:
            # Tied embeddings: reuse the embedding matrix as the output head.
            self.lm_head_w = self.embed
            self.lm_head_ternary = False

        # Per-layer weights, stored as dicts keyed by role.
        self.layers = []
        for i in range(self.n_layers):
            layer = {}
            prefix = f"model.layers.{i}"

            # Attention
            layer["q_proj"] = self._load_ternary(f"{prefix}.self_attn.q_proj.weight")
            layer["k_proj"] = self._load_ternary(f"{prefix}.self_attn.k_proj.weight")
            layer["v_proj"] = self._load_ternary(f"{prefix}.self_attn.v_proj.weight")
            layer["o_proj"] = self._load_ternary(f"{prefix}.self_attn.o_proj.weight")

            # MLP
            layer["gate_proj"] = self._load_ternary(f"{prefix}.mlp.gate_proj.weight")
            layer["up_proj"] = self._load_ternary(f"{prefix}.mlp.up_proj.weight")
            layer["down_proj"] = self._load_ternary(f"{prefix}.mlp.down_proj.weight")

            # Norms (FP16 -> FP32)
            layer["input_norm"] = self._load_fp16(f"{prefix}.input_layernorm.weight")
            layer["post_norm"] = self._load_fp16(f"{prefix}.post_attention_layernorm.weight")

            # Load biases if they exist (Qwen2 uses QKV biases).
            for proj in ["q_proj", "k_proj", "v_proj"]:
                bias_key = f"{prefix}.self_attn.{proj}.bias"
                if bias_key in self.manifest.get("fp16", {}):
                    layer[f"{proj}_bias"] = self._load_fp16(bias_key)

            self.layers.append(layer)
            # Progress report every 7 layers (28 layers -> 4 lines).
            if (i + 1) % 7 == 0:
                print(f" Loaded {i+1}/{self.n_layers} layers")

        print(f" Loaded {self.n_layers}/{self.n_layers} layers")
233
+
234
    def _compute_memory(self):
        """Report memory usage, split into ternary bitplanes vs float tensors."""
        ternary_bytes = 0
        fp_bytes = 0

        for layer in self.layers:
            # Ternary layers: two uint64 bitplanes + one scale per row.
            for key in ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"]:
                tl = layer[key]
                ternary_bytes += tl.pos.nbytes + tl.neg.nbytes + tl.scales.nbytes
            for key in ["input_norm", "post_norm"]:
                fp_bytes += layer[key].nbytes

        fp_bytes += self.embed.nbytes + self.final_norm.nbytes
        # Tied-embedding case double-counts self.embed here (lm_head_w IS
        # embed); harmless for a rough report but worth knowing.
        if not self.lm_head_ternary:
            fp_bytes += self.lm_head_w.nbytes if hasattr(self, 'lm_head_w') else 0

        total = ternary_bytes + fp_bytes
        print(f"\nMemory: ternary={ternary_bytes/1024/1024:.1f}MB, "
              f"fp={fp_bytes/1024/1024:.1f}MB, total={total/1024/1024:.1f}MB")
254
+
255
+ def _rmsnorm(self, x, weight):
256
+ """RMSNorm using C kernel."""
257
+ y = np.zeros_like(x)
258
+ self.kernel.rmsnorm_avx512(
259
+ x.ctypes.data, weight.ctypes.data, y.ctypes.data,
260
+ len(x), ctypes.c_float(self.eps)
261
+ )
262
+ return y
263
+
264
    def _attention(self, x, layer, cache, layer_idx, pos):
        """Grouped-Query Attention for a single token.

        x:     normed hidden state, float32[hidden]
        pos:   absolute position of this token (used by RoPE)
        Returns the o_proj output, float32[hidden].
        """
        h = self.hidden  # NOTE(review): unused local, kept for readability
        n_h = self.n_heads
        n_kv = self.n_kv
        hd = self.head_dim

        # Project Q, K, V through the ternary layers.
        q = layer["q_proj"].forward(x)  # [n_heads * head_dim]
        k = layer["k_proj"].forward(x)  # [n_kv * head_dim]
        v = layer["v_proj"].forward(x)  # [n_kv * head_dim]

        # Add biases if present (Qwen2 has QKV biases).
        if "q_proj_bias" in layer:
            q += layer["q_proj_bias"]
        if "k_proj_bias" in layer:
            k += layer["k_proj_bias"]
        if "v_proj_bias" in layer:
            v += layer["v_proj_bias"]

        # Reshape to [heads, head_dim]
        q = q.reshape(n_h, hd)
        k = k.reshape(n_kv, hd)
        v = v.reshape(n_kv, hd)

        # RoPE — the C kernel rotates q and k in place via raw pointers.
        self.kernel.apply_rope(
            q.ctypes.data, k.ctypes.data,
            n_h, n_kv, hd, pos,
            ctypes.c_float(self.rope_theta)
        )

        # Update KV cache (written at the current uncommitted slot).
        cache.append(layer_idx, k, v)

        # Get full K, V history
        k_all, v_all = cache.get(layer_idx)  # [seq_len, n_kv, head_dim]
        seq_len = k_all.shape[0]  # NOTE(review): unused below

        # GQA: each group of Q heads shares one KV head.
        heads_per_kv = n_h // n_kv

        # Compute attention for each head
        output = np.zeros(n_h * hd, dtype=np.float32)
        scale = 1.0 / np.sqrt(hd)

        for head in range(n_h):
            kv_head = head // heads_per_kv
            q_h = q[head]  # [head_dim]

            # Attention scores: q @ K^T
            scores = np.dot(k_all[:, kv_head, :], q_h) * scale  # [seq_len]

            # Causal mask (all visible for single token generation)
            # Numerically stable softmax over the history.
            scores_max = np.max(scores)
            scores = np.exp(scores - scores_max)
            scores /= np.sum(scores)

            # Weighted sum of values
            out_h = np.dot(scores, v_all[:, kv_head, :])  # [head_dim]
            output[head * hd:(head + 1) * hd] = out_h

        # Output projection
        return layer["o_proj"].forward(output)
329
+
330
    def _mlp(self, x, layer):
        """SwiGLU MLP: down_proj( SiLU(gate_proj(x)) * up_proj(x) ).

        x: normed hidden state, float32[hidden]; returns float32[hidden].
        """
        gate = layer["gate_proj"].forward(x)
        up = layer["up_proj"].forward(x)

        # SiLU on gate — kernel mutates `gate` in place.
        self.kernel.silu_avx512(gate.ctypes.data, len(gate))

        # gate * up — output pointer aliases `gate`, so the product lands
        # back in `gate` (intentional reuse, saves an allocation).
        self.kernel.elemwise_mul_avx512(
            gate.ctypes.data, up.ctypes.data, gate.ctypes.data, len(gate)
        )

        # Down projection back to hidden size.
        return layer["down_proj"].forward(gate)
345
+
346
+ def forward_token(self, token_id, cache, pos):
347
+ """Forward pass for a single token."""
348
+ # Embedding lookup
349
+ x = self.embed[token_id].copy() # [hidden]
350
+
351
+ # Transformer layers
352
+ for i, layer in enumerate(self.layers):
353
+ # Pre-attention norm
354
+ normed = self._rmsnorm(x, layer["input_norm"])
355
+
356
+ # Self-attention + residual
357
+ attn_out = self._attention(normed, layer, cache, i, pos)
358
+ x = x + attn_out
359
+
360
+ # Pre-MLP norm
361
+ normed = self._rmsnorm(x, layer["post_norm"])
362
+
363
+ # MLP + residual
364
+ mlp_out = self._mlp(normed, layer)
365
+ x = x + mlp_out
366
+
367
+ # Final norm
368
+ x = self._rmsnorm(x, self.final_norm)
369
+
370
+ return x
371
+
372
+ def logits(self, hidden):
373
+ """Compute logits from hidden state."""
374
+ if self.lm_head_ternary:
375
+ return self.lm_head.forward(hidden)
376
+ else:
377
+ return hidden @ self.lm_head_w.T
378
+
379
    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95):
        """Generate tokens autoregressively.

        token_ids:       prompt token ids (NOTE(review): an empty prompt
                         raises NameError below — `hidden` is only bound
                         inside the prefill loop).
        temperature:     < 0.01 means greedy argmax; otherwise top-p sampling.
        Returns (generated_token_ids, stats_dict).
        """
        cache = KVCache(self.n_layers, self.n_kv, self.head_dim)

        generated = []
        all_tokens = list(token_ids)

        t_start = time.time()

        # Prefill: process all input tokens sequentially. The last prompt
        # position is left uncommitted so decode step 0 sees it via get().
        for i, tid in enumerate(token_ids):
            hidden = self.forward_token(tid, cache, i)
            if i < len(token_ids) - 1:
                cache.advance()

        t_prefill = time.time() - t_start

        # Decode loop: sample, then feed the sampled token back in.
        t_decode_start = time.time()
        for step in range(max_new_tokens):
            # Get logits
            logit_vec = self.logits(hidden)

            # Sample
            if temperature < 0.01:
                next_token = int(np.argmax(logit_vec))
            else:
                logit_vec = logit_vec / temperature
                # Top-p (nucleus) sampling over the sorted distribution.
                sorted_idx = np.argsort(logit_vec)[::-1]
                sorted_logits = logit_vec[sorted_idx]

                # Numerically stable softmax.
                max_l = sorted_logits[0]
                probs = np.exp(sorted_logits - max_l)
                probs /= probs.sum()

                # Smallest prefix whose cumulative mass reaches top_p.
                cumsum = np.cumsum(probs)
                cutoff = np.searchsorted(cumsum, top_p) + 1

                top_probs = probs[:cutoff]
                top_probs /= top_probs.sum()
                top_idx = sorted_idx[:cutoff]

                next_token = int(np.random.choice(top_idx, p=top_probs))

            generated.append(next_token)
            all_tokens.append(next_token)

            # Check stop tokens (the EOS token is included in `generated`).
            if next_token in [151643, 151644, 151645]:  # Qwen EOS tokens
                break

            # Commit the previous position, then run the new token.
            cache.advance()
            hidden = self.forward_token(next_token, cache, len(all_tokens) - 1)

        t_total = time.time() - t_start
        t_decode = time.time() - t_decode_start
        n_gen = len(generated)

        stats = {
            "prefill_ms": t_prefill * 1000,
            "decode_ms": t_decode * 1000,
            "total_ms": t_total * 1000,
            "tokens_generated": n_gen,
            "tok_per_sec": n_gen / t_decode if t_decode > 0 else 0,
            "prefill_tokens": len(token_ids),
        }

        return generated, stats
449
+
450
+ # ============================================================
451
+ # Tokenizer wrapper
452
+ # ============================================================
453
class Tokenizer:
    """Thin wrapper over either a fast `tokenizers` tokenizer (tokenizer.json
    on disk) or a `transformers` AutoTokenizer fallback loaded from the hub.
    """

    def __init__(self, model_dir):
        from tokenizers import Tokenizer as HFTokenizer
        tok_path = os.path.join(model_dir, "tokenizer.json")
        if not os.path.exists(tok_path):
            # No local tokenizer.json — fall back to transformers.
            from transformers import AutoTokenizer
            self.tok = AutoTokenizer.from_pretrained(model_dir)
            self._is_transformers = True
            return
        self.tok = HFTokenizer.from_file(tok_path)
        self._is_transformers = False

    def encode(self, text):
        """Return a list of token ids for `text`."""
        encoded = self.tok.encode(text)
        # Fast tokenizers return an Encoding object; transformers a list.
        return encoded if self._is_transformers else encoded.ids

    def decode(self, ids):
        """Return the text for a list of token ids."""
        if self._is_transformers:
            return self.tok.decode(ids, skip_special_tokens=True)
        return self.tok.decode(ids)

    def apply_chat_template(self, messages):
        """Build the Qwen ChatML prompt string from role/content dicts."""
        rendered = [
            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>"
            for msg in messages
        ]
        rendered.append("<|im_start|>assistant\n")
        return "".join(rendered)
486
+
487
if __name__ == "__main__":
    import sys

    # Usage: python inference.py [model_dir]
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-ternary"
    # Kernel .so is expected next to this script.
    kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

    model = TernaryQwen(model_dir, kernel)

    # Quick smoke test: a hand-built ChatML prompt ("Hello" as user turn).
    test_ids = [151644, 8948, 198, 151645, 198, 151644, 872, 198, 9707, 151645, 198, 151644, 77091, 198]

    print("\nGenerating...")
    tokens, stats = model.generate(test_ids, max_new_tokens=50, temperature=0.6)
    print(f"Generated {stats['tokens_generated']} tokens")
    print(f"Speed: {stats['tok_per_sec']:.1f} tok/s")
    print(f"Prefill: {stats['prefill_ms']:.0f}ms, Decode: {stats['decode_ms']:.0f}ms")
    print(f"Token IDs: {tokens}")
log_unary_engine.c ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LOG-UNARY TRANSFORMER ENGINE
3
+ *
4
+ * Unary base-1 with logarithmic compression:
5
+ * Linear unary: value 7 = 1111111 (7 planes, each = +1)
6
+ * Log unary: value 8 = 111 (3 planes, plane p = 2^p)
7
+ *
8
+ * Matmul kernel: acc += popcount(w_plane[p] AND x_plane[q]) << (p+q)
9
+ * Still pure AND+popcount+shift, no float in hot path.
10
+ *
11
+ * 3 log-planes = values {0,1,2,4} with sign = {-4..+4} = 9 levels
12
+ * 4 log-planes = values {0,1,2,4,8} with sign = {-8..+8} = 17 levels
13
+ * 5 log-planes = values {0,1,2,4,8,16} with sign = {-16..+16} = 33 levels
14
+ *
15
+ * vs linear 7 planes = {-7..+7} = 15 levels using 7 planes
16
+ *
17
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+ */
19
+
20
+ #include <immintrin.h>
21
+ #include <omp.h>
22
+ #include <stdint.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+ #include <math.h>
26
+ #include <stdio.h>
27
+ #include <time.h>
28
+
29
+ #define MAX_SEQ 4096
30
+ #define RMS_EPS 1e-6f
31
+
32
/* ============================================================
 * Config
 * ============================================================ */
typedef struct {
    int hidden;          /* model (residual) width */
    int inter;           /* MLP intermediate width */
    int n_heads;         /* query heads */
    int n_kv_heads;      /* key/value heads (GQA) */
    int head_dim;
    int n_layers;
    int vocab;
    float rope_theta;    /* RoPE base frequency */
    int tie_embeddings;  /* nonzero: embedding matrix doubles as lm_head */
    int w_planes; /* weight log-planes */
    int a_planes; /* activation log-planes */
} Config;

/* Log-unary weight matrix: sign bitplane + n_planes magnitude bitplanes,
 * where plane p contributes 2^p, plus one float scale per output row. */
typedef struct {
    uint64_t *sign_bits; /* [out_dim * chunks] */
    uint64_t *log_planes; /* [n_planes][out_dim * chunks] - plane p = 2^p */
    float *scales; /* [out_dim] */
    int out_dim;
    int in_dim;
    int n_planes;
    int chunks;          /* (in_dim + 63) / 64 */
} LogUnaryWeight;

/* Transformer layer */
typedef struct {
    LogUnaryWeight q_proj, k_proj, v_proj, o_proj;
    LogUnaryWeight gate_proj, up_proj, down_proj;
    float *input_norm;   /* pre-attention RMSNorm weights */
    float *post_norm;    /* pre-MLP RMSNorm weights */
    float *q_norm, *k_norm; /* optional QK-norm weights (may be NULL) */
} Layer;

/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed;     /* FP16 embedding table [vocab * hidden] */
    Layer *layers;
    float *final_norm;

    /* KV cache — laid out [layer][pos][kv_head][head_dim], see kv_ptr() */
    float *k_cache;
    float *v_cache;

    /* Float scratch (O(dim) ops only) */
    float *hidden;
    float *normed;
    float *q_float;
    float *k_float;
    float *v_float;
    float *attn_out;
    float *gate_float;
    float *up_float;
    float *mlp_act;
    float *logits;
    float *attn_scores;

    /* Unary scratch for activation quantization (hidden-sized) */
    uint64_t *act_sign;
    uint64_t *act_planes;

    /* Larger scratch for intermediate dim */
    uint64_t *mlp_act_sign;
    uint64_t *mlp_act_planes;
} Model;
101
+
102
/* ============================================================
 * LOG-UNARY ACTIVATION QUANTIZATION
 *
 * Encode float value as sign + log-magnitude planes
 * Plane p is set if |x| >= threshold_p
 * threshold_p = scale * 2^p / max_level
 *
 * Effectively: compute integer magnitude = round(|x|/scale * max_level)
 * Then decompose into binary: if bit p is set in magnitude, plane p is set
 *
 * Wait — that's just BINARY encoding of the magnitude!
 * Log-unary IS binary representation stored as separate bitplanes.
 * The magic is that AND+popcount+shift MULTIPLIES them.
 * ============================================================ */
static void quantize_log_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    /* sign_out: 1 bit per element (set = negative).
     * planes_out: n_planes consecutive bitplanes of `chunks` words each.
     * scale_out: per-tensor absmax scale so dequant = mag * scale. */
    int chunks = (dim + 63) / 64;
    int max_level = (1 << n_planes) - 1; /* 2^n - 1 */

    /* Find absmax */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) amax = 1.0f; /* avoid div-by-zero on all-zero input */
    *scale_out = amax / max_level;

    memset(sign_out, 0, chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

    float inv_scale = max_level / amax;
    for (int i = 0; i < dim; i++) {
        int chunk = i / 64;
        int bit = i % 64;
        uint64_t mask = 1ULL << bit;

        if (x[i] < 0.0f)
            sign_out[chunk] |= mask;

        /* Round-to-nearest magnitude, clamped to max_level. */
        int mag = (int)(fabsf(x[i]) * inv_scale + 0.5f);
        if (mag > max_level) mag = max_level;

        /* Binary decomposition: plane p gets bit p of magnitude */
        for (int p = 0; p < n_planes; p++) {
            if (mag & (1 << p))
                planes_out[(size_t)p * chunks + chunk] |= mask;
        }
    }
}
154
+
155
/* ============================================================
 * LOG-UNARY MATVEC: y = W @ x
 *
 * W: log-unary (sign + wp log-planes, scales)
 * x: log-unary (sign + xp log-planes, scale)
 *
 * For each output element i:
 *   acc = 0
 *   for each chunk c:
 *     same = ~(w_sign[c] ^ x_sign[c])
 *     diff = w_sign[c] ^ x_sign[c]
 *     for p in 0..wp-1:
 *       for q in 0..xp-1:
 *         active = w_plane[p][c] & x_plane[q][c]
 *         pos = popcount(active & same)
 *         neg = popcount(active & diff)
 *         acc += (pos - neg) << (p + q)   <-- THE KEY: shift by p+q
 *   y[i] = acc * w_scale[i] * x_scale
 * ============================================================ */
static void log_unary_matvec(
    const LogUnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

    /* Rows are independent — parallelize over outputs. */
    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;
        long long acc = 0; /* 64-bit: popcount<<shift sums can exceed int */

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            /* same: product positive; diff: product negative. */
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                /* Plane p of row i; layout is [plane][row][chunk]. */
                uint64_t w_mag = W->log_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;
                    if (!active) continue; /* skip zero — common with log encoding */

                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    int shift = p + q; /* 2^p * 2^q contribution */
                    acc += (long long)(__builtin_popcountll(pos) -
                                       __builtin_popcountll(neg)) << shift;
                }
            }
        }

        /* Dequantize: combine row scale with activation scale. */
        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
216
+
217
/* ============================================================
 * FP16 ops (embedding, lm_head) — not in the critical per-layer path
 * ============================================================ */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    /* Copy one FP16 embedding row into `out` as float32. */
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    /* Main loop: convert 16 half floats per iteration (AVX-512 + F16C). */
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    /* Scalar tail: broadcast one half, convert, store a single lane. */
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
234
+
235
/* y = W @ x with FP16 weights and float32 activations (used for lm_head). */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        /* 16-wide FMA over half-precision weights widened on the fly. */
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Scalar tail for in_dim not a multiple of 16. */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf; _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
256
+
257
+ /* ============================================================
258
+ * O(dim) float ops — RMSNorm, SiLU, Softmax, RoPE, residual
259
+ * ============================================================ */
260
+ static void rmsnorm(const float *x, const float *w, float *y, int dim) {
261
+ float ss = 0.0f;
262
+ for (int i = 0; i < dim; i++) ss += x[i] * x[i];
263
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
264
+ for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
265
+ }
266
+
267
+ static void silu_mul(const float *gate, const float *up, float *out, int n) {
268
+ for (int i = 0; i < n; i++)
269
+ out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
270
+ }
271
+
272
/* y += x, elementwise residual accumulate. */
static void vec_add(float *y, const float *x, int n) {
    int i = 0;
    while (i < n) {
        y[i] += x[i];
        i++;
    }
}
275
+
276
/* Rotary position embedding applied in place to one head vector.
 * Rotates ADJACENT pairs (i, i+1) by angle pos / theta^(i/dim).
 * NOTE(review): HF Qwen2/Llama rotate half-split pairs (i, i+dim/2);
 * this adjacent-pair form is only equivalent if the converter permuted
 * the q/k weight rows accordingly — confirm against the conversion script. */
static void apply_rope(float *vec, int pos, int dim, float theta) {
    for (int i = 0; i < dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / dim);
        float angle = pos * freq;
        float co = cosf(angle), si = sinf(angle);
        float v0 = vec[i], v1 = vec[i+1];
        vec[i] = v0*co - v1*si;
        vec[i+1] = v0*si + v1*co;
    }
}
286
+
287
+ static void softmax(float *x, int n) {
288
+ float mx = x[0];
289
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
290
+ float sum = 0.0f;
291
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
292
+ float inv = 1.0f / sum;
293
+ for (int i = 0; i < n; i++) x[i] *= inv;
294
+ }
295
+
296
/* Address of one head's cached vector; cache layout is
 * [layer][pos][kv_head][head_dim] with pos capacity MAX_SEQ. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
300
+
301
/* ============================================================
 * ATTENTION
 * ============================================================ */
/* One token of GQA attention for layer `layer_idx` at position `pos`.
 * Reads m->normed (pre-normed hidden); writes the result to m->attn_out. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;
    int hidden_chunks = (c->hidden + 63) / 64; /* NOTE(review): unused */
    float act_scale;

    /* Quantize normed hidden -> log-unary */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Q, K, V — log-unary matmul */
    log_unary_matvec(&L->q_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->q_float);
    log_unary_matvec(&L->k_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->k_float);
    log_unary_matvec(&L->v_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->v_float);

    /* QK-Norm (optional, per-head RMSNorm; NULL when the model has none) */
    if (L->q_norm)
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q_float + h*c->head_dim, L->q_norm, m->q_float + h*c->head_dim, c->head_dim);
    if (L->k_norm)
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k_float + h*c->head_dim, L->k_norm, m->k_float + h*c->head_dim, c->head_dim);

    /* RoPE */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);

    /* KV cache store */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k_float + h*c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v_float + h*c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention dot products + softmax + weighted sum */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv; /* GQA: q-head group shares one kv head */
        float *qh = m->q_float + h*c->head_dim;
        float *oh = m->attn_out + h*c->head_dim;

        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }
        softmax(m->attn_scores, pos + 1);
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue; /* skip negligible weights */
            float *vc = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection — quantize attn_out, then log-unary matmul.
     * NOTE(review): these two aligned_alloc calls happen on every token of
     * every layer; hoisting them into Model scratch would avoid the churn. */
    int o_dim = c->n_heads * c->head_dim;
    int o_chunks = (o_dim + 63) / 64;
    uint64_t *o_sign = (uint64_t *)aligned_alloc(64, o_chunks * sizeof(uint64_t));
    uint64_t *o_planes = (uint64_t *)aligned_alloc(64, (size_t)c->a_planes * o_chunks * sizeof(uint64_t));
    float o_scale;
    quantize_log_unary(m->attn_out, o_dim, c->a_planes, o_sign, o_planes, &o_scale);

    float *o_tmp = m->normed; /* reuse — normed is dead after the Q/K/V quantize */
    log_unary_matvec(&L->o_proj, o_sign, o_planes, o_scale, c->a_planes, o_tmp);
    memcpy(m->attn_out, o_tmp, c->hidden * sizeof(float));

    free(o_sign); free(o_planes);
}
380
+
381
/* ============================================================
 * MLP
 * ============================================================ */
/* SwiGLU MLP: reads m->normed, leaves the down_proj result in m->normed. */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    int hidden_chunks = (c->hidden + 63) / 64; /* NOTE(review): unused */
    int inter_chunks = (c->inter + 63) / 64;   /* NOTE(review): unused */
    float act_scale, mlp_scale;

    /* Quantize normed input */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Gate + Up — log-unary */
    log_unary_matvec(&L->gate_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->gate_float);
    log_unary_matvec(&L->up_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->up_float);

    /* SiLU(gate) * up */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Quantize for down projection (uses the larger inter-sized scratch) */
    quantize_log_unary(m->mlp_act, c->inter, c->a_planes,
                       m->mlp_act_sign, m->mlp_act_planes, &mlp_scale);

    /* Down — log-unary; result overwrites m->normed for the caller */
    log_unary_matvec(&L->down_proj, m->mlp_act_sign, m->mlp_act_planes, mlp_scale, c->a_planes, m->normed);
}
409
+
410
/* ============================================================
 * FORWARD
 * ============================================================ */
/* Full forward pass for one token; returns a pointer to m->logits.
 * NOTE(review): logits are only recomputed when cfg.tie_embeddings is set —
 * for untied models the returned buffer is stale unless a separate lm_head
 * path fills it elsewhere. Confirm against the loader. */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    /* Embedding lookup into the float hidden-state buffer. */
    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Attention sub-block with residual add. */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);
        /* MLP sub-block with residual add (mlp leaves output in m->normed). */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    if (c->tie_embeddings)
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);

    return m->logits;
}
434
+
435
/* ============================================================
 * SAMPLING
 * ============================================================ */
/* Nucleus (top-p) sampling. Applies temperature (divide by T), then
 * softmax IN PLACE over logits, partially selection-sorts the
 * probabilities until cumulative mass >= top_p (hard-capped at 40
 * candidates, i.e. an implicit top-k of 40), renormalizes over the
 * kept candidates and draws one index.
 * NOTE: logits[] is clobbered (scaled + softmaxed) by this call. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: move the next-largest prob into slot n */
    int n = 0; float cum = 0.0f;
    while (cum < top_p && n < vocab) {
        int best = n;
        for (int i = n+1; i < vocab; i++) if (probs[i] > probs[best]) best = i;
        float t = probs[n]; probs[n] = probs[best]; probs[best] = t;
        int ti = indices[n]; indices[n] = indices[best]; indices[best] = ti;
        cum += probs[n]; n++;
        if (n >= 40) break;     /* cap the O(vocab) scans: effective top-k 40 */
    }
    /* Renormalize over the kept candidates and sample proportionally */
    float sum = 0; for (int i = 0; i < n; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float a = 0; int ch = indices[0];
    for (int i = 0; i < n; i++) { a += probs[i]; if (a >= r) { ch = indices[i]; break; } }
    free(probs); free(indices);
    return ch;
}
466
+
467
+ int generate(Model *m, const int *prompt, int plen, int *out, int max_new,
468
+ float temperature, float top_p, int eos) {
469
+ srand(time(NULL));
470
+ for (int i = 0; i < plen; i++) forward_token(m, prompt[i], i);
471
+ int pos = plen, gen = 0;
472
+ for (int t = 0; t < max_new; t++) {
473
+ int next;
474
+ if (temperature <= 0) {
475
+ next = 0;
476
+ for (int i = 1; i < m->cfg.vocab; i++)
477
+ if (m->logits[i] > m->logits[next]) next = i;
478
+ } else {
479
+ next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
480
+ }
481
+ out[t] = next; gen++;
482
+ if (next == eos) break;
483
+ forward_token(m, next, pos); pos++;
484
+ }
485
+ return gen;
486
+ }
487
+
488
/* ============================================================
 * ALLOCATION
 * ============================================================ */
/* Allocate a Model plus all per-token scratch buffers and the KV cache,
 * store the configuration, and print a summary banner.
 * w_planes / a_planes: number of log-unary bitplanes for weights and
 * activations. Returns a heap-allocated Model; weights are attached
 * later via the model_set_* / layer_set_* setters.
 * NOTE(review): allocation results are not checked for NULL, and
 * aligned_alloc sizes are not rounded to a multiple of the 64-byte
 * alignment (C11 requires that) — confirm target platforms accept it. */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* Full-length float KV cache: [layers][MAX_SEQ][kv_heads][head_dim] */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* m->normed is shared between hidden- and inter-sized uses */
    int max_dim = inter > hidden ? inter : hidden;
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->normed = (float *)aligned_alloc(64, max_dim * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));

    /* Unary scratch for hidden dim */
    int h_chunks = (hidden + 63) / 64;
    m->act_sign = (uint64_t *)aligned_alloc(64, h_chunks * sizeof(uint64_t));
    m->act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * h_chunks * sizeof(uint64_t));

    /* Unary scratch for intermediate dim */
    int i_chunks = (inter + 63) / 64;
    m->mlp_act_sign = (uint64_t *)aligned_alloc(64, i_chunks * sizeof(uint64_t));
    m->mlp_act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * i_chunks * sizeof(uint64_t));

    /* Largest representable magnitude level per plane count */
    int w_max = (1 << w_planes) - 1;
    int a_max = (1 << a_planes) - 1;

    printf("LOG-UNARY ENGINE\n");
    printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf(" Weight: %d log-planes -> %d levels (range -%d..+%d)\n",
           w_planes, 2*w_max+1, w_max, w_max);
    printf(" Activation: %d log-planes -> %d levels (range -%d..+%d)\n",
           a_planes, 2*a_max+1, a_max, a_max);
    printf(" Plane pairs per element: %d (vs %d linear)\n",
           w_planes * a_planes, 7 * 4);
    printf(" KV cache: %zu MB\n", kv_size * 2 * sizeof(float) / (1024*1024));

    return m;
}
552
+
553
/* Weight setters */
/* Attach the fp16 embedding table (borrowed pointer, not copied). */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
555
/* Copy the final RMSNorm weights into the model-owned buffer. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
556
+
557
+ void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
558
+ m->layers[l].input_norm = in_norm;
559
+ m->layers[l].post_norm = post_norm;
560
+ }
561
+
562
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
563
+ m->layers[l].q_norm = q_norm;
564
+ m->layers[l].k_norm = k_norm;
565
+ }
566
+
567
+ static void init_weight(LogUnaryWeight *w, uint64_t *sign, uint64_t *planes, float *scales,
568
+ int out_dim, int in_dim, int n_planes) {
569
+ w->sign_bits = sign; w->log_planes = planes; w->scales = scales;
570
+ w->out_dim = out_dim; w->in_dim = in_dim; w->n_planes = n_planes;
571
+ w->chunks = (in_dim + 63) / 64;
572
+ }
573
+
574
/* Wire up all seven linear projections of one layer from pre-packed
 * sign bits (*_s), log planes (*_p) and per-row scales (*_sc), with
 * their (out, in) dimensions. n_planes is shared by all projections.
 * All pointers are borrowed, not copied. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
    uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
    uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
    uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
    uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
    uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
    uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
    int n_planes
) {
    init_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
    init_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
    init_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
    init_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
    init_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
    init_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
    init_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
}
593
+
594
+ void model_reset_cache(Model *m) {
595
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
596
+ memset(m->k_cache, 0, kv_size * sizeof(float));
597
+ memset(m->v_cache, 0, kv_size * sizeof(float));
598
+ }
logunary_tensor.c ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define _POSIX_C_SOURCE 199309L
2
+ /*
3
+ * LOG-UNARY TENSOR LIBRARY
4
+ *
5
+ * Native tensor type where values are represented as:
6
+ * sign (1 bit) + log-magnitude bitplanes
7
+ *
8
+ * Plane p is set if |value| >= 2^(p - bias)
9
+ * With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B)
10
+ *
11
+ * ALL arithmetic stays in this representation:
12
+ * - matmul: AND + weighted_popcount (shift by p+q-2*bias)
13
+ * - add: bitwise merge with carry propagation
14
+ * - scale: shift planes up/down
15
+ * - negate: flip sign bits
16
+ *
17
+ * Float conversion only at boundaries (embed lookup, final logits)
18
+ *
19
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
20
+ */
21
+
22
+ #include <immintrin.h>
23
+ #include <omp.h>
24
+ #include <stdint.h>
25
+ #include <stdlib.h>
26
+ #include <string.h>
27
+ #include <math.h>
28
+ #include <stdio.h>
29
+ #include <time.h>
30
+
31
+ /* ============================================================
32
+ * LOG-UNARY TENSOR
33
+ *
34
+ * For a vector of length `dim`:
35
+ * sign: uint64[chunks] - 1 bit per element
36
+ * planes: uint64[n_planes][chunks] - 1 bit per element per plane
37
+ * chunks = (dim + 63) / 64
38
+ *
39
+ * Plane p is set if |value| >= threshold[p]
40
+ * threshold[p] = base_scale * 2^(p - bias)
41
+ *
42
+ * This is a LOG thermometer code:
43
+ * value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001)
44
+ * value=1.0 with bias=10 -> planes 0-10 set
45
+ * value=64.0 with bias=10 -> planes 0-16 set
46
+ *
47
+ * ============================================================ */
48
/* One log-unary encoded vector: a sign bitmap plus n_planes thermometer
 * bitplanes. Element i lives at bit (i % 64) of chunk i / 64. */
typedef struct {
    uint64_t *sign;     /* [chunks] - 1 bit per element, set = negative */
    uint64_t *planes;   /* [n_planes * chunks] contiguous, plane-major */
    int dim;            /* logical element count */
    int chunks;         /* (dim + 63) / 64 */
    int n_planes;
    int bias;           /* log2 offset: threshold[p] = base * 2^(p-bias) */
    float base_scale;   /* per-tensor scale factor */
} LogUnaryTensor;
57
+
58
/* 2D tensor (matrix) - row-major */
/* Plane p of row r starts at planes[((size_t)p * rows + r) * chunks]. */
typedef struct {
    uint64_t *sign;     /* [rows * chunks_per_row] */
    uint64_t *planes;   /* [n_planes * rows * chunks_per_row] */
    float *row_scales;  /* [rows] per-row base scales */
    int rows;
    int cols;
    int chunks;         /* chunks per row = (cols+63)/64 */
    int n_planes;
    int bias;
} LogUnaryMatrix;
69
+
70
+ /* ============================================================
71
+ * ALLOCATION
72
+ * ============================================================ */
73
+ LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) {
74
+ LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor));
75
+ t->dim = dim;
76
+ t->n_planes = n_planes;
77
+ t->bias = bias;
78
+ t->chunks = (dim + 63) / 64;
79
+ t->base_scale = 1.0f;
80
+ t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t));
81
+ t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t));
82
+ memset(t->sign, 0, t->chunks * sizeof(uint64_t));
83
+ memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t));
84
+ return t;
85
+ }
86
+
87
+ LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) {
88
+ LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix));
89
+ m->rows = rows;
90
+ m->cols = cols;
91
+ m->n_planes = n_planes;
92
+ m->bias = bias;
93
+ m->chunks = (cols + 63) / 64;
94
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
95
+ m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
96
+ m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float));
97
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
98
+ memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
99
+ for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f;
100
+ return m;
101
+ }
102
+
103
+ void lut_free(LogUnaryTensor *t) {
104
+ if (t) { free(t->sign); free(t->planes); free(t); }
105
+ }
106
+ void lum_free(LogUnaryMatrix *m) {
107
+ if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); }
108
+ }
109
+
110
/* ============================================================
 * FLOAT <-> LOG-UNARY CONVERSION
 * Only used at boundaries (embedding, final output)
 * ============================================================ */
/* Quantize a float vector into t (sign + thermometer log planes).
 * base_scale is chosen per call so the vector's absmax lands exactly
 * on the highest plane's threshold. Values below the lowest threshold
 * set no planes and decode back to 0. An all-zero input leaves the
 * planes cleared with base_scale = 1. */
void lut_from_float(LogUnaryTensor *t, const float *x) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(t->sign, 0, chunks * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t));

    /* Find absmax for base_scale */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) { t->base_scale = 1.0f; return; }

    /* Set base_scale so that max value uses the highest plane */
    /* threshold[np-1] = base_scale * 2^(np-1-bias) should equal amax */
    t->base_scale = amax / ldexpf(1.0f, np - 1 - bias);

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Note: -0.0f compares equal to 0.0f, so it stays positive */
        if (x[i] < 0.0f) t->sign[c] |= bit;

        float mag = fabsf(x[i]);
        /* Set planes from low to high: plane p set if mag >= base * 2^(p-bias) */
        for (int p = 0; p < np; p++) {
            float thresh = t->base_scale * ldexpf(1.0f, p - bias);
            if (mag >= thresh)
                t->planes[(size_t)p * chunks + c] |= bit;
            else
                break; /* thermometer: once we stop, all higher planes are 0 */
        }
    }
}
152
+
153
/* Dequantize t into out[dim] floats. Each element decodes from its
 * highest set plane; interior levels use the midpoint between the
 * plane's threshold and the next one to reduce rounding bias. An
 * element with no planes set decodes to exactly 0. */
void lut_to_float(const LogUnaryTensor *t, float *out) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(out, 0, dim * sizeof(float));

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Find highest set plane */
        int highest = -1;
        for (int p = np - 1; p >= 0; p--) {
            if (t->planes[(size_t)p * chunks + c] & bit) {
                highest = p;
                break;
            }
        }

        if (highest < 0) {
            out[i] = 0.0f;
        } else {
            /* Value is approximately base * 2^(highest - bias) */
            /* More precise: midpoint between this threshold and next */
            float val = t->base_scale * ldexpf(1.0f, highest - bias);
            if (highest < np - 1) {
                float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias);
                val = (val + next) * 0.5f; /* midpoint reconstruction */
            }
            out[i] = (t->sign[c] & bit) ? -val : val;
        }
    }
}
188
+
189
/* Convert float matrix to log-unary matrix (per-row scaling) */
/* `data` is row-major [rows * cols]. Each row gets its own base scale
 * (absmax-anchored to the top plane, like lut_from_float); an all-zero
 * row keeps scale 1.0 and no set planes. Plane layout is plane-major:
 * planes[((size_t)p * rows + r) * chunks + c]. */
void lum_from_float(LogUnaryMatrix *m, const float *data) {
    int rows = m->rows, cols = m->cols;
    int np = m->n_planes, bias = m->bias;
    int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;

        /* Per-row absmax */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; }
        m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias);

        uint64_t *row_sign = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);

            if (row[j] < 0.0f) row_sign[c] |= bit;

            float mag = fabsf(row[j]);
            /* Thermometer fill from the lowest plane upward */
            for (int p = 0; p < np; p++) {
                float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias);
                if (mag >= thresh)
                    m->planes[((size_t)p * rows + r) * chunks + c] |= bit;
                else
                    break;
            }
        }
    }
}
229
+
230
/* ============================================================
 * LOG-UNARY MATMUL: y = M @ x
 *
 * Both M (matrix) and x (vector) are log-unary encoded.
 *
 * For each output element y[i]:
 *   For each weight plane p, activation plane q:
 *     active = M.planes[p][i] AND x.planes[q]
 *     same   = active AND ~(M.sign[i] XOR x.sign)
 *     diff   = active AND  (M.sign[i] XOR x.sign)
 *     contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias)
 *
 * Output is a LogUnaryTensor (converted from integer accumulator)
 * ============================================================ */
/* Rows are parallelized with OpenMP; each row accumulates into a
 * private int64, so no synchronization is needed. The result is
 * materialized to float once, then requantized into y_out (which must
 * have dim == M->rows). */
void lum_matvec(
    const LogUnaryMatrix *M,
    const LogUnaryTensor *x,
    LogUnaryTensor *y_out /* output: log-unary encoded result */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wp = M->n_planes;
    int xp = x->n_planes;
    int w_bias = M->bias;
    int x_bias = x->bias;

    /* Accumulate to float temporarily, then requantize to log-unary.
     * The accumulator is integer shifts (2^(p+q-2bias)), which
     * we can do as int64 left-shifts for small exponents.
     *
     * For the exponent range we're in (p+q in [0,14] with bias ~4),
     * net shift is [-8, 6], so we use a fixed-point int64 accumulator
     * with a base shift to keep everything positive.
     */
    int base_shift = w_bias + x_bias; /* shift to add to make all exponents >= 0 */

    /* We'll accumulate as int64 with implicit 2^(-base_shift) factor */
    /* Then convert: float_val = acc * row_scale * x_scale * 2^(-base_shift) */

    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_plane = x->planes[(size_t)q * chunks + c];
                    uint64_t active = w_plane & x_plane;
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;

                    int count = __builtin_popcountll(pos) - __builtin_popcountll(neg);

                    /* Weighted by 2^(p + q) relative to base */
                    int shift = p + q; /* relative to 2^(-base_shift) */
                    if (count != 0)
                        acc += (long long)count << shift;
                }
            }
        }

        /* Convert: val = acc * row_scale * x_scale * 2^(-base_shift) */
        y_float[i] = (float)acc * M->row_scales[i] * x->base_scale
                     * ldexpf(1.0f, -base_shift);
    }

    /* Requantize float result to log-unary */
    lut_from_float(y_out, y_float);
    free(y_float);
}
310
+
311
+ /* ============================================================
312
+ * LOG-UNARY ELEMENT-WISE ADD: z = a + b
313
+ *
314
+ * Dequant both, add as float, requant.
315
+ * This is O(dim) so not the bottleneck.
316
+ * Future: direct bitwise add with carry chains.
317
+ * ============================================================ */
318
+ void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) {
319
+ int dim = a->dim;
320
+ float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
321
+ float *fb = (float *)aligned_alloc(64, dim * sizeof(float));
322
+
323
+ lut_to_float(a, fa);
324
+ lut_to_float(b, fb);
325
+
326
+ for (int i = 0; i < dim; i++) fa[i] += fb[i];
327
+
328
+ lut_from_float(out, fa);
329
+ free(fa); free(fb);
330
+ }
331
+
332
+ /* In-place add: a += b (dequant a, add float b, requant) */
333
+ void lut_add_float(LogUnaryTensor *a, const float *b) {
334
+ int dim = a->dim;
335
+ float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
336
+ lut_to_float(a, fa);
337
+ for (int i = 0; i < dim; i++) fa[i] += b[i];
338
+ lut_from_float(a, fa);
339
+ free(fa);
340
+ }
341
+
342
+ /* ============================================================
343
+ * LOG-UNARY RMSNORM
344
+ *
345
+ * Needs float for the sqrt/reciprocal, but O(dim).
346
+ * Input: log-unary, Output: log-unary
347
+ * ============================================================ */
348
+ void lut_rmsnorm(
349
+ const LogUnaryTensor *x,
350
+ const float *weight, /* norm weights stay float (tiny) */
351
+ LogUnaryTensor *out,
352
+ float eps
353
+ ) {
354
+ int dim = x->dim;
355
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
356
+ lut_to_float(x, xf);
357
+
358
+ float ss = 0.0f;
359
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
360
+ float rms = 1.0f / sqrtf(ss / dim + eps);
361
+
362
+ for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i];
363
+
364
+ lut_from_float(out, xf);
365
+ free(xf);
366
+ }
367
+
368
+ /* ============================================================
369
+ * LOG-UNARY SILU_MUL: out = SiLU(gate) * up
370
+ *
371
+ * O(dim), not bottleneck. Dequant, compute, requant.
372
+ * ============================================================ */
373
+ void lut_silu_mul(
374
+ const LogUnaryTensor *gate,
375
+ const LogUnaryTensor *up,
376
+ LogUnaryTensor *out
377
+ ) {
378
+ int dim = gate->dim;
379
+ float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
380
+ float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
381
+
382
+ lut_to_float(gate, gf);
383
+ lut_to_float(up, uf);
384
+
385
+ for (int i = 0; i < dim; i++)
386
+ gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
387
+
388
+ lut_from_float(out, gf);
389
+ free(gf); free(uf);
390
+ }
391
+
392
/* ============================================================
 * LOG-UNARY ROPE
 *
 * O(dim), dequant-compute-requant per head.
 * ============================================================ */
/* Apply rotary position embedding to one head-sized slice of t.
 * offset: token position; start: slice start index; head_dim: slice
 * length; theta: RoPE base. NOTE(review): the whole tensor is
 * requantized afterwards, so base_scale (and thus rounding) can shift
 * slightly even outside [start, start+head_dim) — confirm acceptable. */
void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) {
    /* Dequant the relevant slice, apply RoPE, requant */
    float *f = (float *)aligned_alloc(64, head_dim * sizeof(float));

    /* Extract slice */
    float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
    lut_to_float(t, full);
    memcpy(f, full + start, head_dim * sizeof(float));

    /* Rotate consecutive (even, odd) pairs by angle = pos * theta^(-i/head_dim) */
    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / head_dim);
        float angle = offset * freq;
        float c = cosf(angle), s = sinf(angle);
        float v0 = f[i], v1 = f[i + 1];
        f[i] = v0 * c - v1 * s;
        f[i + 1] = v0 * s + v1 * c;
    }

    memcpy(full + start, f, head_dim * sizeof(float));
    lut_from_float(t, full);
    free(f); free(full);
}
419
+
420
+ /* ============================================================
421
+ * UTILITY: Get float slice from log-unary tensor
422
+ * (for attention scores which need float softmax)
423
+ * ============================================================ */
424
+ void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) {
425
+ float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
426
+ lut_to_float(t, full);
427
+ memcpy(out, full + start, len * sizeof(float));
428
+ free(full);
429
+ }
430
+
431
/* ============================================================
 * BENCHMARK: measure matvec throughput
 * ============================================================ */
typedef struct {
    double total_and_ops;      /* AND ops issued per matvec call */
    double total_popcount_ops; /* popcount ops issued per matvec call */
    double wall_time_s;        /* mean wall time per matvec call */
    double elements_per_sec;   /* rows*cols elements processed per second */
    double gops;               /* giga-operations per second */
} BenchResult;
442
/* Benchmark lum_matvec on random bits: allocates a rows x cols matrix
 * with w_planes planes and a cols vector with x_planes planes, runs one
 * warmup call, then `iters` timed calls under CLOCK_MONOTONIC.
 * NOTE(review): the random fill uses ((uint64_t)rand() << 32) | rand();
 * with a 31-bit rand() the top bit of each word is never set — fine for
 * throughput, but the bit density is slightly below 50%. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* Fill with random bits */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warmup */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; /* AND + popcount pairs */

    BenchResult r;
    r.wall_time_s = dt / iters;
    r.total_and_ops = ops_per_call;
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}
482
+
483
/* ============================================================
 * ACCURACY TEST: convert float->logunary->float roundtrip
 * ============================================================ */
typedef struct {
    float max_error;   /* worst-case absolute roundtrip error */
    float mean_error;  /* mean absolute roundtrip error */
    float cosine_sim;  /* cosine similarity, original vs recovered */
    float snr_db;      /* signal-to-noise ratio in dB */
} AccuracyResult;
493
+ AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) {
494
+ float *original = (float *)aligned_alloc(64, dim * sizeof(float));
495
+ float *recovered = (float *)aligned_alloc(64, dim * sizeof(float));
496
+
497
+ /* Random normal-ish distribution */
498
+ for (int i = 0; i < dim; i++) {
499
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
500
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
501
+ original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
502
+ }
503
+
504
+ LogUnaryTensor *t = lut_alloc(dim, n_planes, bias);
505
+ lut_from_float(t, original);
506
+ lut_to_float(t, recovered);
507
+
508
+ float max_err = 0, sum_err = 0;
509
+ float dot = 0, na = 0, nb = 0;
510
+ for (int i = 0; i < dim; i++) {
511
+ float err = fabsf(original[i] - recovered[i]);
512
+ if (err > max_err) max_err = err;
513
+ sum_err += err;
514
+ dot += original[i] * recovered[i];
515
+ na += original[i] * original[i];
516
+ nb += recovered[i] * recovered[i];
517
+ }
518
+
519
+ float noise_power = 0;
520
+ for (int i = 0; i < dim; i++) {
521
+ float e = original[i] - recovered[i];
522
+ noise_power += e * e;
523
+ }
524
+
525
+ AccuracyResult r;
526
+ r.max_error = max_err;
527
+ r.mean_error = sum_err / dim;
528
+ r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
529
+ r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f));
530
+
531
+ lut_free(t);
532
+ free(original); free(recovered);
533
+ return r;
534
+ }
packed_convert.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Packed unary converter: uint8 magnitudes + bitpacked signs + per-row scales."""
3
+ import os, json, sys, time
4
+ import numpy as np
5
+ from pathlib import Path
6
+
7
def load_safetensors(model_dir):
    """Load every *.safetensors shard under model_dir.

    Returns a dict mapping tensor name -> float32 numpy array.
    Requires the third-party `safetensors` package (imported lazily so
    the converter's other entry points work without it).
    """
    from safetensors.torch import load_file
    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f" Loading {f.name}...")
        for k, v in load_file(str(f)).items():
            tensors[k] = v.float().numpy()  # upcast to float32 before numpy export
    return tensors
15
+
16
def quantize_packed(w, n_levels=7):
    """Quantize a 2-D float weight matrix to packed-unary form.

    Returns (mags, sign_bits, scales, row_max_mag, mean_mag, zero_frac):
      mags        uint8  [out, in]      magnitude 0..n_levels per weight
      sign_bits   uint64 [out, chunks]  bit j set iff weight j is negative
      scales      float32 [out]         per-row dequantization scale
      row_max_mag uint8  [out]          largest magnitude in each row
      mean_mag    float                 mean magnitude over all weights
      zero_frac   float                 fraction of exactly-zero magnitudes
    """
    out_dim, in_dim = w.shape
    chunks = (in_dim + 63) // 64
    padded = chunks * 64

    # Per-row absmax -> scale so the largest |weight| maps to n_levels.
    row_max = np.abs(w).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.ravel() / n_levels).astype(np.float32)

    # Rounded magnitudes, clipped to the representable range.
    mags = np.clip(np.round(np.abs(w / scales[:, None])), 0, n_levels).astype(np.uint8)
    neg_mask = w < 0
    row_max_mag = mags.max(axis=1).astype(np.uint8)

    # Pad the sign mask out to a whole number of 64-bit chunks.
    if in_dim < padded:
        padded_mask = np.zeros((out_dim, padded), dtype=bool)
        padded_mask[:, :in_dim] = neg_mask
    else:
        padded_mask = neg_mask
    bit_pos = np.uint64(1) << np.arange(64, dtype=np.uint64)
    sign_bits = np.bitwise_or.reduce(
        padded_mask.reshape(out_dim, chunks, 64).astype(np.uint64) * bit_pos, axis=2)

    return mags, sign_bits, scales, row_max_mag, np.mean(mags), np.mean(mags == 0)
34
+
35
def convert(tensors, output_dir, n_levels=7):
    """Quantize the linear projection weights to packed-unary files and
    dump everything else as raw fp16, plus config.json and manifest.json.

    NOTE(review): the config dict is hardcoded for
    DeepSeek-R1-Distill-Qwen-1.5B regardless of which model was loaded —
    confirm before converting other checkpoints.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = {"hidden_size":1536,"intermediate_size":8960,"num_attention_heads":12,
              "num_key_value_heads":2,"num_hidden_layers":28,"vocab_size":151936,
              "head_dim":128,"rope_theta":1000000.0,"rms_norm_eps":1e-6,
              "n_levels":n_levels,"quant_type":"packed_unary"}
    # Only the seven transformer projections get quantized.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight','k_proj.weight','v_proj.weight','o_proj.weight',
         'gate_proj.weight','up_proj.weight','down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)
    total_packed = total_orig = 0
    all_avg = []
    for key in linear_keys:
        w = tensors[key]; total_orig += w.nbytes
        t0 = time.time()
        mags, sb, sc, rmm, am, sp = quantize_packed(w, n_levels)
        dt = time.time() - t0
        # One file per component, named after the tensor with dots -> underscores.
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        mags.tofile(pfx+".mags"); sb.tofile(pfx+".signs")
        sc.tofile(pfx+".scales"); rmm.tofile(pfx+".rmm")
        ub = mags.nbytes + sb.nbytes + sc.nbytes + rmm.nbytes
        total_packed += ub; all_avg.append(am)
        print(f" {key}: {w.shape} -> {ub/1024:.0f}KB (avg_mag={am:.2f}, {dt:.1f}s)")
    # Embeddings, norms, biases etc. stay fp16.
    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(pfx+".fp16"); total_fp16 += w.nbytes
    manifest = {"packed":{k:list(tensors[k].shape) for k in linear_keys},
                "fp16":{k:list(tensors[k].shape) for k in other_keys}}
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)
    print(f"\n=== PACKED UNARY ===")
    print(f"Packed linear: {total_packed/1e6:.1f} MB | FP16 other: {total_fp16/1e6:.1f} MB")
    print(f"Total: {(total_packed+total_fp16)/1e6:.1f} MB | Avg mag: {np.mean(all_avg):.3f}")
    print(f"Expected speedup vs 7-plane: {7/np.mean(all_avg):.1f}x")
73
+
74
if __name__ == "__main__":
    # CLI: packed_convert.py [model_dir] [output_dir]
    model_dir = "deepseek-r1-1.5b-hf" if len(sys.argv) <= 1 else sys.argv[1]
    output_dir = "deepseek-r1-1.5b-packed" if len(sys.argv) <= 2 else sys.argv[2]
    tensors = load_safetensors(model_dir)
    convert(tensors, output_dir)
    print("Done!")
packed_engine.c ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PACKED UNARY TRANSFORMER ENGINE - AVX-512 + OpenMP
3
+ *
4
+ * Instead of 7 fixed bitplanes (scanning 80% zeros),
5
+ * store magnitude per weight directly. Kernel processes
6
+ * groups of 16 weights, only loops to local max magnitude.
7
+ *
8
+ * Weight j with magnitude 3: adds x[j] THREE times (pure unary).
9
+ * But only 3 passes for that group, not 7.
10
+ *
11
+ * Average magnitude = 1.374, so average ~1.4 passes per group
12
+ * instead of always 7. That's the 5x speedup.
13
+ *
14
+ * Format per output row:
15
+ * mags[in_dim] uint8 - magnitude 0-7 per weight
16
+ * signs[chunks] uint64 - bitpacked sign (1=negative)
17
+ * scale float - per-row scale
18
+ *
19
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
20
+ */
21
+
22
+ #include <immintrin.h>
23
+ #include <stdint.h>
24
+ #include <stdlib.h>
25
+ #include <string.h>
26
+ #include <math.h>
27
+ #include <stdio.h>
28
+ #include <time.h>
29
+ #include <omp.h>
30
+
31
+ #define HIDDEN 1536
32
+ #define INTER 8960
33
+ #define N_HEADS 12
34
+ #define N_KV_HEADS 2
35
+ #define HEAD_DIM 128
36
+ #define N_LAYERS 28
37
+ #define VOCAB 151936
38
+ #define RMS_EPS 1e-6f
39
+ #define ROPE_THETA 1000000.0f
40
+ #define MAX_SEQ 4096
41
+ #define GQA_RATIO (N_HEADS / N_KV_HEADS)
42
+
43
+ typedef struct {
44
+ uint8_t *mags; /* [out_dim * in_dim] magnitude per weight */
45
+ uint64_t *sign_bits; /* [out_dim * chunks] bitpacked signs */
46
+ float *scales; /* [out_dim] per-row scale */
47
+ float *bias; /* [out_dim] or NULL */
48
+ int out_dim, in_dim;
49
+ uint8_t *row_maxmag; /* [out_dim] max magnitude per row for early exit */
50
+ } PL; /* Packed Linear */
51
+
52
+ typedef struct { uint16_t *w; int od, id; } FL;
53
+
54
+ typedef struct {
55
+ PL qp, kp, vp, op, gp, up, dp;
56
+ float *in_norm, *pn_norm;
57
+ float *qb, *kb, *vb;
58
+ } Lay;
59
+
60
+ typedef struct {
61
+ uint16_t *emb;
62
+ Lay lay[N_LAYERS];
63
+ float *fnorm;
64
+ FL lmh;
65
+ float *kc, *vc;
66
+ float *h, *h2;
67
+ float *sq, *sk, *sv, *ao;
68
+ float *sg, *su, *sd;
69
+ float *lg, *as;
70
+ } M;
71
+
72
+ /* ============================================================
73
+ * PACKED UNARY MATVEC
74
+ *
75
+ * Process 16 weights at a time. For each group:
76
+ * 1. Load 16 magnitudes (uint8)
77
+ * 2. Find local max magnitude
78
+ * 3. For m = 1 to local_max:
79
+ * mask = (mag >= m)
80
+ * pos_mask = mask & ~sign
81
+ * neg_mask = mask & sign
82
+ * acc += masked x (pos)
83
+ * acc -= masked x (neg)
84
+ *
85
+ * Each pass = one unary "mark". Pure base-1.
86
+ * Groups where all mags <= 1: ONE pass.
87
+ * Groups where all mags == 0: ZERO passes. Skip entirely.
88
+ * ============================================================ */
89
+ static void pmv(const PL *L, const float *x, float *y) {
90
+ const int od = L->out_dim, id = L->in_dim;
91
+ const int chunks = (id + 63) / 64;
92
+ const int id16 = (id + 15) & ~15;
93
+
94
+ float *xp = (float*)aligned_alloc(64, id16 * sizeof(float));
95
+ memcpy(xp, x, id * sizeof(float));
96
+ if (id16 > id) memset(xp + id, 0, (id16 - id) * sizeof(float));
97
+
98
+ #pragma omp parallel for schedule(dynamic, 64)
99
+ for (int i = 0; i < od; i++) {
100
+ const uint8_t *row_mag = L->mags + (size_t)i * id;
101
+ const uint64_t *row_sign = L->sign_bits + (size_t)i * chunks;
102
+ const int rmax = L->row_maxmag[i];
103
+
104
+ __m512 acc = _mm512_setzero_ps();
105
+
106
+ for (int j = 0; j < id; j += 16) {
107
+ if (j >= id16) break;
108
+
109
+ /* Load 16 magnitudes */
110
+ __m128i mv = _mm_loadu_si128((__m128i*)(row_mag + j));
111
+
112
+ /* Quick check: if all 16 mags are zero, skip entirely */
113
+ if (_mm_testz_si128(mv, mv)) continue;
114
+
115
+ __m512 xv = _mm512_load_ps(xp + j);
116
+
117
+ /* Extract 16 sign bits from bitpacked array */
118
+ int chunk_idx = j / 64;
119
+ int bit_off = j % 64;
120
+ uint64_t sbits = row_sign[chunk_idx];
121
+ uint16_t signs = (uint16_t)((sbits >> bit_off) & 0xFFFF);
122
+
123
+ /* Find max magnitude in this group of 16 */
124
+ /* Use SSE horizontal max */
125
+ __m128i mx = mv;
126
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 8));
127
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 4));
128
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 2));
129
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 1));
130
+ int local_max = _mm_extract_epi8(mx, 0);
131
+
132
+ /* Threshold vector for comparisons */
133
+ for (int m = 1; m <= local_max; m++) {
134
+ /* mask = (mag >= m) */
135
+ __m128i thresh = _mm_set1_epi8((char)m);
136
+ /* Compare: result is 0xFF where mag >= m, 0 otherwise */
137
+ /* SSE doesn't have >= for uint8, use: NOT(max(thresh, mag) == thresh XOR mag == thresh) */
138
+ /* Simpler: mag >= m iff mag - m doesn't underflow, i.e. saturating sub == 0 is false */
139
+ /* Or: max(mag, thresh) == mag means mag >= thresh */
140
+ __m128i cmp = _mm_cmpeq_epi8(_mm_max_epu8(mv, thresh), mv);
141
+ uint16_t active = (uint16_t)_mm_movemask_epi8(cmp);
142
+
143
+ __mmask16 pos = (__mmask16)(active & ~signs);
144
+ __mmask16 neg = (__mmask16)(active & signs);
145
+
146
+ acc = _mm512_mask_add_ps(acc, pos, acc, xv);
147
+ acc = _mm512_mask_sub_ps(acc, neg, acc, xv);
148
+ }
149
+ }
150
+
151
+ y[i] = _mm512_reduce_add_ps(acc) * L->scales[i];
152
+ if (L->bias) y[i] += L->bias[i];
153
+ }
154
+ free(xp);
155
+ }
156
+
157
+ /* FP16 matvec for lm_head */
158
+ static void fmv(const FL *L, const float *x, float *y) {
159
+ #pragma omp parallel for schedule(dynamic, 256)
160
+ for (int i = 0; i < L->od; i++) {
161
+ __m512 acc = _mm512_setzero_ps();
162
+ const uint16_t *row = L->w + (size_t)i * L->id;
163
+ int j;
164
+ for (j = 0; j + 16 <= L->id; j += 16) {
165
+ __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
166
+ acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
167
+ }
168
+ float s = _mm512_reduce_add_ps(acc);
169
+ for (; j < L->id; j++) {
170
+ float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
171
+ s += wf * x[j];
172
+ }
173
+ y[i] = s;
174
+ }
175
+ }
176
+
177
+ /* RMSNorm */
178
+ static void rn(const float *x, const float *w, float *y, int d) {
179
+ __m512 sq = _mm512_setzero_ps();
180
+ int i;
181
+ for (i = 0; i+16 <= d; i += 16) {
182
+ __m512 v = _mm512_loadu_ps(x+i);
183
+ sq = _mm512_fmadd_ps(v, v, sq);
184
+ }
185
+ float ss = _mm512_reduce_add_ps(sq);
186
+ for (; i < d; i++) ss += x[i]*x[i];
187
+ float r = 1.0f / sqrtf(ss/d + RMS_EPS);
188
+ __m512 rv = _mm512_set1_ps(r);
189
+ for (i = 0; i+16 <= d; i += 16)
190
+ _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
191
+ _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
192
+ for (; i < d; i++) y[i] = x[i]*r*w[i];
193
+ }
194
+
195
+ static void silu(float *x, int n) {
196
+ for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
197
+ }
198
/* Elementwise product: c = a * b. */
static void emul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    while (k + 16 <= n) {
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
        k += 16;
    }
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
204
/* In-place vector accumulate: y += x. */
static void va(float *y, const float *x, int n) {
    int k = 0;
    while (k + 16 <= n) {
        __m512 sum = _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k));
        _mm512_storeu_ps(y + k, sum);
        k += 16;
    }
    while (k < n) { y[k] += x[k]; k++; }
}
210
+ static void rope(float *v, int pos, int d) {
211
+ for (int i = 0; i < d; i += 2) {
212
+ float f = 1.0f / powf(ROPE_THETA, (float)i/d);
213
+ float a = pos*f, co = cosf(a), si = sinf(a);
214
+ float v0 = v[i], v1 = v[i+1];
215
+ v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
216
+ }
217
+ }
218
+ static void sm(float *x, int n) {
219
+ float mx = x[0];
220
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
221
+ float s = 0;
222
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
223
+ float iv = 1.0f/s;
224
+ for (int i = 0; i < n; i++) x[i] *= iv;
225
+ }
226
/* Embedding lookup: decode token t's FP16 embedding row into o[HIDDEN]
 * as float32.  16 halves per vector step; scalar tail covers HIDDEN % 16. */
static void etok(const M *m, int t, float *o) {
    const uint16_t *r = m->emb + (size_t)t * HIDDEN;
    int i;
    for (i = 0; i+16 <= HIDDEN; i += 16)
        _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
    /* Tail: broadcast one half, convert, store the low lane. */
    for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
}
233
/* Address of the KV-cache vector for (layer l, position p, kv-head h);
 * cache layout is [layer][position][kv_head][HEAD_DIM] floats. */
static float* kvp(float *c, int l, int p, int h) {
    return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
}
236
+
237
/* One attention block at position `pos` for layer `l`.
 * Input: m->h2 holds the RMS-normed hidden state.  Output: m->h2 is
 * overwritten with the o_proj result (the caller adds the residual).
 *
 * Steps: packed-unary Q/K/V projections (+optional biases), RoPE on Q
 * and K, append K/V for this position to the cache, then per-head
 * scaled-dot-product attention over positions 0..pos with grouped-query
 * attention (GQA_RATIO query heads share one KV head). */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* Q/K/V projections from the normed hidden state. */
    pmv(&ly->qp, m->h2, m->sq);
    pmv(&ly->kp, m->h2, m->sk);
    pmv(&ly->vp, m->h2, m->sv);
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* Rotary embedding per head, then cache this position's K and V. */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM);    /* 1/sqrt(d_k) score scale */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        int kvh = h / GQA_RATIO;               /* KV head shared by this query head */
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* Scores q·k for every cached position (SIMD dot + scalar tail). */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* Weighted sum of cached values; near-zero weights are skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* Output projection back to HIDDEN, left in m->h2 for the residual add. */
    pmv(&ly->op, m->ao, m->h2);
}
280
+
281
+ static void do_mlp(M *m, int l) {
282
+ Lay *ly = &m->lay[l];
283
+ pmv(&ly->gp, m->h2, m->sg);
284
+ pmv(&ly->up, m->h2, m->su);
285
+ silu(m->sg, INTER);
286
+ emul(m->sg, m->su, m->sd, INTER);
287
+ pmv(&ly->dp, m->sd, m->h2);
288
+ }
289
+
290
/* Full forward pass for one token id `tid` at sequence position `pos`.
 * Pre-norm residual architecture: each layer adds its attention and MLP
 * outputs back into the residual stream m->h.  Returns a pointer to the
 * model-owned logits buffer (valid until the next call). */
float* forward_token(M *m, int tid, int pos) {
    etok(m, tid, m->h);
    for (int l = 0; l < N_LAYERS; l++) {
        rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
        do_attn(m, l, pos);
        va(m->h, m->h2, HIDDEN);            /* residual add: attention */
        rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
        do_mlp(m, l);
        va(m->h, m->h2, HIDDEN);            /* residual add: MLP */
    }
    rn(m->h, m->fnorm, m->h2, HIDDEN);
    fmv(&m->lmh, m->h2, m->lg);             /* project to vocabulary logits */
    return m->lg;
}
304
+
305
/* Temperature + top-p (nucleus) sampling over logits lg[0..V).
 * Mutates lg in place (temperature scale, then softmax).  The nucleus is
 * built by repeatedly selecting the current max probability until the
 * cumulative mass reaches `tp`, hard-capped at 50 candidates; one token
 * is then drawn from the renormalized nucleus with rand().
 * Returns the chosen token id. */
static int samp(float *lg, int V, float T, float tp) {
    /* Temperature: divide logits by T (multiply by 1/T) before softmax. */
    if (T > 0) { float it = 1.0f/T; for (int i = 0; i < V; i++) lg[i] *= it; }
    sm(lg, V);
    float *pr = (float*)malloc(V*4); int *ix = (int*)malloc(V*4);
    memcpy(pr, lg, V*4);
    for (int i = 0; i < V; i++) ix[i] = i;
    /* Partial selection sort: pull the largest remaining prob into slot nk
     * each round; stops at tp mass, 50 candidates, or vocab exhaustion. */
    float cum = 0; int nk = 0;
    while (cum < tp && nk < V && nk < 50) {
        int b = nk;
        for (int i = nk+1; i < V; i++) if (pr[i] > pr[b]) b = i;
        float t = pr[nk]; pr[nk] = pr[b]; pr[b] = t;
        int ti = ix[nk]; ix[nk] = ix[b]; ix[b] = ti;
        cum += pr[nk]; nk++;
    }
    /* Renormalize over the kept nucleus and draw proportionally. */
    float s = 0; for (int i = 0; i < nk; i++) s += pr[i];
    float r = (float)rand()/RAND_MAX * s, ac = 0;
    int ch = ix[0];
    for (int i = 0; i < nk; i++) { ac += pr[i]; if (ac >= r) { ch = ix[i]; break; } }
    free(pr); free(ix);
    return ch;
}
326
+
327
/* Autoregressive generation.  Feeds the `pl` prompt tokens, then samples
 * up to `mx` new tokens into `out`: greedy argmax when T <= 0, otherwise
 * temperature/top-p via samp().  The eos token, if produced, is stored
 * before stopping.  Returns the number of tokens written to `out`. */
int generate(M *m, const int *pr, int pl, int *out, int mx,
             float T, float tp, int eos) {
    srand(time(NULL));                  /* reseed sampling RNG per call */
    /* Prefill: after this loop m->lg holds the last prompt token's logits. */
    for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);
    int pos = pl, gen = 0;
    for (int t = 0; t < mx; t++) {
        int nx;
        if (T <= 0) {
            nx = 0;                     /* greedy: argmax over the vocabulary */
            for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
        } else {
            nx = samp(m->lg, VOCAB, T, tp);
        }
        out[t] = nx; gen++;
        if (nx == eos) break;
        forward_token(m, nx, pos); pos++;
    }
    return gen;
}
346
+
347
/* Allocate the model shell: KV cache, activation scratch and logits buffer.
 * Weight pointers are attached later via model_set_* / layer_set_*.
 * NOTE(review): allocation results are not checked for NULL. */
M* model_alloc(void) {
    M *m = (M*)calloc(1, sizeof(M));
    /* Floats in ONE of the two KV caches (all layers, positions, kv heads). */
    size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
    m->h = (float*)aligned_alloc(64,HIDDEN*4);       /* residual stream */
    m->h2 = (float*)aligned_alloc(64,HIDDEN*4);      /* normed/projected scratch */
    m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);   /* attention output */
    m->sg = (float*)aligned_alloc(64,INTER*4);       /* MLP gate scratch */
    m->su = (float*)aligned_alloc(64,INTER*4);       /* MLP up scratch */
    m->sd = (float*)aligned_alloc(64,INTER*4);       /* MLP down input */
    m->lg = (float*)aligned_alloc(64,VOCAB*4);       /* output logits */
    m->as = (float*)aligned_alloc(64,MAX_SEQ*4);     /* attention scores */
    m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);   /* final norm weights (copied in) */
    printf("Alloc: KV=%zuMB\n", kv*2*4/1024/1024);   /* k + v caches, 4 bytes/elem */
    return m;
}
366
+
367
/* ---- Loader-facing API: attach weight buffers owned by the caller. ---- */

/* Token embedding table (FP16); pointer is borrowed, not copied. */
void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
/* Final RMSNorm weights; copied into the model's own buffer. */
void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
/* LM head as an FP16 linear of shape [o x i]; pointer is borrowed. */
void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
    m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
}
/* Per-layer input / post-attention RMSNorm weight pointers. */
void layer_set_norms(M *m, int l, float *i, float *p) {
    m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
}
/* Optional Q/K/V bias pointers (pass NULL for bias-free checkpoints). */
void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
    m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
}
/* Fill one packed-linear descriptor; the linear's bias is always NULL here
 * (Q/K/V biases are applied separately via layer_set_bias). */
void set_pl(PL *p, uint8_t *mags, uint64_t *signs, float *scales,
            uint8_t *rmm, int od, int id) {
    p->mags = mags; p->sign_bits = signs; p->scales = scales;
    p->row_maxmag = rmm; p->out_dim = od; p->in_dim = id; p->bias = NULL;
}
/* Attach all seven packed linears of layer l in the fixed order
 * q,k,v,o,gate,up,down; each contributes the 6-tuple
 * (mags, signs, scales, row_maxmag, out_dim, in_dim). */
void layer_set_linears(M *m, int l,
    uint8_t*qm,uint64_t*qs,float*qc,uint8_t*qx,int qo,int qi,
    uint8_t*km,uint64_t*ks,float*kc,uint8_t*kx,int ko,int ki,
    uint8_t*vm,uint64_t*vs,float*vc,uint8_t*vx,int vo,int vi,
    uint8_t*om,uint64_t*os_,float*oc,uint8_t*ox,int oo,int oi,
    uint8_t*gm,uint64_t*gs,float*gc,uint8_t*gx,int go,int gi,
    uint8_t*um,uint64_t*us,float*uc,uint8_t*ux,int uo,int ui,
    uint8_t*dm,uint64_t*ds,float*dc,uint8_t*dx,int doo,int di) {
    set_pl(&m->lay[l].qp,qm,qs,qc,qx,qo,qi);
    set_pl(&m->lay[l].kp,km,ks,kc,kx,ko,ki);
    set_pl(&m->lay[l].vp,vm,vs,vc,vx,vo,vi);
    set_pl(&m->lay[l].op,om,os_,oc,ox,oo,oi);
    set_pl(&m->lay[l].gp,gm,gs,gc,gx,go,gi);
    set_pl(&m->lay[l].up,um,us,uc,ux,uo,ui);
    set_pl(&m->lay[l].dp,dm,ds,dc,dx,doo,di);
}
/* Zero the whole KV cache before a fresh generation. */
void model_reset_cache(M *m) {
    size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    memset(m->kc,0,kv*4); memset(m->vc,0,kv*4);
}
/* Free engine-owned buffers; weight pointers stay owned by the loader. */
void model_free(M *m) {
    free(m->kc);free(m->vc);free(m->h);free(m->h2);
    free(m->sq);free(m->sk);free(m->sv);free(m->ao);
    free(m->sg);free(m->su);free(m->sd);
    free(m->lg);free(m->as);free(m->fnorm);free(m);
}
packed_loader.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Packed unary loader. Loads weights, passes pointers to C engine."""
3
+ import ctypes, os, sys, time, json
4
+ import numpy as np
5
+ from ctypes import c_int, c_float, c_void_p, POINTER, c_uint8, c_uint64
6
+
7
class PackedEngine:
    """ctypes front-end for the packed-unary C inference engine.

    Loads converted weight files from ``model_dir`` and registers raw
    buffer pointers with the shared library.  Two invariants matter:

    * every numpy array handed to C is appended to ``self.arrays`` so the
      buffer outlives the C-side pointer (no GC while inference runs);
    * every pointer crosses the ABI wrapped in ``c_void_p`` — without
      declared ``argtypes``, ctypes marshals a bare Python int as a
      32-bit C int, silently truncating 64-bit heap addresses.
    """

    def __init__(self, model_dir, engine_path="./packed_engine.so"):
        self.lib = ctypes.CDLL(engine_path)
        self.lib.model_alloc.restype = c_void_p
        self.lib.forward_token.restype = POINTER(c_float)
        self.model_dir = model_dir

        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)
        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)

        self.arrays = []  # prevent GC of buffers the C side points into
        # Keep the opaque handle as c_void_p so it round-trips untruncated.
        self.model = c_void_p(self.lib.model_alloc())
        self._load_weights()

    def _keep(self, arr):
        """Pin *arr* for the engine's lifetime; return its base address as c_void_p."""
        self.arrays.append(arr)
        return c_void_p(arr.ctypes.data)

    def _load_file(self, key, ext, dtype):
        """Read the raw binary dump for tensor *key* (dots become underscores)."""
        path = os.path.join(self.model_dir, key.replace(".", "_") + ext)
        return np.fromfile(path, dtype=dtype)

    def _load_weights(self):
        """Load every tensor in the manifest and hand pointers to the C engine."""
        t0 = time.time()
        fp16_keys = self.manifest["fp16"]
        packed_keys = self.manifest["packed"]

        # Embeddings stay FP16; the engine converts rows on lookup.
        emb = self._load_file("model.embed_tokens.weight", ".fp16", np.uint16)
        self.lib.model_set_embed(self.model, self._keep(emb))
        print(f" Embeddings: {emb.nbytes/1e6:.1f} MB")

        # LM head (FP16 matvec on the C side).
        lm = self._load_file("lm_head.weight", ".fp16", np.uint16)
        od, id_ = fp16_keys["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, self._keep(lm), od, id_)
        print(f" LM head: {lm.nbytes/1e6:.1f} MB")

        # Final norm: stored as fp16 on disk, engine expects float32.
        # (Previous revision loaded this file twice, once with the wrong
        # dtype reinterpretation; the dead first load is removed.)
        fn = self._load_file("model.norm.weight", ".fp16", np.float16).astype(np.float32)
        self.lib.model_set_final_norm(self.model, self._keep(fn))

        n_layers = self.config["num_hidden_layers"]
        for l in range(n_layers):
            pfx = f"model.layers.{l}"

            # Per-layer RMSNorm weights (fp16 on disk -> float32).
            in_f = self._load_file(f"{pfx}.input_layernorm.weight", ".fp16",
                                   np.float16).astype(np.float32)
            pn_f = self._load_file(f"{pfx}.post_attention_layernorm.weight", ".fp16",
                                   np.float16).astype(np.float32)
            self.lib.layer_set_norms(self.model, l, self._keep(in_f), self._keep(pn_f))

            # Optional Q/K/V biases (present for Qwen-style checkpoints).
            qb_key = f"{pfx}.self_attn.q_proj.bias"
            if qb_key in fp16_keys:
                qb = self._load_file(qb_key, ".fp16", np.float16).astype(np.float32)
                kb = self._load_file(f"{pfx}.self_attn.k_proj.bias", ".fp16",
                                     np.float16).astype(np.float32)
                vb = self._load_file(f"{pfx}.self_attn.v_proj.bias", ".fp16",
                                     np.float16).astype(np.float32)
                self.lib.layer_set_bias(self.model, l,
                                        self._keep(qb), self._keep(kb), self._keep(vb))
            else:
                self.lib.layer_set_bias(self.model, l, None, None, None)

            # 7 packed linears: q,k,v,o,gate,up,down — each contributes
            # (mags, signs, scales, row_maxmag, out_dim, in_dim) to one call.
            args = []
            for name in ['self_attn.q_proj','self_attn.k_proj','self_attn.v_proj',
                         'self_attn.o_proj','mlp.gate_proj','mlp.up_proj','mlp.down_proj']:
                key = f"{pfx}.{name}.weight"
                od, id_ = packed_keys[key]
                mags = self._load_file(key, ".mags", np.uint8)
                signs = self._load_file(key, ".signs", np.uint64)
                scales = self._load_file(key, ".scales", np.float32)
                rmm = self._load_file(key, ".rmm", np.uint8)
                args.extend([self._keep(mags), self._keep(signs),
                             self._keep(scales), self._keep(rmm), od, id_])

            self.lib.layer_set_linears(self.model, l, *args)

            if (l+1) % 7 == 0 or l == n_layers-1:
                print(f" Loaded {l+1}/{n_layers} layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self.arrays)
        print(f"\nModel loaded in {dt:.1f}s, {total/1e6:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=100, temperature=0.6, top_p=0.9, eos_id=151643):
        """Run C-side generation from *token_ids*.

        Returns ``(tokens, n_generated, seconds)``.  The KV cache is reset
        first, so each call is an independent generation.
        """
        prompt = (c_int * len(token_ids))(*token_ids)
        output = (c_int * max_new_tokens)()
        self.lib.model_reset_cache(self.model)
        t0 = time.time()
        n = self.lib.generate(self.model, prompt, len(token_ids),
                              output, max_new_tokens, c_float(temperature),
                              c_float(top_p), eos_id)
        dt = time.time() - t0
        tokens = [output[i] for i in range(n)]
        return tokens, n, dt
+
115
+
116
+ if __name__ == "__main__":
117
+ from transformers import AutoTokenizer
118
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-packed"
119
+ tok_dir = sys.argv[2] if len(sys.argv) > 2 else "deepseek-r1-1.5b-hf"
120
+
121
+ print("Loading tokenizer...")
122
+ tok = AutoTokenizer.from_pretrained(tok_dir, trust_remote_code=True)
123
+ print("Loading packed unary engine...")
124
+ engine = PackedEngine(model_dir, "./packed_engine.so")
125
+
126
+ prompts = ["What is 2+2?", "Explain gravity in one sentence.", "Write a haiku about snow."]
127
+ for prompt in prompts:
128
+ msgs = [{"role": "user", "content": prompt}]
129
+ ids = tok.apply_chat_template(msgs, add_generation_prompt=True)
130
+ tokens, n, dt = engine.generate(ids, max_new_tokens=100, temperature=0.6)
131
+ text = tok.decode(tokens, skip_special_tokens=False)
132
+ print(f"\n[{prompt}] ({n} tok, {dt:.1f}s, {n/dt:.1f} tok/s)")
133
+ print(text[:300])
134
+ print("---")
proper_unary ADDED
Binary file (26 kB). View file
 
proper_unary.c ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PROPER UNARY — ONE QUANTUM, NO SCALES
3
+ *
4
+ * Every single bit in the entire system has weight = 1 quantum.
5
+ * The quantum is set ONCE for the whole model.
6
+ * There are NO per-vector scales. NO per-row scales.
7
+ *
8
+ * The number 5.0 with quantum=0.1 is stored as 50 ones.
9
+ * The number 5.0 with quantum=0.01 is stored as 500 ones.
10
+ * More precision = more bits. That's the tradeoff.
11
+ *
12
+ * ADDITION = CONCATENATION. Always. No exceptions.
13
+ * Because every bit everywhere means the same thing.
14
+ *
15
+ * MATMUL: y[i] = sum_j W[i][j] * x[j]
16
+ * = sum over all (w_slot, x_slot) pairs:
17
+ * popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
18
+ * - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
19
+ * = quantum² * integer_count
20
+ *
21
+ * Output quantum = input_quantum² (magnitude grows)
22
+ * Or we pick output quantum = input_quantum and accept
23
+ * that the integer count includes the scaling.
24
+ *
25
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
26
+ */
27
+
28
+ #define _POSIX_C_SOURCE 199309L
29
+ #include <immintrin.h>
30
+ #include <omp.h>
31
+ #include <stdint.h>
32
+ #include <stdlib.h>
33
+ #include <string.h>
34
+ #include <math.h>
35
+ #include <stdio.h>
36
+ #include <time.h>
37
+
38
+ /* ============================================================
39
+ * PROPER UNARY VECTOR
40
+ * Every bit = 1 quantum. No local scale.
41
+ * ============================================================ */
42
+ typedef struct {
43
+ uint64_t *sign; /* [chunks] */
44
+ uint64_t *slots; /* [n_slots * chunks] */
45
+ int dim;
46
+ int chunks;
47
+ int n_slots;
48
+ int cap; /* max slots allocated */
49
+ } UVec;
50
+
51
+ /* Proper unary matrix — same quantum as vectors */
52
+ typedef struct {
53
+ uint64_t *sign; /* [rows * chunks] */
54
+ uint64_t *slots; /* [K * rows * chunks] */
55
+ int rows, cols, chunks, K;
56
+ } UMat;
57
+
58
+ /* Global system quantum */
59
+ typedef struct {
60
+ float quantum; /* every bit = this much */
61
+ /* quantum² is the matmul output unit */
62
+ } USystem;
63
+
64
+ /* ============================================================
65
+ * ALLOC
66
+ * ============================================================ */
67
+ UVec* uv_new(int dim, int cap) {
68
+ UVec *v = (UVec *)calloc(1, sizeof(UVec));
69
+ v->dim = dim;
70
+ v->chunks = (dim + 63) / 64;
71
+ v->n_slots = 0;
72
+ v->cap = cap;
73
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
74
+ v->slots = (uint64_t *)aligned_alloc(64, (size_t)cap * v->chunks * sizeof(uint64_t));
75
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
76
+ memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t));
77
+ return v;
78
+ }
79
+
80
+ UMat* um_new(int rows, int cols, int K) {
81
+ UMat *m = (UMat *)calloc(1, sizeof(UMat));
82
+ m->rows = rows; m->cols = cols; m->K = K;
83
+ m->chunks = (cols + 63) / 64;
84
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
85
+ m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
86
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
87
+ memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
88
+ return m;
89
+ }
90
+
91
/* Destructors: free bit planes then the struct itself; NULL-safe. */
void uv_free(UVec *v) { if(v){free(v->sign);free(v->slots);free(v);} }
void um_free(UMat *m) { if(m){free(m->sign);free(m->slots);free(m);} }
93
+
94
+ /* ============================================================
95
+ * QUANTIZE: float → proper unary
96
+ *
97
+ * Given global quantum q:
98
+ * magnitude = round(|value| / q)
99
+ * That many slots get bit set.
100
+ *
101
+ * NO per-vector absmax. NO local scale.
102
+ * Values that exceed K are clipped.
103
+ * ============================================================ */
104
/* Encode floats into proper unary under the single global `quantum`.
 * For element i: sign bit set when x[i] < 0; magnitude is
 * round(|x[i]| / quantum) clipped to K; that many slot planes get bit i
 * set.  No per-vector absmax, no local scale. */
void uv_from_float(UVec *v, const float *x, int K, float quantum) {
    int dim = v->dim, chunks = v->chunks;
    v->n_slots = K;

    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int i = 0; i < dim; i++) {
        int c = i / 64;                    /* 64-bit chunk holding bit i */
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) v->sign[c] |= bit;

        /* round-to-nearest in quantum units, saturate at K slots */
        int mag = (int)(fabsf(x[i]) * inv_q + 0.5f);
        if (mag > K) mag = K; /* clip */
        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}
124
+
125
+ void uv_to_float(const UVec *v, float *out, float quantum) {
126
+ int dim = v->dim, chunks = v->chunks;
127
+
128
+ for (int i = 0; i < dim; i++) {
129
+ int c = i / 64;
130
+ uint64_t bit = 1ULL << (i % 64);
131
+
132
+ int mag = 0;
133
+ for (int s = 0; s < v->n_slots; s++)
134
+ if (v->slots[(size_t)s * chunks + c] & bit)
135
+ mag++;
136
+
137
+ out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum;
138
+ }
139
+ }
140
+
141
/* Encode a row-major float matrix into proper unary with the same global
 * `quantum` as vectors.  Slot planes are laid out [slot][row][chunk] so
 * the matmul can walk one plane of one row contiguously.  Magnitudes are
 * rounded to the nearest quantum and clipped to K. */
void um_from_float(UMat *m, const float *data, float quantum) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        uint64_t *rs = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;                 /* chunk holding column j's bit */
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* round-to-nearest in quantum units, saturate at K */
            int mag = (int)(fabsf(row[j]) * inv_q + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
164
+
165
+ /* ============================================================
166
+ * CONCATENATION = ADDITION
167
+ *
168
+ * Since every bit everywhere = same quantum,
169
+ * appending slots IS adding magnitudes. Period.
170
+ *
171
+ * Sign handling: for elements where signs differ,
172
+ * cancel bits from existing slots.
173
+ * ============================================================ */
174
/* dst += src, expressed as slot concatenation.  Valid because every bit in
 * the system carries the same quantum: appending a source slot plane IS
 * adding one quantum to every element whose bit it carries.
 *
 * Sign handling per element: where dst and src signs agree the source bit
 * is appended into a fresh slot; where they differ, the source bit cancels
 * a matching bit out of dst's existing slots (searched newest-first), and
 * any remainder flips dst's sign bit and is appended.
 *
 * On slot-capacity overflow the remaining source slots are dropped with a
 * diagnostic print (NOTE(review): silent truncation of the sum). */
void uv_concat(UVec *dst, const UVec *src) {
    int chunks = dst->chunks;

    for (int s = 0; s < src->n_slots; s++) {
        if (dst->n_slots >= dst->cap) {
            printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
            return;
        }

        const uint64_t *src_slot = src->slots + (size_t)s * chunks;
        uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t sb = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: straight append */
            uint64_t add = sb & agree;

            /* Different sign: cancel from existing slots, newest first */
            uint64_t cancel = sb & disagree;
            for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
                uint64_t *ds = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *ds & cancel;
                *ds &= ~overlap;
                cancel &= ~overlap;
            }
            /* Leftover cancel = src magnitude exceeds dst, flip sign */
            if (cancel) {
                dst->sign[c] ^= cancel;
                add |= cancel;
            }

            new_slot[c] = add;
        }

        /* Only commit the new slot if it actually carries bits. */
        int any = 0;
        for (int c = 0; c < chunks && !any; c++)
            if (new_slot[c]) any = 1;
        if (any) dst->n_slots++;
    }
}
219
+ /* ============================================================
220
+ * MATMUL: y = M @ x
221
+ *
222
+ * Output unit = quantum² (one quantum from weight × one from activation)
223
+ * The integer accumulator directly gives the value in units of quantum².
224
+ *
225
+ * To keep everything in the same quantum system:
226
+ * y_float[i] = acc * quantum²
227
+ * Then requantize to unary with the SAME global quantum.
228
+ * y_mag[i] = acc * quantum² / quantum = acc * quantum
229
+ *
230
+ * ============================================================ */
231
/* y = M @ x entirely in the shared quantum system.
 *
 * Each (weight plane, activation plane) pair contributes
 * popcount(w AND x AND same_sign) - popcount(w AND x AND diff_sign)
 * to an integer accumulator per output row; the raw count is in units of
 * quantum^2.  The result is rescaled back into single-quantum units
 * (val = count * quantum^2 / quantum = count * quantum), clipped to K_out
 * slots, and re-encoded as unary.  Cost is O(wK * xK * chunks) per row. */
void uv_matmul(
    const UMat *M, const UVec *x,
    UVec *y, int K_out, float quantum
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    float q2 = quantum * quantum;   /* NOTE(review): computed but unused below */

    y->n_slots = K_out;
    memset(y->sign, 0, y->chunks * sizeof(uint64_t));
    memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));

    /* Integer dot products, one row per thread chunk. */
    int *acc = (int *)aligned_alloc(64, out_dim * sizeof(int));
    uint8_t *neg = (uint8_t *)calloc(out_dim, 1);

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long a = 0;

        for (int c = 0; c < chunks; c++) {
            /* Per-chunk sign agreement masks between row i and x. */
            uint64_t same = ~(w_sign_row[c] ^ x->sign[c]);
            uint64_t diff = w_sign_row[c] ^ x->sign[c];

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    a += __builtin_popcountll(active & same)
                       - __builtin_popcountll(active & diff);
                }
            }
        }

        /* a is in units of quantum² per quantum = a * quantum gives magnitude in quantums */
        float val = (float)a * quantum;
        int mag = (int)(fabsf(val) + 0.5f);   /* round, then saturate at K_out */
        if (mag > K_out) mag = K_out;
        acc[i] = mag;
        neg[i] = (val < 0.0f) ? 1 : 0;
    }

    /* Encode directly to unary — no float intermediate */
    for (int i = 0; i < out_dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);
        if (neg[i]) y->sign[c] |= bit;
        for (int s = 0; s < acc[i]; s++)
            y->slots[(size_t)s * y->chunks + c] |= bit;
    }

    free(acc); free(neg);
}
289
+
290
+ /* ============================================================
291
+ * RMSNORM — resets slot count, keeps same quantum
292
+ * ============================================================ */
293
+ void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
294
+ int dim = x->dim;
295
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
296
+ uv_to_float(x, xf, quantum);
297
+
298
+ float ss = 0.0f;
299
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
300
+ float rms = 1.0f / sqrtf(ss / dim + eps);
301
+ for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];
302
+
303
+ uv_from_float(out, xf, K_out, quantum);
304
+ free(xf);
305
+ }
306
+
307
+ /* ============================================================
308
+ * TESTS
309
+ * ============================================================ */
310
+
311
/* Verifies that uv_concat on two vectors sharing one global quantum acts
 * as exact addition: decoding the concatenated vector must match the
 * element-wise float sum A+B up to quantization error. */
void test_concat_correct() {
    printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

    float quantum = 0.25f; /* every bit = 0.25 */
    int dim = 8;

    /* A = [3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0]
     * In quantum=0.25: magnitudes = [12, 8, 20, 4, 0, 16, 8, 28]
     * Need K >= 28 slots to hold 7.0
     */
    float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
    float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
    float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};

    int K = 32;
    /* Capacity 128 slots: concat grows the slot count, so leave headroom. */
    UVec *a = uv_new(dim, 128);
    UVec *b = uv_new(dim, 128);

    uv_from_float(a, a_vals, K, quantum);
    uv_from_float(b, b_vals, K, quantum);

    /* Round-trip both inputs to show encoding error before the concat. */
    float a_rec[8], b_rec[8];
    uv_to_float(a, a_rec, quantum);
    uv_to_float(b, b_rec, quantum);

    printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum);
    printf("A original: "); for(int i=0;i<8;i++) printf("%6.2f ",a_vals[i]); printf("\n");
    printf("A unary: "); for(int i=0;i<8;i++) printf("%6.2f ",a_rec[i]); printf("\n");
    printf("B original: "); for(int i=0;i<8;i++) printf("%6.2f ",b_vals[i]); printf("\n");
    printf("B unary: "); for(int i=0;i<8;i++) printf("%6.2f ",b_rec[i]); printf("\n\n");

    printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots);
    uv_concat(a, b);
    printf("After concat: %d slots\n\n", a->n_slots);

    float result[8];
    uv_to_float(a, result, quantum);

    printf("Expected A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]); printf("\n");
    printf("Concat A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",result[i]); printf("\n");
    printf("Error: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]-result[i]); printf("\n");

    uv_free(a); uv_free(b);
}
355
+
356
/* Repeatedly concats the same delta vector onto an accumulator and
 * tracks drift against a float reference: the max error should stay at
 * quantization level across all 5 additions. */
void test_chain_concat() {
    printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

    float quantum = 0.1f;
    int dim = 4;
    int K = 64;

    float vals[] = {1.0, -2.0, 3.0, -0.5};
    /* 512-slot capacity: every concat can grow the accumulator. */
    UVec *acc = uv_new(dim, 512);
    uv_from_float(acc, vals, K, quantum);

    printf("Start: ");
    float tmp[4];
    uv_to_float(acc, tmp, quantum);
    for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
    printf(" (%d slots)\n", acc->n_slots);

    float expected[] = {1.0, -2.0, 3.0, -0.5};

    for (int step = 0; step < 5; step++) {
        float add_vals[] = {0.5, 0.3, -1.0, 0.7};
        UVec *delta = uv_new(dim, K);
        uv_from_float(delta, add_vals, K, quantum);

        uv_concat(acc, delta);

        /* Float reference accumulates the same delta. */
        for (int i = 0; i < 4; i++) expected[i] += add_vals[i];

        uv_to_float(acc, tmp, quantum);
        printf(" +[0.5,0.3,-1.0,0.7] = ");
        for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
        printf(" (%d slots) expect:", acc->n_slots);
        for(int i=0;i<4;i++) printf("%6.2f ",expected[i]);

        /* Check error */
        float max_err = 0;
        for(int i=0;i<4;i++) {
            float e = fabsf(expected[i] - tmp[i]);
            if (e > max_err) max_err = e;
        }
        printf(" err=%.2f\n", max_err);

        uv_free(delta);
    }

    uv_free(acc);
}
403
+
404
/* Benchmarks and validates uv_matmul against a float reference matvec:
 * builds a random 512x256 matrix and input, derives a global quantum
 * from the data absmax, then reports cosine similarity, SNR in dB, and
 * wall-clock time of the unary result vs. the float reference. */
void test_matmul() {
    printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

    int rows = 512, cols = 256;
    int wK = 32, xK = 32;

    srand(42); /* fixed seed: deterministic data */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));

    /* Small values so magnitudes fit in K slots */
    for (size_t i = 0; i < (size_t)rows * cols; i++)
        Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    for (int i = 0; i < cols; i++)
        xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    /* Float ground truth: y_ref = Mf @ xf */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    /* Find quantum that fits the data range */
    float data_max = 0;
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float a = fabsf(Mf[i]);
        if (a > data_max) data_max = a;
    }
    for (int i = 0; i < cols; i++) {
        float a = fabsf(xf[i]);
        if (a > data_max) data_max = a;
    }
    float quantum = data_max / wK; /* wK slots span the full absmax */

    printf("Data range: [-%.2f, %.2f]\n", data_max, data_max);
    printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK);
    printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK);

    UMat *M = um_new(rows, cols, wK);
    UVec *x = uv_new(cols, xK);

    um_from_float(M, Mf, quantum);
    uv_from_float(x, xf, xK, quantum);

    /* Output needs enough K for the matmul result range */
    float ymax = 0;
    for (int i = 0; i < rows; i++) {
        float a = fabsf(y_ref[i]);
        if (a > ymax) ymax = a;
    }
    int K_out = (int)(ymax / quantum + 1);
    if (K_out > 4096) K_out = 4096; /* cap the output slot budget */
    printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out);

    UVec *y = uv_new(rows, K_out);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    uv_matmul(M, x, y, K_out, quantum);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

    float *yf = (float *)malloc(rows * sizeof(float));
    uv_to_float(y, yf, quantum);

    /* Quality metrics: cosine similarity and signal-to-noise ratio. */
    float dot = 0, na = 0, nb = 0, noise = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * yf[i];
        na += y_ref[i] * y_ref[i];
        nb += yf[i] * yf[i];
        float e = y_ref[i] - yf[i]; noise += e * e;
    }
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf("\nCosine: %.6f\n", cosine);
    printf("SNR: %.1f dB\n", snr);
    printf("Time: %.1f ms\n", ms);

    printf("\nFirst 10 values:\n");
    printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
    for (int i = 0; i < 10; i++)
        printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]);

    um_free(M); uv_free(x); uv_free(y);
    free(Mf); free(xf); free(y_ref); free(yf);
}
489
+
490
/* Simulates a transformer residual stream: start from a random
 * embedding, concat 10 random sublayer deltas, and at each step compare
 * the growing unary accumulator against an exact float reference
 * (cosine similarity and max absolute error per step). */
void test_residual_chain() {
    printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

    float quantum = 0.05f;
    int dim = 1024;
    int K = 128; /* fits values up to 6.4 */

    srand(123); /* fixed seed: deterministic data */
    float *embed = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < dim; i++)
        embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

    /* Float reference: accumulate residuals */
    float *ref = (float *)malloc(dim * sizeof(float));
    memcpy(ref, embed, dim * sizeof(float));

    /* Unary: grow via concat */
    int total_cap = K + 10 * K; /* room for 10 concat operations */
    UVec *residual = uv_new(dim, total_cap);
    uv_from_float(residual, embed, K, quantum);

    printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim);
    printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

    for (int step = 0; step < 10; step++) {
        /* Random sublayer output, smaller than the embedding range. */
        float *delta = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++)
            delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

        /* Float reference */
        for (int i = 0; i < dim; i++) ref[i] += delta[i];

        /* Unary: concat */
        UVec *d = uv_new(dim, K);
        uv_from_float(d, delta, K, quantum);
        uv_concat(residual, d);

        /* Compare */
        float *rec = (float *)malloc(dim * sizeof(float));
        uv_to_float(residual, rec, quantum);

        float dot = 0, na = 0, nb = 0, max_err = 0;
        for (int i = 0; i < dim; i++) {
            dot += ref[i] * rec[i];
            na += ref[i] * ref[i];
            nb += rec[i] * rec[i];
            float e = fabsf(ref[i] - rec[i]);
            if (e > max_err) max_err = e;
        }
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

        printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err);

        uv_free(d); free(delta); free(rec);
    }

    uv_free(residual);
    free(embed); free(ref);
}
549
+
550
+ int main() {
551
+ printf("================================================\n");
552
+ printf(" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n");
553
+ printf(" Every bit = 1 quantum. Concat = Add.\n");
554
+ printf("================================================\n\n");
555
+
556
+ test_concat_correct();
557
+ test_chain_concat();
558
+ test_matmul();
559
+ test_residual_chain();
560
+
561
+ printf("\n=== DONE ===\n");
562
+ return 0;
563
+ }
pure_unary_engine.c ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PURE UNARY TRANSFORMER ENGINE
3
+ *
4
+ * ALL matrix multiplications use base-1 arithmetic:
5
+ * - Weights: unary encoded (sign + N magnitude planes)
6
+ * - Activations: unary encoded (sign + M magnitude planes)
7
+ * - Matmul = bitwise AND + popcount across plane pairs
8
+ * - Float only used for: RMSNorm, SiLU, Softmax, rescale, residual add
9
+ * - These are all O(dim) not O(dim²), so don't dominate
10
+ *
11
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
12
+ */
13
+
14
+ #include <immintrin.h>
15
+ #include <omp.h>
16
+ #include <stdint.h>
17
+ #include <stdlib.h>
18
+ #include <string.h>
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+ #include <time.h>
22
+
23
+ #define MAX_SEQ 4096
24
+ #define RMS_EPS 1e-6f
25
+
26
/* ============================================================
 * Unary vector: a quantized 1D activation or intermediate
 * ============================================================ */
typedef struct {
    uint64_t *sign; /* [chunks] */
    uint64_t *planes; /* [n_planes][chunks] */
    float scale; /* value of one plane bit (absmax / n_planes) */
    int dim; /* logical element count */
    int chunks; /* = (dim + 63) / 64 packed words per plane */
    int n_planes; /* thermometer magnitude levels */
} UnaryVec;

/* ============================================================
 * Config
 * ============================================================ */
typedef struct {
    int hidden; /* residual-stream width */
    int inter; /* MLP intermediate width */
    int n_heads; /* query heads */
    int n_kv_heads; /* key/value heads (GQA when < n_heads) */
    int head_dim; /* per-head dimension */
    int n_layers; /* transformer layer count */
    int vocab; /* vocabulary size */
    float rope_theta; /* RoPE base frequency */
    int tie_embeddings; /* nonzero: lm_head shares the embed table */
    int w_planes; /* weight quantization planes */
    int a_planes; /* activation quantization planes */
} Config;

/* Unary weight matrix */
typedef struct {
    uint64_t *sign_bits; /* [out_dim][chunks] per-row sign plane */
    uint64_t *mag_planes; /* [n_planes][out_dim][chunks] magnitudes */
    float *scales; /* [out_dim] per-row dequantization scale */
    int out_dim;
    int in_dim;
    int n_planes;
    int chunks; /* = (in_dim + 63) / 64 */
} UnaryWeight;

/* Transformer layer */
typedef struct {
    UnaryWeight q_proj, k_proj, v_proj, o_proj;
    UnaryWeight gate_proj, up_proj, down_proj;
    float *input_norm; /* pre-attention RMSNorm weights */
    float *post_norm; /* pre-MLP RMSNorm weights */
    float *q_norm, *k_norm; /* optional per-head QK-Norm weights (may be NULL) */
} Layer;

/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed; /* FP16 embedding table, [vocab][hidden] */
    Layer *layers;
    float *final_norm; /* final RMSNorm weights */

    /* KV cache (float - only O(seq × heads × dim) not O(dim²)) */
    float *k_cache;
    float *v_cache;

    /* Scratch - float buffers for non-matmul ops */
    float *hidden; /* residual stream */
    float *normed; /* after RMSNorm, before quantization */
    float *q_float;
    float *k_float;
    float *v_float;
    float *attn_out;
    float *gate_float;
    float *up_float;
    float *mlp_act; /* gate*up result before quantization */
    float *logits;
    float *attn_scores;

    /* Scratch - unary vectors for matmul inputs */
    UnaryVec uv_normed;
    UnaryVec uv_mlp_in;
    UnaryVec uv_mlp_act; /* for down_proj input */

    /* Output integer accumulators (avoid malloc per call) */
    int *acc_buf;
} Model;
107
+
108
+ /* ============================================================
109
+ * ACTIVATION QUANTIZATION: float -> unary
110
+ * Runs per-vector: one scale for entire vector
111
+ * O(dim) operation, not in the hot path
112
+ * ============================================================ */
113
/* Quantize a float vector to sign bits + thermometer magnitude planes.
 *
 * A single scale derived from the vector's absolute maximum covers all
 * elements; each magnitude is rounded to a plane count in [0, n_planes]
 * and plane p receives the element's bit iff p < count.  Outputs are
 * fully cleared first.  O(dim) — not in the matmul hot path. */
static void quantize_to_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    const int chunks = (dim + 63) / 64;

    /* Per-vector absolute maximum; 1.0 fallback keeps the scale finite
     * for an all-zero input. */
    float peak = 0.0f;
    for (int e = 0; e < dim; e++) {
        const float a = fabsf(x[e]);
        if (a > peak) peak = a;
    }
    if (peak == 0.0f) peak = 1.0f;
    *scale_out = peak / n_planes;

    memset(sign_out, 0, chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

    const float to_planes = n_planes / peak;
    for (int e = 0; e < dim; e++) {
        const int word = e / 64;
        const uint64_t one = 1ULL << (e % 64);

        /* Sign plane: bit set for negative elements. */
        if (x[e] < 0.0f)
            sign_out[word] |= one;

        /* Thermometer encode: round, clamp, set planes 0..count-1. */
        int count = (int)(fabsf(x[e]) * to_planes + 0.5f);
        if (count > n_planes) count = n_planes;
        for (int p = 0; p < count; p++)
            planes_out[(size_t)p * chunks + word] |= one;
    }
}
150
+
151
+ /* ============================================================
152
+ * PURE UNARY MATVEC: y = W @ x
153
+ *
154
+ * Both W and x are unary encoded.
155
+ * Inner loop is purely: AND + popcount
156
+ * Float multiply happens ONCE per output element (rescale)
157
+ * ============================================================ */
158
/* y = W @ x with both operands unary encoded.
 *
 * The inner loop is pure AND + popcount over every (weight plane,
 * activation plane) pair; sign agreement between the row's sign bits
 * and x's sign bits decides whether a pair adds or subtracts.  Exactly
 * one float multiply per output element rescales the integer count by
 * the row scale and the activation scale.  Rows are parallelized with
 * OpenMP (dynamic, 32-row chunks).
 *
 * Parameters:
 *   W                   unary weight matrix (row sign bits + mag planes)
 *   x_sign, x_planes    unary-encoded input (sign plane + magnitude planes)
 *   x_scale, x_n_planes input's dequant scale and plane count
 *   y_out               float output, length W->out_dim
 *   acc_buf             scratch for integer accumulators
 *                       NOTE(review): currently unused in this path —
 *                       the accumulator lives in a local; confirm intent.
 */
static void pure_unary_matvec(
    const UnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out, /* float output for non-matmul ops */
    int *acc_buf /* scratch for integer accumulators */
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;

        /* Precompute same_sign mask for this row vs input */
        /* same_sign[c] = ~(w_sign[c] ^ x_sign[c]) */
        /* We compute this per-chunk inside the loop to avoid allocation */

        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            uint64_t same = ~(ws ^ xs); /* bits where signs agree */
            uint64_t diff = ws ^ xs;    /* bits where signs differ */

            for (int p = 0; p < wp; p++) {
                uint64_t w_mag = W->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;

                    /* Count positive and negative contributions */
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    acc += __builtin_popcountll(pos) - __builtin_popcountll(neg);
                }
            }
        }

        /* Single float rescale per output element */
        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
205
+
206
+ /* ============================================================
207
+ * FP16 embedding lookup (only used for embed/lm_head)
208
+ * ============================================================ */
209
/* Copy embedding row `token_id` from the FP16 table into `out` as
 * float32.  Main loop converts 16 halves at a time via AVX-512
 * (_mm512_cvtph_ps); the scalar tail broadcasts one half and converts
 * lane 0.  `hidden` is the row width in elements. */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    /* Scalar tail for hidden not a multiple of 16. */
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
223
+
224
+ /* FP16 matvec for lm_head (vocab is huge, keep as FP16) */
225
/* y = w @ x where w is an FP16 [out_dim][in_dim] matrix and x is float.
 * Used only for the lm_head / vocab projection.  Each row accumulates
 * 16-wide FP16→FP32 FMA via AVX-512, reduces the vector register, then
 * finishes any remainder scalar-wise.  Rows parallelized with OpenMP. */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Scalar tail: convert one half at a time through lane 0. */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
247
+
248
+ /* ============================================================
249
+ * O(dim) operations - float is fine here, not the bottleneck
250
+ * ============================================================ */
251
+ static void rmsnorm(const float *x, const float *w, float *y, int dim) {
252
+ float ss = 0.0f;
253
+ for (int i = 0; i < dim; i++) ss += x[i] * x[i];
254
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
255
+ for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
256
+ }
257
+
258
/* Per-head RMSNorm used by QK-Norm: identical math to rmsnorm, applied
 * to one head's `dim`-long slice with that head's shared weights. */
static void rmsnorm_head(const float *x, const float *w, float *y, int dim) {
    /* RMSNorm for a single attention head */
    rmsnorm(x, w, y, dim);
}
262
+
263
+ static void silu_mul(const float *gate, const float *up, float *out, int n) {
264
+ for (int i = 0; i < n; i++)
265
+ out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
266
+ }
267
+
268
/* In-place accumulate: y += x, element-wise over n floats. */
static void vec_add(float *y, const float *x, int n) {
    for (int k = 0; k < n; k++) {
        y[k] = y[k] + x[k];
    }
}
271
+
272
+ static void apply_rope(float *vec, int pos, int dim, float theta) {
273
+ for (int i = 0; i < dim; i += 2) {
274
+ float freq = 1.0f / powf(theta, (float)i / dim);
275
+ float angle = pos * freq;
276
+ float c = cosf(angle), s = sinf(angle);
277
+ float v0 = vec[i], v1 = vec[i + 1];
278
+ vec[i] = v0 * c - v1 * s;
279
+ vec[i + 1] = v0 * s + v1 * c;
280
+ }
281
+ }
282
+
283
+ static void softmax(float *x, int n) {
284
+ float mx = x[0];
285
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
286
+ float sum = 0.0f;
287
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
288
+ float inv = 1.0f / sum;
289
+ for (int i = 0; i < n; i++) x[i] *= inv;
290
+ }
291
+
292
+ /* KV cache access */
293
/* Pointer to the head_dim-long K or V row for (layer, pos, kv_head).
 * Cache layout is [layer][pos][kv_head][head_dim] with pos bounded by
 * MAX_SEQ; the same helper indexes both k_cache and v_cache. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
297
+
298
+ /* ============================================================
299
+ * ALLOC unary vector scratch
300
+ * ============================================================ */
301
+ static void uv_alloc(UnaryVec *uv, int dim, int n_planes) {
302
+ int chunks = (dim + 63) / 64;
303
+ uv->dim = dim;
304
+ uv->chunks = chunks;
305
+ uv->n_planes = n_planes;
306
+ uv->sign = (uint64_t *)aligned_alloc(64, chunks * sizeof(uint64_t));
307
+ uv->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * chunks * sizeof(uint64_t));
308
+ uv->scale = 0.0f;
309
+ }
310
+
311
+ /* ============================================================
312
+ * ATTENTION (using pure unary for projections)
313
+ * ============================================================ */
314
/* One attention sublayer at sequence position `pos`.
 *
 * Input: m->normed (the RMSNorm'd residual).  Output: m->attn_out
 * (the O-projection result, length c->hidden).  Q/K/V and O use pure
 * unary matvecs; scores and the weighted V sum stay in float because
 * they are O(seq × head_dim), not O(dim²).  m->normed is clobbered as a
 * temporary for the O projection. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads; /* GQA group size */

    /* Quantize normed hidden to unary */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
                      m->uv_normed.sign, m->uv_normed.planes, &m->uv_normed.scale);

    /* Q, K, V projections - PURE UNARY */
    pure_unary_matvec(&layer->q_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->q_float, m->acc_buf);
    pure_unary_matvec(&layer->k_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->k_float, m->acc_buf);
    pure_unary_matvec(&layer->v_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->v_float, m->acc_buf);

    /* QK-Norm (per head) — optional, skipped when the weights are NULL */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm_head(m->q_float + h * c->head_dim, layer->q_norm,
                         m->q_float + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm_head(m->k_float + h * c->head_dim, layer->k_norm,
                         m->k_float + h * c->head_dim, c->head_dim);
    }

    /* RoPE — applied after QK-Norm, before caching K */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Store K, V to cache */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k_float + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v_float + h * c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention scores + weighted sum (O(seq × head_dim), not O(dim²)) */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv; /* GQA: map query head to its KV head */
        float *q_head = m->q_float + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        /* Causal: only positions 0..pos are visible. */
        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++)
                dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue; /* skip negligible weights */
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++)
                out_head[d] += w * v_cached[d];
        }
    }

    /* O projection - quantize attn_out, then pure unary.
     * NOTE(review): uv_attn is heap-allocated per call — a candidate for
     * preallocated scratch like uv_normed. */
    int o_in = c->n_heads * c->head_dim;
    UnaryVec uv_attn;
    uv_alloc(&uv_attn, o_in, c->a_planes);
    quantize_to_unary(m->attn_out, o_in, c->a_planes,
                      uv_attn.sign, uv_attn.planes, &uv_attn.scale);

    /* Temp buffer for O projection output */
    float *o_out = m->normed; /* reuse normed buffer */
    pure_unary_matvec(&layer->o_proj,
        uv_attn.sign, uv_attn.planes, uv_attn.scale, c->a_planes,
        o_out, m->acc_buf);

    /* Copy o_out to where caller expects it (normed acts as temp) */
    memcpy(m->attn_out, o_out, c->hidden * sizeof(float));

    free(uv_attn.sign);
    free(uv_attn.planes);
}
407
+
408
+ /* ============================================================
409
+ * MLP (using pure unary for all projections)
410
+ * ============================================================ */
411
/* One gated-MLP sublayer: down(SiLU(gate(x)) * up(x)).
 *
 * Input: m->normed (the post-attention RMSNorm'd residual).
 * Output: m->normed is overwritten with the down-projection result,
 * which the caller adds to the residual stream.  All three projections
 * run in pure unary; only SiLU*up is float (O(inter)). */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];

    /* Quantize normed input */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
                      m->uv_mlp_in.sign, m->uv_mlp_in.planes, &m->uv_mlp_in.scale);

    /* Gate and Up projections - PURE UNARY */
    pure_unary_matvec(&layer->gate_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->gate_float, m->acc_buf);
    pure_unary_matvec(&layer->up_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->up_float, m->acc_buf);

    /* SiLU(gate) * up - O(inter) float op */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Quantize for down projection */
    quantize_to_unary(m->mlp_act, c->inter, c->a_planes,
                      m->uv_mlp_act.sign, m->uv_mlp_act.planes, &m->uv_mlp_act.scale);

    /* Down projection - PURE UNARY */
    pure_unary_matvec(&layer->down_proj,
        m->uv_mlp_act.sign, m->uv_mlp_act.planes, m->uv_mlp_act.scale, c->a_planes,
        m->normed, m->acc_buf); /* reuse normed as output */
}
439
+
440
+ /* ============================================================
441
+ * FORWARD ONE TOKEN
442
+ * ============================================================ */
443
/* Run one token through the full stack at sequence position `pos` and
 * return a pointer to m->logits (length c->vocab).
 *
 * NOTE(review): logits are only recomputed when cfg.tie_embeddings is
 * set (lm_head shares the embed table); with untied embeddings this
 * returns whatever m->logits last held — confirm whether a separate
 * lm_head path is planned. */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Pre-attention norm */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);

        /* Attention (quantizes normed internally, outputs to attn_out) */
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);

        /* Post-attention norm */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);

        /* MLP (quantizes normed internally, outputs to normed) */
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    /* Final norm */
    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    /* LM head - FP16 for now (vocab projection is O(vocab × hidden), not repeated per-layer) */
    if (c->tie_embeddings) {
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);
    }

    return m->logits;
}
474
+
475
+ /* ============================================================
476
+ * SAMPLING
477
+ * ============================================================ */
478
/* Nucleus (top-p) sampling with a hard top-40 cap.
 *
 * Mutates `logits` in place (temperature scaling + softmax), then runs
 * a partial selection sort that peels off the most probable tokens
 * until their cumulative mass reaches top_p (or 40 tokens, whichever
 * comes first), renormalizes over the kept set, and draws one token
 * with rand().  Returns the chosen token id. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    int n_keep = 0;
    float cum = 0.0f;
    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: after each pass probs[0..n_keep-1] holds
     * the n_keep largest probabilities in descending order. */
    while (cum < top_p && n_keep < vocab) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp;
        int ti = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = ti;
        cum += probs[n_keep];
        n_keep++;
        if (n_keep >= 40) break; /* top-k safety cap */
    }

    /* Draw from the kept set, renormalized to its own mass. */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0]; /* fallback: most probable token */
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i];
        if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
515
+
516
/* Prefill the prompt, then autoregressively sample up to
 * max_new_tokens tokens into out_tokens.  temperature <= 0 selects
 * greedy argmax; otherwise sample_top_p is used.  Generation stops
 * after emitting eos_token.  Returns the number of tokens written
 * (the eos token, if hit, is included in the count).
 *
 * NOTE(review): srand(time(NULL)) reseeds on every call; the sampling
 * loop also reads m->logits before the first generated forward, so a
 * zero-length prompt would sample from stale logits — confirm callers
 * always pass prompt_len >= 1. */
int generate(
    Model *m,
    const int *prompt_ids, int prompt_len,
    int *out_tokens, int max_new_tokens,
    float temperature, float top_p, int eos_token
) {
    srand(time(NULL));

    /* Prefill: run every prompt token to fill the KV cache. */
    for (int i = 0; i < prompt_len; i++)
        forward_token(m, prompt_ids[i], i);

    int pos = prompt_len;
    int generated = 0;

    for (int t = 0; t < max_new_tokens; t++) {
        int next;
        if (temperature <= 0) {
            /* Greedy: argmax over the vocabulary. */
            next = 0;
            for (int i = 1; i < m->cfg.vocab; i++)
                if (m->logits[i] > m->logits[next]) next = i;
        } else {
            next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
        }

        out_tokens[t] = next;
        generated++;
        if (next == eos_token) break;

        /* Feed the sampled token back to produce the next logits. */
        forward_token(m, next, pos);
        pos++;
    }
    return generated;
}
549
+
550
+ /* ============================================================
551
+ * ALLOCATION
552
+ * ============================================================ */
553
/* Allocate a Model plus all per-token scratch buffers and the KV cache.
 *
 * w_planes / a_planes: number of unary bit-planes used for weights and
 * activations respectively (stored in the config; the matvec kernels read
 * them from there).  All other parameters mirror the HF config fields.
 *
 * Scratch buffers are 64-byte aligned for SIMD use.  NOTE(review):
 * aligned_alloc requires the size to be a multiple of the alignment (C11);
 * that holds for the dimensions this engine is used with, but is not
 * checked here.  Allocation failures are also not checked.
 * Returns the model; weights are attached later via the setter functions.
 */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* KV cache is float, one slab per layer: MAX_SEQ x n_kv_heads x head_dim. */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* Per-token scratch.  normed/acc_buf are sized to the larger of the two
     * dimensions they serve so they can be shared across stages. */
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->normed = (float *)aligned_alloc(64, (inter > hidden ? inter : hidden) * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->acc_buf = (int *)aligned_alloc(64, (inter > vocab ? inter : vocab) * sizeof(int));

    /* Unary vector scratch */
    uv_alloc(&m->uv_normed, hidden, a_planes);
    uv_alloc(&m->uv_mlp_in, hidden, a_planes);
    uv_alloc(&m->uv_mlp_act, inter, a_planes);

    size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);
    printf("PURE UNARY ENGINE\n");
    printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf(" Weight planes: %d, Activation planes: %d\n", w_planes, a_planes);
    printf(" Plane pairs per matvec element: %d\n", w_planes * a_planes);
    printf(" KV cache: %zu MB\n", kv_mb);
    printf(" Float ops: RMSNorm, SiLU, Softmax, RoPE, residual (all O(dim))\n");
    printf(" Integer ops: ALL matmuls (O(dim²) — the actual bottleneck)\n");

    return m;
}
605
+
606
+ /* Weight setters (same interface as v2) */
607
/* Borrow (not copy) the FP16 embedding table; presumably vocab x hidden,
 * row-major — confirm against the converter.  Caller keeps it alive. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
608
/* Copy the final RMSNorm weight vector (cfg.hidden floats) into the model. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
609
+
610
+ void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
611
+ m->layers[l].input_norm = in_norm;
612
+ m->layers[l].post_norm = post_norm;
613
+ }
614
+
615
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
616
+ m->layers[l].q_norm = q_norm;
617
+ m->layers[l].k_norm = k_norm;
618
+ }
619
+
620
+ static void init_unary_weight(
621
+ UnaryWeight *uw,
622
+ uint64_t *sign, uint64_t *planes, float *scales,
623
+ int out_dim, int in_dim, int n_planes
624
+ ) {
625
+ uw->sign_bits = sign;
626
+ uw->mag_planes = planes;
627
+ uw->scales = scales;
628
+ uw->out_dim = out_dim;
629
+ uw->in_dim = in_dim;
630
+ uw->n_planes = n_planes;
631
+ uw->chunks = (in_dim + 63) / 64;
632
+ }
633
+
634
+ void layer_set_linears(
635
+ Model *m, int l,
636
+ uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
637
+ uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
638
+ uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
639
+ uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
640
+ uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
641
+ uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
642
+ uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
643
+ int n_planes
644
+ ) {
645
+ init_unary_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
646
+ init_unary_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
647
+ init_unary_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
648
+ init_unary_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
649
+ init_unary_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
650
+ init_unary_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
651
+ init_unary_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
652
+ }
653
+
654
+ void model_reset_cache(Model *m) {
655
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
656
+ memset(m->k_cache, 0, kv_size * sizeof(float));
657
+ memset(m->v_cache, 0, kv_size * sizeof(float));
658
+ }
run_convert.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert DeepSeek-R1-1.5B safetensors weights to the ternary on-disk format.
# Linear projection weights are quantized to {-1, 0, +1} (pos/neg bitmasks +
# per-row scales); everything else is stored as raw FP16.
import os, json, numpy as np, time, sys
from pathlib import Path
from safetensors import safe_open
import torch  # required by safe_open(framework="pt") — presumably; confirm
sys.path.insert(0, "/root/ternary_engine")
from convert import quantize_weight_matrix

model_dir = "/root/ternary_engine/deepseek-r1-1.5b-hf"
output_dir = "/root/ternary_engine/deepseek-r1-1.5b-ternary"
alpha = 0.7  # quantization threshold factor passed to quantize_weight_matrix

os.makedirs(output_dir, exist_ok=True)

# Load every tensor of the checkpoint into memory as FP32 numpy arrays.
tensors = {}
for f in sorted(Path(model_dir).glob("*.safetensors")):
    print("Loading " + f.name)
    with safe_open(str(f), framework="pt") as st:
        for key in st.keys():
            tensors[key] = st.get_tensor(key).float().numpy()

print("Loaded " + str(len(tensors)) + " tensors")

# Hard-coded architecture description written alongside the weights.
config = {
    "hidden_size": 1536, "intermediate_size": 8960,
    "num_attention_heads": 12, "num_key_value_heads": 2,
    "num_hidden_layers": 28, "vocab_size": 151936,
    "head_dim": 128, "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6, "alpha": alpha,
}

ternary_manifest = {}
fp16_manifest = {}

# Only these 2-D projection weights are ternarized; all other tensors
# (embeddings, norms, biases) stay FP16.
linear_suffixes = ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight']

total_tb = 0  # bytes written in ternary form
total_ob = 0  # original FP32 bytes of the ternarized tensors

for key, w in tensors.items():
    prefix = os.path.join(output_dir, key.replace(".", "_"))
    is_linear = any(key.endswith(s) for s in linear_suffixes)

    if is_linear and len(w.shape) == 2:
        out_dim, in_dim = w.shape
        total_ob += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        # Three files per tensor: +1 bitmask, -1 bitmask, per-row scales.
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        tb = pos.nbytes + neg.nbytes + scales.nbytes
        total_tb += tb
        ratio = w.nbytes / tb
        ternary_manifest[key] = list(w.shape)
        print("  T %s: %s -> %dKB (%.1fx, %.0f%% sparse, %.1fs)" % (
            key, str(w.shape), tb // 1024, ratio, sparsity * 100, dt))
    else:
        # Fallback path: store as raw FP16 bytes.
        w16 = w.astype(np.float16)
        w16.tofile(prefix + ".fp16")
        fp16_manifest[key] = list(w.shape)
        print("  F %s: %s -> %dKB" % (key, str(w.shape), w16.nbytes // 1024))

# config + manifest let the loader reconstruct shapes without the originals.
with open(os.path.join(output_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=2)
with open(os.path.join(output_dir, "manifest.json"), "w") as f:
    json.dump({"ternary": ternary_manifest, "fp16": fp16_manifest}, f, indent=2)

print("")
print("Ternary: %.1fMB (from %.1fMB FP32)" % (total_tb / 1024 / 1024, total_ob / 1024 / 1024))
print("DONE")
run_log_unary.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Log-unary model loader. (c) 2026 OpenTransformers Ltd"""
3
+ import ctypes, numpy as np, os, sys, json, time
4
+
5
def load_and_run(model_dir, prompt, max_tokens=32, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a log-unary converted model via log_unary_engine.so and generate.

    model_dir must contain config.json, manifest.json, the per-tensor
    .sign/.planes/.scales/.fp16 files, and the HF tokenizer files.
    a_planes is a runtime choice (activation quantization depth); w_planes
    comes from the conversion manifest.  Prints the generated text.
    """
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie = 1 if config.get("tie_word_embeddings", False) else 0

    # 2^planes - 1 magnitude levels per sign -> 2*max+1 representable values.
    w_max = (1 << w_planes) - 1
    a_max = (1 << a_planes) - 1
    print(f"Config: {n_layers}L hidden={hidden} inter={inter} heads={n_heads}/{n_kv_heads}")
    print(f"Weight: {w_planes} log-planes ({2*w_max+1} levels)")
    print(f"Activation: {a_planes} log-planes ({2*a_max+1} levels)")
    print(f"Plane pairs: {w_planes * a_planes}")

    # The .so is expected next to the model directory's parent.
    engine = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "log_unary_engine.so")
    lib = ctypes.CDLL(engine)

    # ctypes signatures must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [ctypes.c_int]*2 + [ctypes.c_int]*7 + [ctypes.c_float, ctypes.c_int]
    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.c_float, ctypes.c_float, ctypes.c_int]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections, each (sign, planes, scales, out_dim, in_dim), then n_planes.
    lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + \
        [u64p, u64p, f32p, ctypes.c_int, ctypes.c_int] * 7 + [ctypes.c_int]

    print("Allocating...")
    model = lib.model_alloc(w_planes, a_planes, hidden, inter, n_heads, n_kv_heads,
                            head_dim, n_layers, vocab, rope_theta, tie)
    # The C side borrows pointers into these numpy arrays; keep them alive.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        _refs.append(d); return d.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 file widened to FP32 for norm weights.
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        f = d.view(np.float16).astype(np.float32); _refs.append(f); return f.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fn = name.replace(".","_")
        s = np.fromfile(os.path.join(model_dir, f"{fn}.sign"), dtype=np.uint64)
        p = np.fromfile(os.path.join(model_dir, f"{fn}.planes"), dtype=np.uint64)
        sc = np.fromfile(os.path.join(model_dir, f"{fn}.scales"), dtype=np.float32)
        _refs.extend([s,p,sc])
        return s.ctypes.data_as(u64p), p.ctypes.data_as(u64p), sc.ctypes.data_as(f32p)

    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    um = manifest["unary"]  # tensor name -> [out_dim, in_dim]
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l, load_f32(f"{p}.input_layernorm.weight"),
                            load_f32(f"{p}.post_attention_layernorm.weight"))
        # QK-norm weights only exist for Qwen3-style checkpoints.
        qn = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn):
            lib.layer_set_qk_norm(model, l, load_f32(f"{p}.self_attn.q_norm.weight"),
                                  load_f32(f"{p}.self_attn.k_norm.weight"))

        # Argument order must match layer_set_linears' C prototype.
        projs = ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"]
        args = [model, l]
        for pj in projs:
            key = f"{p}.{pj}.weight"
            s,pl,sc = load_unary(key)
            args.extend([s, pl, sc, um[key][0], um[key][1]])
        args.append(w_planes)
        lib.layer_set_linears(*args)

        if (l+1) % 12 == 0 or l == n_layers-1:
            print(f" Layer {l+1}/{n_layers}")

    print("Tokenizing...")
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tok.encode(prompt)
    print(f"Prompt: {len(ids)} tokens")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos = config.get("eos_token_id", 151645)
    pa = (ctypes.c_int * len(ids))(*ids)
    oa = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (w={w_planes}log a={a_planes}log pairs={w_planes*a_planes})...")
    t0 = time.time()
    n = lib.generate(model, pa, len(ids), oa, max_tokens,
                     ctypes.c_float(temperature), ctypes.c_float(top_p), eos)
    dt = time.time() - t0

    text = tok.decode([oa[i] for i in range(n)], skip_special_tokens=True)
    print(f"\n=== LOG-UNARY ({n} tok in {dt:.1f}s = {n/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode: {n/dt:.2f} tok/s")
116
+ print(f"\nDecode: {n/dt:.2f} tok/s")
117
+
118
if __name__ == "__main__":
    # CLI (positional, all optional): model_dir, prompt, max_tokens, a_planes.
    d = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-log-unary"
    p = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
    mt = int(sys.argv[3]) if len(sys.argv) > 3 else 32
    ap = int(sys.argv[4]) if len(sys.argv) > 4 else 4
    load_and_run(d, p, mt, a_planes=ap)
run_pure_unary.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pure unary model loader - ALL matmuls are AND+popcount
4
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
5
+ """
6
+ import ctypes, numpy as np, os, sys, json, time
7
+
8
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a pure-unary converted model via pure_unary_engine.so and generate.

    Reads config.json / manifest.json plus per-tensor .sign/.planes/.scales
    and .fp16 files from model_dir, wires everything into the C engine with
    ctypes, then tokenizes `prompt` and decodes up to `max_tokens` tokens.
    Returns the decoded text (also printed, with timing).
    """
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}")
    print(f"Weight planes: {w_planes}, Activation planes: {a_planes}")
    print(f"Plane pairs per element: {w_planes * a_planes}")
    print(f"Tied embeddings: {'yes' if tie_embeddings else 'no'}")

    # The .so is expected next to the model directory's parent.
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "pure_unary_engine.so")
    lib = ctypes.CDLL(engine_path)

    # ctypes signatures must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int, ctypes.c_int,  # w_planes, a_planes
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_float, ctypes.c_int,
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.c_float, ctypes.c_float, ctypes.c_int
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections, each (sign, planes, scales, out_dim, in_dim), then n_planes.
    lib.layer_set_linears.argtypes = [
        ctypes.c_void_p, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        ctypes.c_int,
    ]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    print("Allocating model...")
    model = lib.model_alloc(
        w_planes, a_planes,
        hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta, tie_embeddings
    )

    # The C side borrows pointers into these numpy arrays; keep them alive.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        fname = name.replace(".", "_") + ".fp16"
        data = np.fromfile(os.path.join(model_dir, fname), dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 file widened to FP32 for norm weights.
        fname = name.replace(".", "_") + ".fp16"
        data = np.fromfile(os.path.join(model_dir, fname), dtype=np.uint16)
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    print("Loading embeddings...")
    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))

    print("Loading final norm...")
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l,
            load_f32(f"{p}.input_layernorm.weight"),
            load_f32(f"{p}.post_attention_layernorm.weight"))

        # QK-Norm (Qwen3) — optional; detected by file presence.
        qn_path = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn_path):
            lib.layer_set_qk_norm(model, l,
                load_f32(f"{p}.self_attn.q_norm.weight"),
                load_f32(f"{p}.self_attn.k_norm.weight"))

        q_s, q_p, q_sc = load_unary(f"{p}.self_attn.q_proj.weight")
        k_s, k_p, k_sc = load_unary(f"{p}.self_attn.k_proj.weight")
        v_s, v_p, v_sc = load_unary(f"{p}.self_attn.v_proj.weight")
        o_s, o_p, o_sc = load_unary(f"{p}.self_attn.o_proj.weight")
        g_s, g_p, g_sc = load_unary(f"{p}.mlp.gate_proj.weight")
        u_s, u_p, u_sc = load_unary(f"{p}.mlp.up_proj.weight")
        d_s, d_p, d_sc = load_unary(f"{p}.mlp.down_proj.weight")

        # Shapes ([out_dim, in_dim]) come from the conversion manifest.
        um = manifest["unary"]
        lib.layer_set_linears(model, l,
            q_s, q_p, q_sc, um[f"{p}.self_attn.q_proj.weight"][0], um[f"{p}.self_attn.q_proj.weight"][1],
            k_s, k_p, k_sc, um[f"{p}.self_attn.k_proj.weight"][0], um[f"{p}.self_attn.k_proj.weight"][1],
            v_s, v_p, v_sc, um[f"{p}.self_attn.v_proj.weight"][0], um[f"{p}.self_attn.v_proj.weight"][1],
            o_s, o_p, o_sc, um[f"{p}.self_attn.o_proj.weight"][0], um[f"{p}.self_attn.o_proj.weight"][1],
            g_s, g_p, g_sc, um[f"{p}.mlp.gate_proj.weight"][0], um[f"{p}.mlp.gate_proj.weight"][1],
            u_s, u_p, u_sc, um[f"{p}.mlp.up_proj.weight"][0], um[f"{p}.mlp.up_proj.weight"][1],
            d_s, d_p, d_sc, um[f"{p}.mlp.down_proj.weight"][0], um[f"{p}.mlp.down_proj.weight"][1],
            w_planes)

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens -> {repr(prompt[:60])}")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos_token = config.get("eos_token_id", 151645)
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p}, a_planes={a_planes})...")
    t0 = time.time()
    n_gen = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p), eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_gen)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    print(f"\n=== PURE UNARY Output ({n_gen} tokens in {dt:.1f}s = {n_gen/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode speed: {n_gen/dt:.2f} tok/s")
    return text
+
171
+ if __name__ == "__main__":
172
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-unary"
173
+ prompt = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
174
+ max_tokens = int(sys.argv[3]) if len(sys.argv) > 3 else 32
175
+ a_planes = int(sys.argv[4]) if len(sys.argv) > 4 else 4
176
+ load_and_run(model_dir, prompt, max_tokens=max_tokens, a_planes=a_planes)
run_qwen3_4b.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unary model loader for Qwen3-4B-Thinking.
4
+ Loads converted weights and runs inference via unary_engine_v2.so
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import ctypes, numpy as np, os, sys, json, time
8
+
9
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9):
    """Load a unary-converted Qwen3 model via unary_engine_v2.so and generate.

    Reads config.json / manifest.json plus per-tensor binary files from
    model_dir, wires everything into the v2 C engine with ctypes, tokenizes
    `prompt` and decodes up to `max_tokens` tokens.  Returns the decoded
    text (also printed, with timing).

    NOTE(review): model_set_lm_head and layer_set_bias have argtypes
    declared below but are never called in this function — the engine
    presumably falls back to tied embeddings / no bias; confirm.
    """
    # Load config
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    n_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    has_attn_bias = 1 if config.get("attention_bias", False) else 0
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}, vocab={vocab}")
    print(f"QK-Norm: yes, Tied embeddings: {'yes' if tie_embeddings else 'no'}, n_planes={n_planes}")

    # Load C engine (expected next to the model directory's parent)
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "unary_engine_v2.so")
    lib = ctypes.CDLL(engine_path)

    # Configure function signatures — must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int,   # n_planes
        ctypes.c_int,   # hidden
        ctypes.c_int,   # inter
        ctypes.c_int,   # n_heads
        ctypes.c_int,   # n_kv_heads
        ctypes.c_int,   # head_dim
        ctypes.c_int,   # n_layers
        ctypes.c_int,   # vocab
        ctypes.c_float, # rope_theta
        ctypes.c_int,   # has_attn_bias
        ctypes.c_int,   # tie_embeddings
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.c_float, ctypes.c_float, ctypes.c_int
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.model_set_lm_head.argtypes = [ctypes.c_void_p, u16p, ctypes.c_int, ctypes.c_int]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_linears.argtypes = [
        ctypes.c_void_p, ctypes.c_int,
        # q: sign, planes, scales, out, in
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # k
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # v
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # o
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # gate
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # up
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # down
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        ctypes.c_int,  # n_planes
    ]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    # Allocate model
    print("Allocating model...")
    model = lib.model_alloc(
        n_planes, hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta,
        has_attn_bias, tie_embeddings
    )

    # Keep references to prevent GC — the C side borrows raw pointers.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32_from_fp16(name):
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        # Convert FP16 -> FP32
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    # Load embeddings
    print("Loading embeddings...")
    embed_ptr = load_fp16("model.embed_tokens.weight")
    lib.model_set_embed(model, embed_ptr)

    # Load final norm
    print("Loading final norm...")
    fnorm_ptr = load_f32_from_fp16("model.norm.weight")
    lib.model_set_final_norm(model, fnorm_ptr)

    # Load layers
    print(f"Loading {n_layers} layers...")
    for l in range(n_layers):
        prefix = f"model.layers.{l}"

        # Norms
        in_norm = load_f32_from_fp16(f"{prefix}.input_layernorm.weight")
        post_norm = load_f32_from_fp16(f"{prefix}.post_attention_layernorm.weight")
        lib.layer_set_norms(model, l, in_norm, post_norm)

        # QK-Norm (unconditional here — this loader targets Qwen3 checkpoints)
        q_norm = load_f32_from_fp16(f"{prefix}.self_attn.q_norm.weight")
        k_norm = load_f32_from_fp16(f"{prefix}.self_attn.k_norm.weight")
        lib.layer_set_qk_norm(model, l, q_norm, k_norm)

        # Linear layers
        q_s, q_p, q_sc = load_unary(f"{prefix}.self_attn.q_proj.weight")
        k_s, k_p, k_sc = load_unary(f"{prefix}.self_attn.k_proj.weight")
        v_s, v_p, v_sc = load_unary(f"{prefix}.self_attn.v_proj.weight")
        o_s, o_p, o_sc = load_unary(f"{prefix}.self_attn.o_proj.weight")
        g_s, g_p, g_sc = load_unary(f"{prefix}.mlp.gate_proj.weight")
        u_s, u_p, u_sc = load_unary(f"{prefix}.mlp.up_proj.weight")
        d_s, d_p, d_sc = load_unary(f"{prefix}.mlp.down_proj.weight")

        # Dims from manifest ([out_dim, in_dim] per tensor)
        q_shape = manifest["unary"][f"{prefix}.self_attn.q_proj.weight"]
        k_shape = manifest["unary"][f"{prefix}.self_attn.k_proj.weight"]
        v_shape = manifest["unary"][f"{prefix}.self_attn.v_proj.weight"]
        o_shape = manifest["unary"][f"{prefix}.self_attn.o_proj.weight"]
        g_shape = manifest["unary"][f"{prefix}.mlp.gate_proj.weight"]
        u_shape = manifest["unary"][f"{prefix}.mlp.up_proj.weight"]
        d_shape = manifest["unary"][f"{prefix}.mlp.down_proj.weight"]

        lib.layer_set_linears(
            model, l,
            q_s, q_p, q_sc, q_shape[0], q_shape[1],
            k_s, k_p, k_sc, k_shape[0], k_shape[1],
            v_s, v_p, v_sc, v_shape[0], v_shape[1],
            o_s, o_p, o_sc, o_shape[0], o_shape[1],
            g_s, g_p, g_sc, g_shape[0], g_shape[1],
            u_s, u_p, u_sc, u_shape[0], u_shape[1],
            d_s, d_p, d_sc, d_shape[0], d_shape[1],
            n_planes
        )

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing prompt...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos_token = config.get("eos_token_id", 151645)

    # Generate
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p})...")
    t0 = time.time()
    n_generated = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p),
        eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_generated)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    total_tokens = len(input_ids) + n_generated
    print(f"\n=== Output ({n_generated} tokens in {dt:.1f}s = {n_generated/dt:.1f} tok/s) ===")
    print(text)
    print(f"\nPrefill: {len(input_ids)} tokens, Decode: {n_generated} tokens")
    print(f"Total time: {dt:.1f}s, Speed: {total_tokens/dt:.1f} tok/s total, {n_generated/dt:.1f} tok/s decode")

    return text
+
217
+ if __name__ == "__main__":
218
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-unary"
219
+ prompt = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
220
+ max_tokens = int(sys.argv[3]) if len(sys.argv) > 3 else 64
221
+ load_and_run(model_dir, prompt, max_tokens=max_tokens)
server.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
OpenAI-compatible API server for Ternary Transformer Engine.
Drop-in replacement for llama-server.

(c) 2026 OpenTransformers Ltd / Scott Bisset
"""

import json
import time
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from inference import TernaryQwen, Tokenizer, load_kernel
import os

# Configuration via environment variables; defaults target the
# DeepSeek-R1-Distill-Qwen-1.5B ternary build.
MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))

# Import-time initialization: load the native kernel, quantized weights and
# tokenizer exactly once; all requests share these objects.
print("Loading ternary kernel...")
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)

print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)

# Serializes generation across requests.
# NOTE(review): assumes the model/KV-cache is not safe for concurrent
# generate() calls — confirm against the inference module.
lock = threading.Lock()
print("Ready!")
+
33
class Handler(BaseHTTPRequestHandler):
    """Minimal OpenAI-compatible HTTP handler.

    Routes:
      POST /v1/chat/completions -- non-streaming chat completion
      GET  /health              -- liveness probe
    """

    def _send_json(self, status, payload):
        # Serialize *payload* and send it with Content-Type/Content-Length.
        data = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def do_POST(self):
        if self.path != "/v1/chat/completions":
            self.send_response(404)
            self.end_headers()
            return

        # Parse the request body defensively: a malformed Content-Length or
        # invalid JSON must produce a 400, not an unhandled traceback.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length))
        except ValueError:  # covers bad int() and json.JSONDecodeError
            self._send_json(400, {"error": {"message": "invalid JSON body"}})
            return

        messages = body.get("messages", [])
        max_tokens = body.get("max_tokens", 256)
        temperature = body.get("temperature", 0.6)
        top_p = body.get("top_p", 0.95)

        # Build prompt
        prompt = tokenizer.apply_chat_template(messages)
        input_ids = tokenizer.encode(prompt)

        # Generate (the lock serializes access to the shared engine)
        try:
            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p
                )
        except Exception as e:
            # Surface engine failures as a 500 JSON error instead of a
            # silently dropped connection.
            self._send_json(500, {"error": {"message": str(e)}})
            return

        text = tokenizer.decode(gen_ids)

        response = {
            "id": f"chatcmpl-ternary-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": stats["tokens_generated"],
                "total_tokens": len(input_ids) + stats["tokens_generated"]
            },
            "timings": {
                "prompt_n": stats["prefill_tokens"],
                "prompt_ms": stats["prefill_ms"],
                "predicted_n": stats["tokens_generated"],
                "predicted_ms": stats["decode_ms"],
                "predicted_per_second": stats["tok_per_sec"],
            }
        }
        self._send_json(200, response)

    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status":"ok","engine":"ternary-avx512"}')
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Suppress BaseHTTPRequestHandler's per-request stderr logging.
        pass
103
+
104
if __name__ == "__main__":
    # Single-threaded HTTPServer: requests are handled strictly one at a time.
    server = HTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    server.serve_forever()
ternary_kernel.c ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Ternary Neural Network Kernel - AVX-512 optimized
3
+ *
4
+ * Weights are stored as two bitplanes per row:
5
+ * pos_mask: bit=1 where weight = +1
6
+ * neg_mask: bit=1 where weight = -1
7
+ * (both 0 = weight is 0)
8
+ *
9
+ * Matmul becomes: y[i] = sum(x[j] where pos) - sum(x[j] where neg)
10
+ * No multiplication at all — just masked add/subtract.
11
+ *
12
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
13
+ */
14
+
15
+ #include <immintrin.h>
16
+ #include <stdint.h>
17
+ #include <stdlib.h>
18
+ #include <string.h>
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+
22
+ /* ============================================================
23
+ * Core ternary matmul: y = W_ternary @ x
24
+ *
25
+ * W stored as bitplanes: pos_bits[out_dim][ceil(in_dim/64)] uint64
26
+ * neg_bits[out_dim][ceil(in_dim/64)] uint64
27
+ * x: float32[in_dim]
28
+ * y: float32[out_dim]
29
+ * bias: float32[out_dim] or NULL
30
+ * scale: float32 per-row scale factor (to recover magnitude)
31
+ * ============================================================ */
32
/* Core ternary matvec: y = W_ternary @ x.
 *
 * W is stored as two bitplanes per row (pos = +1, neg = -1, neither = 0);
 * the dot product is a masked add/subtract of x, no multiplies.
 *
 * Improvement over the previous version: no per-call heap allocation.
 * The old code aligned_alloc'd and memcpy'd a zero-padded copy of x on
 * EVERY call (and never checked the allocation). AVX-512 masked loads
 * suppress faults on inactive lanes, so we can read x in place and zero
 * the lanes past in_dim with the load mask — the accumulated values are
 * identical (padded lanes contributed 0.0f before; they contribute 0.0f
 * now).
 */
void ternary_matvec_avx512(
    const uint64_t *pos_bits, /* [out_dim * chunks] bit=1 where w = +1 */
    const uint64_t *neg_bits, /* [out_dim * chunks] bit=1 where w = -1 */
    const float *scales,      /* [out_dim] per-row scale */
    const float *x,           /* [in_dim] input activations */
    float *y,                 /* [out_dim] output */
    int out_dim,
    int in_dim
) {
    int chunks = (in_dim + 63) / 64; /* 64 weights per uint64 */

    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();

        const uint64_t *row_pos = pos_bits + (size_t)i * chunks;
        const uint64_t *row_neg = neg_bits + (size_t)i * chunks;

        /* 64 weights per chunk = 4 groups of 16 floats */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = row_pos[c];
            uint64_t nb = row_neg[c];
            int base = c * 64;

            for (int g = 0; g < 4; g++) {
                int offset = base + g * 16;
                int rem = in_dim - offset;
                if (rem <= 0) break;

                /* Valid-lane mask: maskz load zeroes (and fault-suppresses)
                 * lanes at or past in_dim. */
                __mmask16 vmask = (rem >= 16) ? (__mmask16)0xFFFF
                                              : (__mmask16)((1u << rem) - 1);
                __m512 xv = _mm512_maskz_loadu_ps(vmask, x + offset);

                __mmask16 pmask = (__mmask16)((pb >> (g * 16)) & 0xFFFF);
                __mmask16 nmask = (__mmask16)((nb >> (g * 16)) & 0xFFFF);

                /* Add where weight = +1, subtract where weight = -1 */
                acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
            }
        }

        /* Horizontal sum, then one float multiply to recover magnitude */
        y[i] = _mm512_reduce_add_ps(acc) * scales[i];
    }
}
85
+
86
+ /* ============================================================
87
+ * Batched version: Y = W_ternary @ X (multiple input vectors)
88
+ * X: [batch, in_dim], Y: [batch, out_dim]
89
+ * ============================================================ */
90
/* Batched wrapper: run the ternary mat-vec kernel once per input row.
 * X is [batch, in_dim] row-major; Y is [batch, out_dim] row-major. */
void ternary_matmul_avx512(
    const uint64_t *pos_bits,
    const uint64_t *neg_bits,
    const float *scales,
    const float *X,
    float *Y,
    int batch,
    int out_dim,
    int in_dim
) {
    const float *x_row = X;
    float *y_row = Y;
    for (int b = 0; b < batch; b++, x_row += in_dim, y_row += out_dim) {
        ternary_matvec_avx512(pos_bits, neg_bits, scales,
                              x_row, y_row, out_dim, in_dim);
    }
}
109
+
110
+ /* ============================================================
111
+ * RMSNorm: y = x * (1/rms(x)) * weight
112
+ * ============================================================ */
113
/* RMSNorm: y[i] = x[i] * (1 / sqrt(mean(x^2) + eps)) * weight[i].
 * x, weight, y: [dim] float buffers; unaligned loads are used, so no
 * alignment requirement. The 16-wide FMA pass plus scalar tail fixes the
 * floating-point summation order — part of the numeric behavior. */
void rmsnorm_avx512(
    const float *x,
    const float *weight,
    float *y,
    int dim,
    float eps
) {
    /* Compute sum of squares */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    /* Handle remainder */
    for (; i < dim; i++) ss += x[i] * x[i];

    /* eps keeps the denominator positive for an all-zero input */
    float rms = 1.0f / sqrtf(ss / dim + eps);

    /* Apply norm and weight */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        __m512 out = _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv);
        _mm512_storeu_ps(y + i, out);
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
143
+
144
+ /* ============================================================
145
+ * SiLU activation: x * sigmoid(x)
146
+ * ============================================================ */
147
+ static inline float silu_scalar(float x) {
148
+ return x / (1.0f + expf(-x));
149
+ }
150
+
151
+ void silu_avx512(float *x, int n) {
152
+ /* Scalar fallback — vectorized exp is complex */
153
+ for (int i = 0; i < n; i++) {
154
+ x[i] = silu_scalar(x[i]);
155
+ }
156
+ }
157
+
158
+ /* ============================================================
159
+ * Element-wise multiply: y = a * b
160
+ * ============================================================ */
161
/* Element-wise product: y[i] = a[i] * b[i] for i in [0, n). */
void elemwise_mul_avx512(const float *a, const float *b, float *y, int n) {
    int i = 0;
    /* 16-wide vector body */
    for (; i + 16 <= n; i += 16) {
        __m512 prod = _mm512_mul_ps(_mm512_loadu_ps(a + i),
                                    _mm512_loadu_ps(b + i));
        _mm512_storeu_ps(y + i, prod);
    }
    /* scalar tail */
    for (; i < n; i++)
        y[i] = a[i] * b[i];
}
170
+
171
+ /* ============================================================
172
+ * Softmax
173
+ * ============================================================ */
174
+ void softmax(float *x, int n) {
175
+ float max_val = x[0];
176
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
177
+ float sum = 0;
178
+ for (int i = 0; i < n; i++) {
179
+ x[i] = expf(x[i] - max_val);
180
+ sum += x[i];
181
+ }
182
+ float inv_sum = 1.0f / sum;
183
+ for (int i = 0; i < n; i++) x[i] *= inv_sum;
184
+ }
185
+
186
+ /* ============================================================
187
+ * RoPE (Rotary Position Embedding)
188
+ * ============================================================ */
189
+ void apply_rope(
190
+ float *q, /* [n_heads, head_dim] */
191
+ float *k, /* [n_kv_heads, head_dim] */
192
+ int n_heads,
193
+ int n_kv_heads,
194
+ int head_dim,
195
+ int pos,
196
+ float rope_theta
197
+ ) {
198
+ for (int h = 0; h < n_heads + n_kv_heads; h++) {
199
+ float *vec = (h < n_heads) ? q + h * head_dim : k + (h - n_heads) * head_dim;
200
+ for (int i = 0; i < head_dim; i += 2) {
201
+ float freq = 1.0f / powf(rope_theta, (float)i / head_dim);
202
+ float angle = pos * freq;
203
+ float cos_a = cosf(angle);
204
+ float sin_a = sinf(angle);
205
+ float v0 = vec[i];
206
+ float v1 = vec[i + 1];
207
+ vec[i] = v0 * cos_a - v1 * sin_a;
208
+ vec[i + 1] = v0 * sin_a + v1 * cos_a;
209
+ }
210
+ }
211
+ }
212
+
213
+ /* ============================================================
214
+ * Quantization: convert float weights to ternary
215
+ * Uses per-row threshold: threshold = alpha * mean(|w|)
216
+ * Returns: pos_bits, neg_bits, scales
217
+ * ============================================================ */
218
/* Quantize a float matrix to ternary {-1, 0, +1} bitplanes with a
 * per-row scale.
 *
 * For each row:
 *   threshold = alpha * mean(|w|)
 *   w >= +threshold -> +1 (bit set in pos_bits)
 *   w <= -threshold -> -1 (bit set in neg_bits)
 *   otherwise       ->  0 (neither bit)
 *   scale = mean |w| over the surviving entries (1.0 if none survive).
 */
void quantize_to_ternary(
    const float *weights, /* [out_dim, in_dim] */
    uint64_t *pos_bits,   /* [out_dim * chunks] output */
    uint64_t *neg_bits,   /* [out_dim * chunks] output */
    float *scales,        /* [out_dim] output */
    int out_dim,
    int in_dim,
    float alpha           /* threshold multiplier, typically 0.7-1.0 */
) {
    int chunks = (in_dim + 63) / 64;

    for (int r = 0; r < out_dim; r++) {
        const float *w = weights + (size_t)r * in_dim;

        /* Row threshold from the mean absolute weight. */
        float abs_acc = 0;
        for (int j = 0; j < in_dim; j++)
            abs_acc += fabsf(w[j]);
        float mean_abs = abs_acc / in_dim;
        float threshold = alpha * mean_abs;

        /* Scale = mean |w| of the entries that clear the threshold. */
        float kept_sum = 0;
        int kept = 0;
        for (int j = 0; j < in_dim; j++) {
            float a = fabsf(w[j]);
            if (a >= threshold) {
                kept_sum += a;
                kept++;
            }
        }
        scales[r] = (kept > 0) ? (kept_sum / kept) : 1.0f;

        /* Pack the ternary pattern, 64 columns per word. */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = 0, nb = 0;
            int lim = in_dim - c * 64;
            if (lim > 64) lim = 64;
            for (int b = 0; b < lim; b++) {
                float v = w[c * 64 + b];
                if (v >= threshold)
                    pb |= (1ULL << b);
                else if (v <= -threshold)
                    nb |= (1ULL << b);
            }
            pos_bits[(size_t)r * chunks + c] = pb;
            neg_bits[(size_t)r * chunks + c] = nb;
        }
    }
}
test_logunary ADDED
Binary file (26.3 kB). View file
 
test_logunary.c ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Log-Unary Tensor Tests
3
+ * Benchmarks accuracy and speed of native base-1 log-encoded tensors
4
+ */
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <math.h>
8
+ #include <time.h>
9
+
10
+ /* Forward declarations from library */
11
+ typedef struct LogUnaryTensor LogUnaryTensor;
12
+ typedef struct LogUnaryMatrix LogUnaryMatrix;
13
+ typedef struct { double total_and_ops, total_popcount_ops, wall_time_s, elements_per_sec, gops; } BenchResult;
14
+ typedef struct { float max_error, mean_error, cosine_sim, snr_db; } AccuracyResult;
15
+
16
+ extern LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias);
17
+ extern LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias);
18
+ extern void lut_free(LogUnaryTensor *t);
19
+ extern void lum_free(LogUnaryMatrix *m);
20
+ extern void lut_from_float(LogUnaryTensor *t, const float *x);
21
+ extern void lut_to_float(const LogUnaryTensor *t, float *out);
22
+ extern void lum_from_float(LogUnaryMatrix *m, const float *data);
23
+ extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
24
+ extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
25
+ extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);
26
+ extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
27
+ extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);
28
+
29
+ /* Test matvec correctness against float reference */
30
/* Compare lum_matvec against a float32 reference GEMV on Gaussian data.
 * Prints cosine similarity, SNR and max abs error — diagnostic output
 * only, no pass/fail assertion. */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    /* Random float matrix and vector */
    float *M_float = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *x_float = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_lut = (float *)malloc(rows * sizeof(float));

    /* Fixed seed for reproducibility; Box-Muller turns uniform rand()
     * draws into standard-normal samples (+1 avoids log(0)). */
    srand(42);
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        M_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        x_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Float reference matmul */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += M_float[(size_t)i * cols + j] * x_float[j];

    /* Log-unary matmul: encode, multiply, decode back to float */
    LogUnaryMatrix *M = lum_alloc(rows, cols, planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, planes, bias);

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare: cosine similarity plus max absolute error */
    float dot = 0, na = 0, nb = 0, max_err = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];
        float err = fabsf(y_ref[i] - y_lut[i]);
        if (err > max_err) max_err = err;
    }
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

    /* Signal-to-noise ratio of the quantized result vs the reference */
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_lut[i];
        noise += e * e;
    }
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf(" Cosine similarity: %.6f\n", cosine);
    printf(" SNR: %.1f dB\n", snr);
    printf(" Max abs error: %.4f (ref max: %.4f)\n", max_err, sqrtf(na / rows));

    /* Show first few values */
    printf(" First 5 values:\n");
    for (int i = 0; i < 5 && i < rows; i++)
        printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);

    lum_free(M); lut_free(x); lut_free(y);
    free(M_float); free(x_float); free(y_ref); free(y_lut);
}
96
+
97
/* Test driver: roundtrip accuracy sweep, matvec correctness at three
 * sizes, then speed benchmarks over Qwen3-4B-shaped projections.
 * NOTE: time(NULL) seed only affects lut_accuracy_test inputs; the
 * matvec correctness helper re-seeds with 42 internally. */
int main() {
    srand(time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;
        AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    /* rows x cols with weight planes (wp), activation planes (xp), bias */
    struct { int rows; int cols; int wp; int xp; int bias; const char *label; } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    int n_configs = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n_configs; c++) {
        int iters = 3;
        BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
               configs[c].rows, configs[c].cols,
               configs[c].wp, configs[c].xp, configs[c].bias,
               r.wall_time_s * 1000,
               r.elements_per_sec / 1e6,
               r.gops,
               configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
test_popcount.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Test the full-unary popcount engine."""
import ctypes, numpy as np, os, time, sys
# Thread count for the engine's OpenMP kernels; set before the shared
# library below is loaded and spins up its thread pool.
os.environ["OMP_NUM_THREADS"] = "16"

# CLI: test_popcount.py [model_dir] [n_planes]
MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-unary4"
HF_DIR = "deepseek-r1-1.5b-hf"  # tokenizer comes from the HF checkpoint dir
N_PLANES = int(sys.argv[2]) if len(sys.argv) > 2 else 4

# Declare the C ABI so ctypes marshals arguments correctly (pointers are
# passed as raw addresses via ndarray.ctypes.data — see keep() below).
lib = ctypes.CDLL("./unary_full.so")
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
# layer_set_linears signature: (model, layer) + 7 projections of
# (sign_ptr, planes_ptr, scales_ptr, out_dim, in_dim) + n_planes.
args = [ctypes.c_void_p, ctypes.c_int]
for _ in range(7):
    args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
args.append(ctypes.c_int)
lib.layer_set_linears.argtypes = args
# generate(model, prompt_ids, n_prompt, out_ids, max_new, temp, top_p, eos)
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
    ctypes.c_void_p, ctypes.c_int,
    ctypes.c_float, ctypes.c_float, ctypes.c_int
]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]
lib.model_free.argtypes = [ctypes.c_void_p]
31
+
32
+ _refs = []
33
+ def keep(a):
34
+ _refs.append(a)
35
+ return a.ctypes.data
36
+
37
+ print(f"Loading model from {MODEL_DIR} (w_planes={N_PLANES})...")
38
+ m = lib.model_alloc(N_PLANES)
39
+
40
+ # Embed + final norm + lm_head
41
+ e = np.fromfile(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"), dtype=np.uint16)
42
+ lib.model_set_embed(m, keep(e))
43
+ fn = np.fromfile(os.path.join(MODEL_DIR, "model_norm_weight.fp16"), dtype=np.float16).astype(np.float32)
44
+ lib.model_set_final_norm(m, keep(fn))
45
+ lm = np.fromfile(os.path.join(MODEL_DIR, "lm_head_weight.fp16"), dtype=np.uint16)
46
+ lib.model_set_lm_head(m, keep(lm), 151936, 1536)
47
+
48
+ PROJS = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
49
+ "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]
50
+ DIMS = {
51
+ "self_attn_q_proj": (1536, 1536), "self_attn_k_proj": (256, 1536),
52
+ "self_attn_v_proj": (256, 1536), "self_attn_o_proj": (1536, 1536),
53
+ "mlp_gate_proj": (8960, 1536), "mlp_up_proj": (8960, 1536),
54
+ "mlp_down_proj": (1536, 8960),
55
+ }
56
+
57
+ for l in range(28):
58
+ in_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
59
+ po_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
60
+ lib.layer_set_norms(m, l, keep(in_n), keep(po_n))
61
+
62
+ qb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
63
+ kb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
64
+ vb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
65
+ lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
66
+
67
+ pa = []
68
+ for p in PROJS:
69
+ base = os.path.join(MODEL_DIR, f"model_layers_{l}_{p}_weight")
70
+ s = np.fromfile(base + ".sign", dtype=np.uint64)
71
+ pl = np.fromfile(base + ".planes", dtype=np.uint64)
72
+ sc = np.fromfile(base + ".scales", dtype=np.float32)
73
+ od, id_ = DIMS[p]
74
+ pa.extend([keep(s), keep(pl), keep(sc), od, id_])
75
+ lib.layer_set_linears(m, l, *pa, N_PLANES)
76
+ if (l + 1) % 7 == 0:
77
+ print(f" Layer {l+1}/28")
78
+
79
+ print("Model loaded!")
80
+
81
+ from transformers import AutoTokenizer
82
+ tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
83
+
84
+ msg = [{"role": "user", "content": "What is 2+2?"}]
85
+ ids = tok.apply_chat_template(msg, add_generation_prompt=True)
86
+ arr = np.array(ids, dtype=np.int32)
87
+ out = np.zeros(30, dtype=np.int32)
88
+
89
+ lib.model_reset_cache(m)
90
+ print(f"Prompt: {len(ids)} tokens, generating 30...")
91
+ t0 = time.time()
92
+ n = lib.generate(m, arr.ctypes.data, len(ids), out.ctypes.data, 30,
93
+ ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
94
+ dt = time.time() - t0
95
+ text = tok.decode(out[:n].tolist(), skip_special_tokens=False)
96
+ print(f"\n=== {n} tokens, {dt:.1f}s, {n/dt:.1f} tok/s ===")
97
+ print(text)
98
+ print("===")
99
+ lib.model_free(m)
true_unary ADDED
Binary file (29.9 kB). View file
 
true_unary.c ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
3
+ *
4
+ * Representation:
5
+ * A value of magnitude M is stored as M consecutive 1-bits.
6
+ * The number IS the count of ones.
7
+ * Every bit has weight exactly 1.
8
+ *
9
+ * For a vector element quantized to integer range [-K, K]:
10
+ * sign: 1 bit (0=positive, 1=negative)
11
+ * magnitude: K bit positions, first |value| are 1, rest are 0
12
+ *
13
+ * Storage layout for a vector of dim D with max magnitude K:
14
+ * sign: uint64[(D+63)/64] — one sign bit per element
15
+ * unary: uint64[K * (D+63)/64] — K bitplanes across D elements
16
+ * Plane p has bit j set iff |element_j| > p
17
+ * (thermometer = true unary in bitplane form)
18
+ *
19
+ * Multiplication: w * x = popcount of ones(w) matched with ones(x)
20
+ * Since every bit = 1, the dot product is JUST COUNTING.
21
+ * No weights, no shifts, no corrections.
22
+ * sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
23
+ * = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
24
+ *
25
+ * YES this uses more memory. A 2560-dim vector with K=32 uses:
26
+ * 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
27
+ * But the MATH IS EXACT (to quantization level).
28
+ *
29
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
30
+ */
31
+
32
+ #define _POSIX_C_SOURCE 199309L
33
+ #include <immintrin.h>
34
+ #include <omp.h>
35
+ #include <stdint.h>
36
+ #include <stdlib.h>
37
+ #include <string.h>
38
+ #include <math.h>
39
+ #include <stdio.h>
40
+ #include <time.h>
41
+
42
+ /* ============================================================
43
+ * TRUE UNARY VECTOR
44
+ * ============================================================ */
45
typedef struct {
    uint64_t *sign; /* [chunks] — 1 bit per element; bit set => negative */
    uint64_t *unary; /* [K * chunks] — K bitplanes, each bit = weight 1;
                        plane p has element j's bit set iff |v_j| > p
                        (thermometer code, see tuv_from_float) */
    float scale; /* float scale: real_value = sign * count * scale */
    int dim; /* number of logical elements */
    int chunks; /* (dim+63)/64 — 64-bit words per plane */
    int K; /* max magnitude = number of unary bitplanes */
} TrueUnaryVec;
53
+
54
/* TRUE UNARY MATRIX — row-major; same encoding as TrueUnaryVec but with a
 * per-row scale. Plane-major layout keeps each (plane, row) bit string
 * contiguous. */
typedef struct {
    uint64_t *sign; /* [rows * chunks] — one sign bit per weight */
    uint64_t *unary; /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */
    float *scales; /* [rows] — per-row scale factors (row absmax / K) */
    int rows;
    int cols;
    int chunks; /* (cols+63)/64 */
    int K; /* max magnitude per element */
} TrueUnaryMat;
64
+
65
+ /* ============================================================
66
+ * ALLOCATION
67
+ * ============================================================ */
68
+ TrueUnaryVec* tuv_alloc(int dim, int K) {
69
+ TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
70
+ v->dim = dim;
71
+ v->K = K;
72
+ v->chunks = (dim + 63) / 64;
73
+ v->scale = 1.0f;
74
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
75
+ v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t));
76
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
77
+ memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t));
78
+ return v;
79
+ }
80
+
81
+ TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
82
+ TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
83
+ m->rows = rows;
84
+ m->cols = cols;
85
+ m->K = K;
86
+ m->chunks = (cols + 63) / 64;
87
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
88
+ m->unary = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
89
+ m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
90
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
91
+ memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
92
+ for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
93
+ return m;
94
+ }
95
+
96
+ void tuv_free(TrueUnaryVec *v) {
97
+ if (v) { free(v->sign); free(v->unary); free(v); }
98
+ }
99
+ void tum_free(TrueUnaryMat *m) {
100
+ if (m) { free(m->sign); free(m->unary); free(m->scales); free(m); }
101
+ }
102
+
103
+ /* ============================================================
104
+ * FLOAT → TRUE UNARY
105
+ *
106
+ * Quantize: integer_val = round(float_val / scale * K)
107
+ * Then store |integer_val| as that many 1-bits.
108
+ *
109
+ * For vector: single global scale = absmax / K
110
+ * For matrix: per-row scale = row_absmax / K
111
+ * ============================================================ */
112
+ void tuv_from_float(TrueUnaryVec *v, const float *x) {
113
+ int dim = v->dim, K = v->K, chunks = v->chunks;
114
+
115
+ memset(v->sign, 0, chunks * sizeof(uint64_t));
116
+ memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));
117
+
118
+ float amax = 0.0f;
119
+ for (int i = 0; i < dim; i++) {
120
+ float a = fabsf(x[i]);
121
+ if (a > amax) amax = a;
122
+ }
123
+ if (amax == 0.0f) { v->scale = 1.0f; return; }
124
+ v->scale = amax / K;
125
+
126
+ float inv = K / amax;
127
+ for (int i = 0; i < dim; i++) {
128
+ int c = i / 64;
129
+ uint64_t bit = 1ULL << (i % 64);
130
+
131
+ if (x[i] < 0.0f) v->sign[c] |= bit;
132
+
133
+ int mag = (int)(fabsf(x[i]) * inv + 0.5f);
134
+ if (mag > K) mag = K;
135
+
136
+ /* TRUE UNARY: set planes 0 through mag-1 */
137
+ for (int p = 0; p < mag; p++)
138
+ v->unary[(size_t)p * chunks + c] |= bit;
139
+ }
140
+ }
141
+
142
+ void tuv_to_float(const TrueUnaryVec *v, float *out) {
143
+ int dim = v->dim, K = v->K, chunks = v->chunks;
144
+
145
+ for (int i = 0; i < dim; i++) {
146
+ int c = i / 64;
147
+ uint64_t bit = 1ULL << (i % 64);
148
+
149
+ /* Count set planes = magnitude in base-1 */
150
+ int mag = 0;
151
+ for (int p = 0; p < K; p++) {
152
+ if (v->unary[(size_t)p * chunks + c] & bit)
153
+ mag++;
154
+ }
155
+
156
+ float val = (float)mag * v->scale;
157
+ out[i] = (v->sign[c] & bit) ? -val : val;
158
+ }
159
+ }
160
+
161
+ void tum_from_float(TrueUnaryMat *m, const float *data) {
162
+ int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;
163
+
164
+ memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
165
+ memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));
166
+
167
+ for (int r = 0; r < rows; r++) {
168
+ const float *row = data + (size_t)r * cols;
169
+
170
+ float amax = 0.0f;
171
+ for (int j = 0; j < cols; j++) {
172
+ float a = fabsf(row[j]);
173
+ if (a > amax) amax = a;
174
+ }
175
+ if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
176
+ m->scales[r] = amax / K;
177
+ float inv = K / amax;
178
+
179
+ uint64_t *row_sign = m->sign + (size_t)r * chunks;
180
+
181
+ for (int j = 0; j < cols; j++) {
182
+ int c = j / 64;
183
+ uint64_t bit = 1ULL << (j % 64);
184
+
185
+ if (row[j] < 0.0f) row_sign[c] |= bit;
186
+
187
+ int mag = (int)(fabsf(row[j]) * inv + 0.5f);
188
+ if (mag > K) mag = K;
189
+
190
+ for (int p = 0; p < mag; p++)
191
+ m->unary[((size_t)p * rows + r) * chunks + c] |= bit;
192
+ }
193
+ }
194
+ }
195
+
196
+ /* ============================================================
197
+ * TRUE UNARY MATVEC: y = M @ x
198
+ *
199
+ * THE CORE OPERATION.
200
+ *
201
+ * For each output element y[i]:
202
+ * For each pair of planes (p from weight, q from activation):
203
+ * active = w_plane_p[i] AND x_plane_q
204
+ * same = active AND ~(w_sign[i] XOR x_sign)
205
+ * diff = active AND (w_sign[i] XOR x_sign)
206
+ * acc += popcount(same) - popcount(diff)
207
+ *
208
+ * EVERY PLANE PAIR HAS WEIGHT = 1.
209
+ * No shifts. No scaling between planes. No corrections.
210
+ * The count IS the answer.
211
+ *
212
+ * y[i] = acc * w_scale[i] * x_scale
213
+ * (single float multiply at the very end)
214
+ *
215
+ * ============================================================ */
216
+ void tum_matvec(
217
+ const TrueUnaryMat *M,
218
+ const TrueUnaryVec *x,
219
+ float *y_out /* float output, requantize externally if needed */
220
+ ) {
221
+ int out_dim = M->rows;
222
+ int chunks = M->chunks;
223
+ int wK = M->K;
224
+ int xK = x->K;
225
+
226
+ #pragma omp parallel for schedule(dynamic, 32)
227
+ for (int i = 0; i < out_dim; i++) {
228
+ const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
229
+ long long acc = 0;
230
+
231
+ for (int c = 0; c < chunks; c++) {
232
+ uint64_t ws = w_sign_row[c];
233
+ uint64_t xs = x->sign[c];
234
+ uint64_t same = ~(ws ^ xs);
235
+ uint64_t diff = ws ^ xs;
236
+
237
+ /*
238
+ * PURE BASE-1: every plane pair contributes weight 1.
239
+ * acc += popcount(w_plane AND x_plane AND same_sign)
240
+ * - popcount(w_plane AND x_plane AND diff_sign)
241
+ */
242
+ for (int p = 0; p < wK; p++) {
243
+ uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c];
244
+
245
+ for (int q = 0; q < xK; q++) {
246
+ uint64_t xq = x->unary[(size_t)q * chunks + c];
247
+ uint64_t active = wp & xq;
248
+ acc += __builtin_popcountll(active & same)
249
+ - __builtin_popcountll(active & diff);
250
+ }
251
+ }
252
+ }
253
+
254
+ /* Single float rescale per output element */
255
+ y_out[i] = (float)acc * M->scales[i] * x->scale;
256
+ }
257
+ }
258
+
259
+ /* ============================================================
260
+ * OPTIMIZED MATVEC: collapse x planes first
261
+ *
262
+ * Instead of iterating wK * xK plane pairs per chunk,
263
+ * precompute per-chunk activation sums:
264
+ * x_mag_same[c] = sum_q popcount(x_plane_q[c] AND same_sign[c])
265
+ * x_mag_diff[c] = sum_q popcount(x_plane_q[c] AND diff_sign[c])
266
+ *
267
+ * Then for each weight plane p:
268
+ * This doesn't directly simplify because we need AND with wp first.
269
+ *
270
+ * ALTERNATIVE: precompute per-element x magnitudes in unary,
271
+ * then the dot product is just: sum_j w_mag_j * x_mag_j * sign_j
272
+ *
273
+ * For now: provide both the naive and a vertically-accumulated variant.
274
+ *
275
+ * VERTICAL ACCUMULATE: sum all weight planes into a per-element
276
+ * count, then multiply by x count. Reduces from O(wK*xK*chunks)
277
+ * to O((wK+xK)*chunks + dim).
278
+ * ============================================================ */
279
+ void tum_matvec_fast(
280
+ const TrueUnaryMat *M,
281
+ const TrueUnaryVec *x,
282
+ float *y_out
283
+ ) {
284
+ int out_dim = M->rows;
285
+ int cols = M->cols;
286
+ int chunks = M->chunks;
287
+ int xK = x->K;
288
+
289
+ /* Step 1: compute x magnitudes (per-element popcount across planes)
290
+ * x_mag[j] = number of x planes where bit j is set
291
+ * This is O(xK * chunks) = O(xK * dim / 64)
292
+ */
293
+ int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
294
+ memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
295
+
296
+ for (int q = 0; q < xK; q++) {
297
+ const uint64_t *xplane = x->unary + (size_t)q * chunks;
298
+ for (int c = 0; c < chunks; c++) {
299
+ uint64_t bits = xplane[c];
300
+ while (bits) {
301
+ int bit = __builtin_ctzll(bits);
302
+ int j = c * 64 + bit;
303
+ if (j < cols) x_mag[j]++;
304
+ bits &= bits - 1;
305
+ }
306
+ }
307
+ }
308
+
309
+ /* Apply sign to x_mag: positive if same sign as...
310
+ * Actually we need signed x_mag relative to each weight row's sign.
311
+ * So we keep x_mag unsigned and handle sign per output element.
312
+ */
313
+
314
+ /* Step 2: for each output row, compute:
315
+ * y[i] = sum_j (w_mag[i][j] * x_mag[j]) * sign_agreement
316
+ *
317
+ * w_mag[i][j] = number of weight planes where bit j is set
318
+ * sign_agreement = +1 if w_sign[j] == x_sign[j], else -1
319
+ *
320
+ * We compute w_mag by vertical popcount across weight planes.
321
+ * This is O(wK * chunks) per row.
322
+ */
323
+
324
+ #pragma omp parallel
325
+ {
326
+ int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
327
+
328
+ #pragma omp for schedule(dynamic, 32)
329
+ for (int i = 0; i < out_dim; i++) {
330
+ memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
331
+
332
+ /* Vertical popcount: count set planes per element */
333
+ for (int p = 0; p < M->K; p++) {
334
+ const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks;
335
+ for (int c = 0; c < chunks; c++) {
336
+ uint64_t bits = wplane[c];
337
+ while (bits) {
338
+ int bit = __builtin_ctzll(bits);
339
+ int j = c * 64 + bit;
340
+ if (j < cols) w_mag[j]++;
341
+ bits &= bits - 1;
342
+ }
343
+ }
344
+ }
345
+
346
+ /* Dot product with sign */
347
+ const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
348
+ long long acc = 0;
349
+
350
+ for (int j = 0; j < cols; j++) {
351
+ int c = j / 64;
352
+ uint64_t bit = 1ULL << (j % 64);
353
+ int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit);
354
+ int product = (int)w_mag[j] * (int)x_mag[j];
355
+ acc += same_sign ? product : -product;
356
+ }
357
+
358
+ y_out[i] = (float)acc * M->scales[i] * x->scale;
359
+ }
360
+
361
+ free(w_mag);
362
+ }
363
+
364
+ free(x_mag);
365
+ }
366
+
367
+ /* ============================================================
368
+ * BENCHMARK + ACCURACY
369
+ * ============================================================ */
370
/* Accuracy + timing results for one (rows, cols, wK, xK) configuration. */
typedef struct {
    float cosine;       /* cosine similarity vs float32 reference */
    float snr_db;       /* signal-to-noise ratio of the unary result, dB */
    float max_rel_err;  /* worst per-element relative error */
    double ms_naive;    /* avg ms per call, tum_matvec */
    double ms_fast;     /* avg ms per call, tum_matvec_fast */
    double gops_naive;  /* effective GOPS, naive variant */
    double gops_fast;   /* effective GOPS, fast variant */
} TestResult;
379
+
380
/* Build a random rows x cols problem, quantize it with wK weight planes and
 * xK activation planes, then measure accuracy and speed of both matvec
 * variants against a float32 reference.
 * NOTE: the fixed seed plus the exact rand() call order define the test
 * data, so the generation statements must not be reordered. */
TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
    TestResult r = {0};
    srand(42);   /* fixed seed: every configuration sees the same data */

    /* Random float matrix and vector (normal distribution) */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_naive = (float *)malloc(rows * sizeof(float));
    float *y_fast = (float *)malloc(rows * sizeof(float));

    /* Box-Muller transform; rand()+1 keeps u1 > 0 so logf stays finite. */
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Float reference */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    /* Convert to true unary */
    TrueUnaryMat *M = tum_alloc(rows, cols, wK);
    TrueUnaryVec *x = tuv_alloc(cols, xK);
    tum_from_float(M, Mf);
    tuv_from_float(x, xf);

    /* Naive matvec */
    struct timespec t0, t1;
    tum_matvec(M, x, y_naive); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec(M, x, y_naive);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Fast matvec */
    tum_matvec_fast(M, x, y_fast); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec_fast(M, x, y_fast);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Accuracy vs float reference */
    float dot = 0, na = 0, nb = 0, max_re = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_naive[i];
        na += y_ref[i] * y_ref[i];
        nb += y_naive[i] * y_naive[i];
        float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
        if (re > max_re) max_re = re;
    }
    r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_naive[i]; noise += e * e;
    }
    r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
    r.max_rel_err = max_re;

    /* Verify naive == fast (they should agree up to float rounding) */
    float fast_err = 0;
    for (int i = 0; i < rows; i++) {
        float e = fabsf(y_naive[i] - y_fast[i]);
        if (e > fast_err) fast_err = e;
    }
    if (fast_err > 0.01f)
        printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err);

    /* GOPS = 2*rows*cols ops per call; iters cancels in this formula. */
    double ops = 2.0 * rows * cols;
    r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6);
    r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6);

    tum_free(M); tuv_free(x);
    free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
    return r;
}
464
+
465
+ /* ============================================================
466
+ * MAIN: sweep K values, show accuracy + speed tradeoff
467
+ * ============================================================ */
468
/* Sweep (wK, xK) plane-count configurations on one fixed matrix size and
 * report the accuracy/speed/memory tradeoff, then print sample values for
 * one configuration. */
int main() {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");

    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560) */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);

    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");

    /* Weight-plane / activation-plane pairs to evaluate. */
    struct { int wK; int xK; } configs[] = {
        {8, 4},
        {8, 8},
        {16, 8},
        {16, 16},
        {32, 8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        /* Large plane counts are slow: time those only once. */
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;

        TestResult r = tum_test(rows, cols, wK, xK, iters);

        /* Memory for this layer's weights (sign + planes + scales) */
        size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;

        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }

    /* Show first 10 values for the wK=32 xK=16 case */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);   /* re-seed so this sample is reproducible on its own */
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc(sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc(sr * sizeof(float));

        /* Box-Muller Gaussian samples, same recipe as tum_test. */
        for (size_t i = 0; i < (size_t)sr * sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];

        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);

        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);

        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
unary_convert.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert model weights to UNARY (base-1) thermometer encoding.
4
+
5
+ True unary: magnitude N = N consecutive 1-bits across N bitplanes.
6
+ Each bitplane contributes equally (value=1), NOT binary powers.
7
+
8
+ Weight 0.3 with scale -> magnitude 5 -> planes 0,1,2,3,4 have bit set
9
+ Weight -0.1 with scale -> magnitude 2, sign=neg -> planes 0,1 set + sign bit
10
+
11
+ More precision than ternary (N+1 levels vs 3), still no multiplication.
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import numpy as np
19
+ from pathlib import Path
20
+ import time
21
+
22
+
23
def load_safetensors(model_dir):
    """Read every *.safetensors shard under model_dir into float32 numpy arrays."""
    import torch
    from safetensors.torch import load_file

    tensors = {}
    for path in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {path.name}...")
        for name, tensor in load_file(str(path)).items():
            tensors[name] = tensor.float().numpy()
    return tensors
35
+
36
+
37
def quantize_matrix_unary(weight, n_planes=7):
    """Thermometer-quantize a 2-D weight matrix.

    Each row is scaled so its absmax maps to n_planes; a magnitude m is
    stored as m consecutive set bitplanes (plane p is set iff m >= p+1),
    giving n_planes + 1 magnitude levels per sign.

    Returns (sign_bits, mag_planes, scales, sparsity):
        sign_bits  -- [out_dim, chunks] uint64, bit set = negative weight
        mag_planes -- [n_planes, out_dim, chunks] uint64 thermometer planes
        scales     -- [out_dim] float32 per-row dequant scale
        sparsity   -- fraction of weights quantized to magnitude 0
    """
    mat = weight.astype(np.float32)
    n_rows, n_cols = mat.shape
    n_chunks = (n_cols + 63) // 64
    pad = n_chunks * 64 - n_cols

    # Per-row scale: absmax / n_planes (zero rows get a dummy scale of 1).
    absmax = np.max(np.abs(mat), axis=1, keepdims=True)
    absmax = np.where(absmax == 0, 1.0, absmax)
    scales = (absmax.flatten() / n_planes).astype(np.float32)

    # Integer magnitudes in [0, n_planes], sign kept separately.
    mags = np.clip(np.round(np.abs(mat / scales[:, None])).astype(np.int32),
                   0, n_planes)
    neg = mat < 0
    sparsity = np.mean(mags == 0)

    # Pad columns to a multiple of 64 so packing is word-aligned.
    if pad:
        mags = np.concatenate(
            [mags, np.zeros((n_rows, pad), dtype=np.int32)], axis=1)
        neg = np.concatenate(
            [neg, np.zeros((n_rows, pad), dtype=bool)], axis=1)

    lane_bits = np.uint64(1) << np.arange(64, dtype=np.uint64)

    def _pack(mask):
        # Collapse each group of 64 booleans into a single uint64 word.
        grouped = mask.reshape(n_rows, n_chunks, 64).astype(np.uint64)
        return np.bitwise_or.reduce(grouped * lane_bits, axis=2)

    sign_bits = _pack(neg)
    # Thermometer: plane p carries every element with magnitude >= p + 1.
    mag_planes = np.stack([_pack(mags >= p + 1) for p in range(n_planes)])

    return sign_bits, mag_planes, scales, sparsity
86
+
87
+
88
def save_unary_model(tensors, output_dir, n_planes=7):
    """Quantize a whole checkpoint to unary format and write it to output_dir.

    Linear projection weights become .sign/.planes/.scales files (via
    quantize_matrix_unary); every other tensor is dumped as raw fp16.
    A config.json and manifest.json describe the layout.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "n_planes": n_planes,
        "quant_type": "unary",
    }

    # Only the seven linear projections are quantized; everything else
    # (embeddings, norms, biases, lm_head) stays fp16.
    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    unary_keys = [k for k in tensors if any(p in k for p in projections)]
    keep_keys = [k for k in tensors if not any(p in k for p in projections)]

    print(f"\nUnary layers: {len(unary_keys)} (n_planes={n_planes}, levels={n_planes+1})")
    print(f"FP16 layers: {len(keep_keys)}")

    with open(os.path.join(output_dir, "config.json"), "w") as fh:
        json.dump(config, fh, indent=2)

    total_unary_bytes = 0
    total_original_bytes = 0

    for key in unary_keys:
        weight = tensors[key]
        out_dim, in_dim = weight.shape
        total_original_bytes += weight.nbytes

        start = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_matrix_unary(weight, n_planes)
        elapsed = time.time() - start

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")
        scales.tofile(stem + ".scales")

        unary_bytes = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        total_unary_bytes += unary_bytes
        ratio = weight.nbytes / unary_bytes
        # Effective storage cost per original weight.
        bpw = (unary_bytes * 8) / (out_dim * in_dim)

        print(f" {key}: {weight.shape} -> unary ({unary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compress, {bpw:.2f} bpw, {sparsity:.1%} sparse, {elapsed:.1f}s)")

    total_fp16_bytes = 0
    for key in keep_keys:
        weight = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        weight.tofile(stem + ".fp16")
        total_fp16_bytes += weight.nbytes
        print(f" {key}: {weight.shape} -> fp16 ({weight.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in unary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2)

    total_bytes = total_unary_bytes + total_fp16_bytes
    avg_bpw = (total_unary_bytes * 8) / sum(np.prod(tensors[k].shape) for k in unary_keys)

    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Unary linear weights: {total_unary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Average bits per weight (linear): {avg_bpw:.2f}")
    print(f"Compression vs FP32: {(total_original_bytes + total_fp16_bytes)/total_bytes:.1f}x")
    print(f"Precision levels: {n_planes + 1} (vs ternary=3, INT4=16)")
176
+
177
+
178
if __name__ == "__main__":
    import sys

    # CLI: [model_dir] [output_dir] [n_planes]
    args = sys.argv
    model_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = args[2] if len(args) > 2 else "deepseek-r1-1.5b-unary"
    n_planes = int(args[3]) if len(args) > 3 else 7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to unary (n_planes={n_planes})...")
    save_unary_model(tensors, output_dir, n_planes)
    print("Done!")
unary_convert_v2.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pure Unary Converter - interleaved plane layout [out_dim][chunks][n_planes]
4
+ for cache-friendly access in the kernel.
5
+
6
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
7
+ """
8
+
9
+ import os, json, sys, time
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+
14
def load_safetensors(model_dir):
    """Collect all tensors from model_dir's safetensors shards as float32 arrays."""
    import torch
    from safetensors.torch import load_file

    loaded = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard.name}...")
        loaded.update(
            (key, value.float().numpy())
            for key, value in load_file(str(shard)).items()
        )
    return loaded
23
+
24
+
25
def quantize_unary_interleaved(weight, n_planes):
    """Thermometer-quantize a matrix into the interleaved plane layout.

    Same encoding as the plane-major converter, but magnitude planes are
    packed as [out_dim][chunks][n_planes] so one output row's words are
    contiguous for the kernel.

    Returns (sign_bits, mag_planes, scales, sparsity).
    """
    mat = weight.astype(np.float32)
    n_rows, n_cols = mat.shape
    n_chunks = (n_cols + 63) // 64
    pad = n_chunks * 64 - n_cols

    # Per-row scale = absmax / n_planes; zero rows get a dummy scale.
    absmax = np.max(np.abs(mat), axis=1, keepdims=True)
    absmax = np.where(absmax == 0, 1.0, absmax)
    scales = (absmax.flatten() / n_planes).astype(np.float32)

    mags = np.clip(np.round(np.abs(mat / scales[:, None])).astype(np.int32),
                   0, n_planes)
    neg = mat < 0
    sparsity = np.mean(mags == 0)

    if pad:
        mags = np.concatenate(
            [mags, np.zeros((n_rows, pad), dtype=np.int32)], axis=1)
        neg = np.concatenate(
            [neg, np.zeros((n_rows, pad), dtype=bool)], axis=1)

    lane_bits = np.uint64(1) << np.arange(64, dtype=np.uint64)

    def _pack(mask):
        # 64 booleans -> one uint64 word, per row and chunk.
        grouped = mask.reshape(n_rows, n_chunks, 64).astype(np.uint64)
        return np.bitwise_or.reduce(grouped * lane_bits, axis=2)

    sign_bits = _pack(neg)  # [out_dim][chunks]

    # Magnitude planes INTERLEAVED: [out_dim][chunks][n_planes].
    mag_planes = np.zeros((n_rows, n_chunks, n_planes), dtype=np.uint64)
    for plane in range(n_planes):
        mag_planes[:, :, plane] = _pack(mags >= plane + 1)

    return sign_bits, mag_planes, scales, sparsity
60
+
61
+
62
def convert(model_dir, output_dir, n_planes):
    """Quantize every linear projection of the checkpoint to interleaved
    unary and write the result (plus fp16 leftovers and metadata) to
    output_dir."""
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6, "n_planes": n_planes,
        "quant_type": "unary_interleaved",
    }

    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    linear_keys = [k for k in tensors if any(p in k for p in projections)]
    other_keys = [k for k in tensors if k not in linear_keys]

    print(f"\nUnary: {len(linear_keys)} layers, {n_planes} planes ({2*n_planes+1} levels)")
    print(f"FP16: {len(other_keys)} layers\n")

    with open(os.path.join(output_dir, "config.json"), "w") as fh:
        json.dump(config, fh, indent=2)

    total_unary = total_orig = total_fp16 = 0

    for key in linear_keys:
        weight = tensors[key]
        total_orig += weight.nbytes
        start = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_unary_interleaved(weight, n_planes)
        elapsed = time.time() - start

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")  # [out_dim][chunks][n_planes] contiguous
        scales.tofile(stem + ".scales")

        nbytes = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        total_unary += nbytes
        bpw = (nbytes * 8) / (weight.shape[0] * weight.shape[1])
        print(f" {key}: {weight.shape} -> {nbytes/1024:.0f}KB ({bpw:.1f}bpw, {sparsity:.0%} sparse, {elapsed:.1f}s)")

    for key in other_keys:
        weight = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        weight.tofile(stem + ".fp16")
        total_fp16 += weight.nbytes
        print(f" {key}: {weight.shape} -> fp16 ({weight.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2)

    total = total_unary + total_fp16
    avg_bpw = (total_unary * 8) / sum(np.prod(tensors[k].shape) for k in linear_keys)
    print(f"\n=== Summary ===")
    print(f"Unary weights: {total_unary/1024/1024:.1f} MB ({avg_bpw:.1f} avg bpw)")
    print(f"FP16 weights: {total_fp16/1024/1024:.1f} MB")
    print(f"Total: {total/1024/1024:.1f} MB")
    print(f"Planes: {n_planes}, Levels: {2*n_planes+1}")
    print(f"Layout: interleaved [out_dim][chunks][n_planes]")
    print("Done!")
128
+
129
+
130
if __name__ == "__main__":
    # CLI: [model_dir] [output_dir] [n_planes]
    args = sys.argv
    model_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = args[2] if len(args) > 2 else "deepseek-r1-1.5b-unary31"
    n_planes = int(args[3]) if len(args) > 3 else 31
    convert(model_dir, output_dir, n_planes)
unary_engine.c ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PURE UNARY (BASE-1) TRANSFORMER ENGINE
3
+ * AVX-512 + OpenMP. Full Qwen2 forward pass in C.
4
+ *
5
+ * Thermometer encoding: magnitude M = M planes set.
6
+ * Each plane contributes EXACTLY 1. No powers. No binary.
7
+ * 7 planes = 8 levels {0,1,2,3,4,5,6,7} * sign.
8
+ *
9
+ * Model format on disk (from unary_convert.py):
10
+ * .sign = [out_dim * chunks] uint64 (1=negative)
11
+ * .planes = [n_planes * out_dim * chunks] uint64 (thermometer)
12
+ * .scales = [out_dim] float32 (per-row)
13
+ *
14
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
15
+ */
16
+
17
+ #include <immintrin.h>
18
+ #include <stdint.h>
19
+ #include <stdlib.h>
20
+ #include <string.h>
21
+ #include <math.h>
22
+ #include <stdio.h>
23
+ #include <time.h>
24
+ #include <omp.h>
25
+
26
/* Architecture constants, hard-coded for one Qwen2-family checkpoint
 * (matches the converter's config: hidden 1536, 28 layers, GQA 12/2). */
#define HIDDEN 1536                       /* hidden (embedding) width */
#define INTER 8960                        /* MLP intermediate width */
#define N_HEADS 12                        /* query heads */
#define N_KV_HEADS 2                      /* key/value heads (GQA) */
#define HEAD_DIM 128
#define N_LAYERS 28
#define VOCAB 151936
#define RMS_EPS 1e-6f                     /* RMSNorm epsilon */
#define ROPE_THETA 1000000.0f             /* RoPE base frequency */
#define MAX_SEQ 4096                      /* max cached sequence length */
#define GQA_RATIO (N_HEADS / N_KV_HEADS)  /* query heads per KV head */

/* Unary-quantized linear layer: thermometer bitplanes + per-row scale. */
typedef struct {
    uint64_t *sign_bits; /* [out_dim * chunks], bit set = negative weight */
    uint64_t *mag_planes; /* [n_planes * out_dim * chunks], thermometer */
    float *scales; /* [out_dim] per-row dequant scale */
    float *bias; /* [out_dim] or NULL */
    int out_dim, in_dim, n_planes;
} UL; /* Unary Linear */

/* Dense FP16 linear layer (used for lm_head, see fmv below). */
typedef struct { uint16_t *w; int od, id; } FL; /* FP16 Linear */

/* One transformer layer's weights. */
typedef struct {
    UL qp, kp, vp, op, gp, up, dp;  /* q/k/v/o + gate/up/down projections */
    float *in_norm, *pn_norm;       /* input / post-attn RMSNorm weights */
    float *qb, *kb, *vb;            /* QKV biases */
} Lay;

/* Whole model: weights plus pre-allocated runtime scratch buffers. */
typedef struct {
    uint16_t *emb;        /* token embedding table (fp16) */
    Lay lay[N_LAYERS];
    float *fnorm;         /* final RMSNorm weight */
    FL lmh;               /* lm_head */
    float *kc, *vc; /* KV cache */
    float *h, *h2; /* hidden states */
    float *sq, *sk, *sv; /* QKV scratch */
    float *ao; /* attn output */
    float *sg, *su, *sd; /* MLP scratch */
    float *lg; /* logits */
    float *as; /* attn scores */
    int np;               /* plane count of the loaded checkpoint — confirm vs loader */
} M;
69
+ /* ============================================================
70
+ * PURE UNARY MATVEC
71
+ *
72
+ * y[i] = scales[i] * SUM over planes p:
73
+ * SUM over j where plane_p bit j is set:
74
+ * sign[j]==0 ? +x[j] : -x[j]
75
+ *
76
+ * Each plane contributes 1. Seven planes, seven passes.
77
+ * Embarrassingly parallel over output rows.
78
+ * ============================================================ */
79
/* Unary matvec: y = W @ x for one UL layer.
 * Per output row, each thermometer plane adds sum(x[j]) over its
 * positive-weight set bits minus sum(x[j]) over its negative ones; plane
 * totals are summed with equal weight (pure base-1) and rescaled by the
 * per-row float scale.  AVX-512 uses the plane bitmask directly as a lane
 * mask, 16 floats at a time; rows are processed in parallel. */
static void umv(const UL *L, const float *x, float *y) {
    const int od = L->out_dim, id = L->in_dim, np = L->n_planes;
    const int ch = (id + 63) / 64;    /* 64-bit words per row */
    const int idp = (id + 15) & ~15;  /* input length padded to 16 floats */

    /* Copy x into an aligned, zero-padded buffer so aligned 512-bit loads
     * never read uninitialized data past the logical end. */
    float *xp = (float*)aligned_alloc(64, idp * sizeof(float));
    memcpy(xp, x, id * sizeof(float));
    if (idp > id) memset(xp + id, 0, (idp - id) * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < od; i++) {
        const uint64_t *rs = L->sign_bits + (size_t)i * ch;  /* row sign bits */
        float tot = 0.0f;

        for (int p = 0; p < np; p++) {
            const uint64_t *pr = L->mag_planes + ((size_t)p * od + i) * ch;
            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < ch; c++) {
                uint64_t mb = pr[c], sb = rs[c];
                uint64_t pos = mb & ~sb;  /* set bits with positive weight */
                uint64_t neg = mb & sb;   /* set bits with negative weight */

                /* Four 16-lane groups cover one 64-bit word. */
                for (int g = 0; g < 4; g++) {
                    int off = c * 64 + g * 16;
                    if (off >= idp) break;  /* beyond padded input: no data */
                    __m512 xv = _mm512_load_ps(xp + off);
                    __mmask16 pm = (__mmask16)((pos >> (g*16)) & 0xFFFF);
                    __mmask16 nm = (__mmask16)((neg >> (g*16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pm, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nm, acc, xv);
                }
            }
            /* PURE UNARY: each plane worth exactly 1 */
            tot += _mm512_reduce_add_ps(acc);
        }
        y[i] = tot * L->scales[i];
        if (L->bias) y[i] += L->bias[i];
    }
    free(xp);
}
120
+
121
/* FP16 matvec (lm_head only): y = W @ x, converting fp16 weights to fp32
 * on the fly, 16 per iteration, with a scalar tail for id % 16. */
static void fmv(const FL *L, const float *x, float *y) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < L->od; i++) {
        __m512 acc = _mm512_setzero_ps();
        const uint16_t *row = L->w + (size_t)i * L->id;
        int j;
        for (j = 0; j + 16 <= L->id; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
            acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
        }
        float s = _mm512_reduce_add_ps(acc);
        /* Scalar tail: convert one fp16 weight at a time. */
        for (; j < L->id; j++) {
            float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
            s += wf * x[j];
        }
        y[i] = s;
    }
}
140
+
141
+ /* RMSNorm */
142
+ static void rn(const float *x, const float *w, float *y, int d) {
143
+ __m512 sq = _mm512_setzero_ps();
144
+ int i;
145
+ for (i = 0; i+16 <= d; i += 16) {
146
+ __m512 v = _mm512_loadu_ps(x+i);
147
+ sq = _mm512_fmadd_ps(v, v, sq);
148
+ }
149
+ float ss = _mm512_reduce_add_ps(sq);
150
+ for (; i < d; i++) ss += x[i]*x[i];
151
+ float r = 1.0f / sqrtf(ss/d + RMS_EPS);
152
+ __m512 rv = _mm512_set1_ps(r);
153
+ for (i = 0; i+16 <= d; i += 16)
154
+ _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
155
+ _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
156
+ for (; i < d; i++) y[i] = x[i]*r*w[i];
157
+ }
158
+
159
+ static void silu(float *x, int n) {
160
+ for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
161
+ }
162
+
163
/* Elementwise product c = a * b (16-wide main loop, scalar tail). */
static void emul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
169
+
170
/* In-place vector accumulate: y += x. */
static void va(float *y, const float *x, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(y + k,
            _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k)));
    while (k < n) { y[k] += x[k]; k++; }
}
176
+
177
+ static void rope(float *v, int pos, int d) {
178
+ for (int i = 0; i < d; i += 2) {
179
+ float f = 1.0f / powf(ROPE_THETA, (float)i/d);
180
+ float a = pos*f, co = cosf(a), si = sinf(a);
181
+ float v0 = v[i], v1 = v[i+1];
182
+ v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
183
+ }
184
+ }
185
+
186
+ static void sm(float *x, int n) {
187
+ float mx = x[0];
188
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
189
+ float s = 0;
190
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
191
+ float iv = 1.0f/s;
192
+ for (int i = 0; i < n; i++) x[i] *= iv;
193
+ }
194
+
195
+ static void etok(const M *m, int t, float *o) {
196
+ const uint16_t *r = m->emb + (size_t)t * HIDDEN;
197
+ int i;
198
+ for (i = 0; i+16 <= HIDDEN; i += 16)
199
+ _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
200
+ for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
201
+ }
202
+
203
+ static float* kvp(float *c, int l, int p, int h) {
204
+ return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
205
+ }
206
+
207
/* Single-position causal self-attention for layer l at position pos.
 * Reads the pre-normed hidden state from m->h2 and writes the o_proj
 * output back into m->h2; the residual add happens in forward_token.
 * Appends this position's K/V to the cache and attends over 0..pos. */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* Q/K/V projections via the unary bitplane matvec. */
    umv(&ly->qp, m->h2, m->sq);
    umv(&ly->kp, m->h2, m->sk);
    umv(&ly->vp, m->h2, m->sv);
    /* Optional additive attention biases (pointers may be NULL). */
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* Rotary embedding on every Q head and every KV head. */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    /* Store this position's K/V into the cache. */
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM);   /* 1/sqrt(d) score scaling */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        /* GQA: GQA_RATIO query heads share one KV head. */
        int kvh = h / GQA_RATIO;
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* Scores: q . k_t for every cached position t <= pos. */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* Weighted sum of cached V vectors; near-zero weights skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* Output projection back to the model width. */
    umv(&ly->op, m->ao, m->h2);
}
250
+
251
+ static void do_mlp(M *m, int l) {
252
+ Lay *ly = &m->lay[l];
253
+ umv(&ly->gp, m->h2, m->sg);
254
+ umv(&ly->up, m->h2, m->su);
255
+ silu(m->sg, INTER);
256
+ emul(m->sg, m->su, m->sd, INTER);
257
+ umv(&ly->dp, m->sd, m->h2);
258
+ }
259
+
260
+ float* forward_token(M *m, int tid, int pos) {
261
+ etok(m, tid, m->h);
262
+ for (int l = 0; l < N_LAYERS; l++) {
263
+ rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
264
+ do_attn(m, l, pos);
265
+ va(m->h, m->h2, HIDDEN);
266
+ rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
267
+ do_mlp(m, l);
268
+ va(m->h, m->h2, HIDDEN);
269
+ }
270
+ rn(m->h, m->fnorm, m->h2, HIDDEN);
271
+ fmv(&m->lmh, m->h2, m->lg);
272
+ return m->lg;
273
+ }
274
+
275
/* Temperature + top-p (nucleus) sampling, candidate set capped at 50.
 * Destroys lg: it is scaled by 1/T and softmaxed in place.
 * Returns the sampled token id. */
static int samp(float *lg, int V, float T, float tp) {
    if (T > 0) {
        float inv_t = 1.0f / T;
        for (int k = 0; k < V; k++) lg[k] *= inv_t;
    }
    sm(lg, V);

    float *p = (float*)malloc(V*4);
    int *idx = (int*)malloc(V*4);
    memcpy(p, lg, V*4);
    for (int k = 0; k < V; k++) idx[k] = k;

    /* Partial selection sort: pull the largest remaining prob into
     * slot nk until the kept mass reaches tp (or the 50-token cap). */
    int nk = 0;
    float cum = 0.0f;
    while (cum < tp && nk < V && nk < 50) {
        int best = nk;
        for (int k = nk + 1; k < V; k++)
            if (p[k] > p[best]) best = k;
        float tf = p[nk]; p[nk] = p[best]; p[best] = tf;
        int ti = idx[nk]; idx[nk] = idx[best]; idx[best] = ti;
        cum += p[nk];
        nk++;
    }

    /* Draw proportionally from the truncated mass. */
    float mass = 0.0f;
    for (int k = 0; k < nk; k++) mass += p[k];
    float r = (float)rand()/RAND_MAX * mass;
    int pick = idx[0];
    float acc = 0.0f;
    for (int k = 0; k < nk; k++) {
        acc += p[k];
        if (acc >= r) { pick = idx[k]; break; }
    }
    free(p); free(idx);
    return pick;
}
296
+
297
+ int generate(M *m, const int *pr, int pl, int *out, int mx,
298
+ float T, float tp, int eos) {
299
+ srand(time(NULL));
300
+ for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);
301
+ int pos = pl, gen = 0;
302
+ for (int t = 0; t < mx; t++) {
303
+ int nx;
304
+ if (T <= 0) {
305
+ nx = 0;
306
+ for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
307
+ } else {
308
+ nx = samp(m->lg, VOCAB, T, tp);
309
+ }
310
+ out[t] = nx; gen++;
311
+ if (nx == eos) break;
312
+ forward_token(m, nx, pos); pos++;
313
+ }
314
+ return gen;
315
+ }
316
+
317
+ /* ============================================================
318
+ * ALLOCATION + WEIGHT SETTERS (called from Python)
319
+ * ============================================================ */
320
+ M* model_alloc(int np) {
321
+ M *m = (M*)calloc(1, sizeof(M));
322
+ m->np = np;
323
+ size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
324
+ m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
325
+ m->h = (float*)aligned_alloc(64,HIDDEN*4);
326
+ m->h2 = (float*)aligned_alloc(64,HIDDEN*4);
327
+ m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
328
+ m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
329
+ m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
330
+ m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
331
+ m->sg = (float*)aligned_alloc(64,INTER*4);
332
+ m->su = (float*)aligned_alloc(64,INTER*4);
333
+ m->sd = (float*)aligned_alloc(64,INTER*4);
334
+ m->lg = (float*)aligned_alloc(64,VOCAB*4);
335
+ m->as = (float*)aligned_alloc(64,MAX_SEQ*4);
336
+ m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);
337
+ printf("Alloc: KV=%zuMB np=%d\n", kv*2*4/1024/1024, np);
338
+ return m;
339
+ }
340
+
341
+ void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
342
+ void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
343
+ void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
344
+ m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
345
+ }
346
+ void layer_set_norms(M *m, int l, float *i, float *p) {
347
+ m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
348
+ }
349
+ void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
350
+ m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
351
+ }
352
+ static void set_ul(UL *u, uint64_t *s, uint64_t *p, float *sc, int o, int i, int np) {
353
+ u->sign_bits=s; u->mag_planes=p; u->scales=sc;
354
+ u->out_dim=o; u->in_dim=i; u->n_planes=np; u->bias=NULL;
355
+ }
356
+ void layer_set_linears(M *m, int l,
357
+ uint64_t*qs,uint64_t*qp,float*qc,int qo,int qi,
358
+ uint64_t*ks,uint64_t*kp,float*kc,int ko,int ki,
359
+ uint64_t*vs,uint64_t*vp,float*vc,int vo,int vi,
360
+ uint64_t*os,uint64_t*op,float*oc,int oo,int oi,
361
+ uint64_t*gs,uint64_t*gp,float*gc,int go,int gi,
362
+ uint64_t*us,uint64_t*up,float*uc,int uo,int ui,
363
+ uint64_t*ds,uint64_t*dp,float*dc,int doo,int di, int np) {
364
+ set_ul(&m->lay[l].qp,qs,qp,qc,qo,qi,np);
365
+ set_ul(&m->lay[l].kp,ks,kp,kc,ko,ki,np);
366
+ set_ul(&m->lay[l].vp,vs,vp,vc,vo,vi,np);
367
+ set_ul(&m->lay[l].op,os,op,oc,oo,oi,np);
368
+ set_ul(&m->lay[l].gp,gs,gp,gc,go,gi,np);
369
+ set_ul(&m->lay[l].up,us,up,uc,uo,ui,np);
370
+ set_ul(&m->lay[l].dp,ds,dp,dc,doo,di,np);
371
+ }
372
+ void model_reset_cache(M *m) {
373
+ size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
374
+ memset(m->kc,0,kv*4); memset(m->vc,0,kv*4);
375
+ }
376
+ void model_free(M *m) {
377
+ free(m->kc);free(m->vc);free(m->h);free(m->h2);
378
+ free(m->sq);free(m->sk);free(m->sv);free(m->ao);
379
+ free(m->sg);free(m->su);free(m->sd);
380
+ free(m->lg);free(m->as);free(m->fnorm);free(m);
381
+ }
unary_engine_v2.c ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * UNARY TRANSFORMER ENGINE v2 - Configurable dimensions
3
+ *
4
+ * Full Qwen2/Qwen3 forward pass in C with AVX-512 + OpenMP.
5
+ * Supports any model size via runtime config.
6
+ *
7
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ */
9
+
10
+ #include <immintrin.h>
11
+ #include <omp.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+ #include <math.h>
16
+ #include <stdio.h>
17
+ #include <time.h>
18
+
19
#define MAX_SEQ 4096      /* maximum cached sequence length (sizes the KV cache) */
#define RMS_EPS 1e-6f     /* epsilon inside RMSNorm's rsqrt */

/* ============================================================
 * Config - set at init time
 * ============================================================ */
typedef struct {
    int hidden;          /* model width (embedding dimension) */
    int inter;           /* MLP intermediate width */
    int n_heads;         /* number of query heads */
    int n_kv_heads;      /* number of key/value heads (GQA when < n_heads) */
    int head_dim;        /* per-head dimension */
    int n_layers;        /* decoder layer count */
    int vocab;           /* vocabulary size (logits length) */
    float rope_theta;    /* RoPE frequency base passed to apply_rope */
    int has_attn_bias; /* 1 for Qwen2 (1.5B), 0 for Qwen3 (4B) */
    int tie_embeddings; /* 1 if lm_head shares embed weights */
} Config;

/* ============================================================
 * Unary linear layer
 * ============================================================ */
typedef struct {
    uint64_t *sign_bits;   /* per-row sign bitmap, [out_dim][in_dim/64] chunks */
    uint64_t *mag_planes;  /* magnitude bitplanes, plane-major: [(p*out_dim+i)*chunks] */
    float *scales;         /* one output scale per row */
    float *bias;           /* optional per-row bias, NULL if absent */
    int out_dim;
    int in_dim;
    int n_planes;          /* number of magnitude bitplanes */
} UnaryLinear;

/* FP16 linear (for lm_head when not tied) */
typedef struct {
    uint16_t *weight;      /* row-major [out_dim][in_dim] IEEE half floats */
    int out_dim;
    int in_dim;
} FP16Linear;

/* ============================================================
 * Transformer layer
 * ============================================================ */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;   /* attention projections */
    UnaryLinear gate_proj, up_proj, down_proj;    /* SwiGLU MLP projections */
    float *input_norm;     /* pre-attention RMSNorm weights */
    float *post_norm;      /* pre-MLP RMSNorm weights */
    float *q_bias, *k_bias, *v_bias;  /* Qwen2-style attention biases, may be NULL */
    float *q_norm, *k_norm; /* QK-Norm (Qwen3) */
} Layer;

/* ============================================================
 * Full model
 * ============================================================ */
typedef struct {
    Config cfg;

    uint16_t *embed; /* FP16 embeddings */
    Layer *layers; /* Dynamic array */
    float *final_norm;     /* RMSNorm weights applied before the LM head */
    FP16Linear lm_head; /* Only used if !tie_embeddings */

    /* KV cache: [n_layers][MAX_SEQ][n_kv_heads][head_dim] floats */
    float *k_cache;
    float *v_cache;

    /* Scratch buffers (one token's worth each) */
    float *hidden;      /* residual stream */
    float *hidden2;     /* normed input / sub-block output */
    float *q;           /* query heads */
    float *k;           /* key heads */
    float *v;           /* value heads */
    float *attn_out;    /* concatenated attention head outputs */
    float *gate;        /* MLP gate activations */
    float *up;          /* MLP up activations */
    float *down_in;     /* silu(gate) * up */
    float *logits;      /* vocab-sized output logits */
    float *attn_scores; /* per-position attention scores */

    int n_planes;       /* magnitude bitplane count used by the linears */
} Model;
100
+
101
+ /* ============================================================
102
+ * AVX-512 Unary matvec: y = W @ x
103
+ * ============================================================ */
104
+ static void unary_matvec(
105
+ const UnaryLinear *layer, const float *x, float *y
106
+ ) {
107
+ int out_dim = layer->out_dim;
108
+ int in_dim = layer->in_dim;
109
+ int n_planes = layer->n_planes;
110
+ int chunks = (in_dim + 63) / 64;
111
+ int in_padded = (in_dim + 15) & ~15;
112
+
113
+ #pragma omp parallel for schedule(dynamic, 64)
114
+ for (int i = 0; i < out_dim; i++) {
115
+ const uint64_t *row_sign = layer->sign_bits + (size_t)i * chunks;
116
+ float total = 0.0f;
117
+
118
+ /* Aligned local copy of input for this thread */
119
+ float x_local[in_padded] __attribute__((aligned(64)));
120
+ memcpy(x_local, x, in_dim * sizeof(float));
121
+ if (in_padded > in_dim)
122
+ memset(x_local + in_dim, 0, (in_padded - in_dim) * sizeof(float));
123
+
124
+ for (int p = 0; p < n_planes; p++) {
125
+ const uint64_t *plane_row = layer->mag_planes +
126
+ ((size_t)p * out_dim + i) * chunks;
127
+ __m512 acc = _mm512_setzero_ps();
128
+
129
+ for (int c = 0; c < chunks; c++) {
130
+ uint64_t mbits = plane_row[c];
131
+ uint64_t sbits = row_sign[c];
132
+ uint64_t pos_bits = mbits & ~sbits;
133
+ uint64_t neg_bits = mbits & sbits;
134
+
135
+ for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
136
+ int offset = c * 64 + g * 16;
137
+ __m512 xv = _mm512_load_ps(x_local + offset);
138
+ __mmask16 pmask = (__mmask16)((pos_bits >> (g * 16)) & 0xFFFF);
139
+ __mmask16 nmask = (__mmask16)((neg_bits >> (g * 16)) & 0xFFFF);
140
+ acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
141
+ acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
142
+ }
143
+ }
144
+ total += _mm512_reduce_add_ps(acc);
145
+ }
146
+ y[i] = total * layer->scales[i];
147
+ if (layer->bias) y[i] += layer->bias[i];
148
+ }
149
+ }
150
+
151
+ /* FP16 matvec for lm_head */
152
+ static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
153
+ int out_dim = layer->out_dim;
154
+ int in_dim = layer->in_dim;
155
+ const uint16_t *w = layer->weight;
156
+
157
+ #pragma omp parallel for schedule(dynamic, 256)
158
+ for (int i = 0; i < out_dim; i++) {
159
+ __m512 acc = _mm512_setzero_ps();
160
+ int j;
161
+ for (j = 0; j + 16 <= in_dim; j += 16) {
162
+ __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
163
+ __m512 wv = _mm512_cvtph_ps(h);
164
+ __m512 xv = _mm512_loadu_ps(x + j);
165
+ acc = _mm512_fmadd_ps(wv, xv, acc);
166
+ }
167
+ float sum = _mm512_reduce_add_ps(acc);
168
+ for (; j < in_dim; j++) {
169
+ __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
170
+ __m128 fv = _mm_cvtph_ps(hv);
171
+ float wf;
172
+ _mm_store_ss(&wf, fv);
173
+ sum += wf * x[j];
174
+ }
175
+ y[i] = sum;
176
+ }
177
+ }
178
+
179
+ /* ============================================================
180
+ * Basic ops - all AVX-512 vectorized
181
+ * ============================================================ */
182
+
183
+ static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
184
+ __m512 sum_sq = _mm512_setzero_ps();
185
+ int i;
186
+ for (i = 0; i + 16 <= dim; i += 16) {
187
+ __m512 xv = _mm512_loadu_ps(x + i);
188
+ sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
189
+ }
190
+ float ss = _mm512_reduce_add_ps(sum_sq);
191
+ for (; i < dim; i++) ss += x[i] * x[i];
192
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
193
+
194
+ for (i = 0; i + 16 <= dim; i += 16) {
195
+ __m512 xv = _mm512_loadu_ps(x + i);
196
+ __m512 wv = _mm512_loadu_ps(weight + i);
197
+ __m512 rv = _mm512_set1_ps(rms);
198
+ _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
199
+ }
200
+ for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
201
+ }
202
+
203
+ static void silu_inplace(float *x, int n) {
204
+ int i;
205
+ for (i = 0; i + 16 <= n; i += 16) {
206
+ __m512 xv = _mm512_loadu_ps(x + i);
207
+ __m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), xv);
208
+ /* exp(-x) approximation not great with AVX, use scalar */
209
+ float tmp[16];
210
+ _mm512_storeu_ps(tmp, xv);
211
+ for (int j = 0; j < 16; j++)
212
+ tmp[j] = tmp[j] / (1.0f + expf(-tmp[j]));
213
+ _mm512_storeu_ps(x + i, _mm512_loadu_ps(tmp));
214
+ }
215
+ for (; i < n; i++)
216
+ x[i] = x[i] / (1.0f + expf(-x[i]));
217
+ }
218
+
219
/* Elementwise product c = a * b: 16-wide vector loop plus scalar tail. */
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
228
+
229
/* In-place accumulate y += x: 16-wide vector loop plus scalar tail. */
static void vec_add(float *y, const float *x, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(y + k,
            _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k)));
    while (k < n) { y[k] += x[k]; k++; }
}
238
+
239
+ static void apply_rope(float *vec, int pos, int dim, float theta) {
240
+ for (int i = 0; i < dim; i += 2) {
241
+ float freq = 1.0f / powf(theta, (float)i / dim);
242
+ float angle = pos * freq;
243
+ float cos_a = cosf(angle);
244
+ float sin_a = sinf(angle);
245
+ float v0 = vec[i];
246
+ float v1 = vec[i + 1];
247
+ vec[i] = v0 * cos_a - v1 * sin_a;
248
+ vec[i + 1] = v0 * sin_a + v1 * cos_a;
249
+ }
250
+ }
251
+
252
+ static void softmax(float *x, int n) {
253
+ float max_val = x[0];
254
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
255
+ float sum = 0.0f;
256
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
257
+ float inv = 1.0f / sum;
258
+ for (int i = 0; i < n; i++) x[i] *= inv;
259
+ }
260
+
261
+ /* ============================================================
262
+ * Embedding lookup (FP16 -> FP32)
263
+ * ============================================================ */
264
+ static void embed_token(const Model *m, int token_id, float *out) {
265
+ int hidden = m->cfg.hidden;
266
+ const uint16_t *row = m->embed + (size_t)token_id * hidden;
267
+ int i;
268
+ for (i = 0; i + 16 <= hidden; i += 16) {
269
+ __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
270
+ __m512 fv = _mm512_cvtph_ps(h);
271
+ _mm512_storeu_ps(out + i, fv);
272
+ }
273
+ for (; i < hidden; i++) {
274
+ __m128i hv = _mm_set1_epi16(row[i]);
275
+ __m128 fv = _mm_cvtph_ps(hv);
276
+ _mm_store_ss(out + i, fv);
277
+ }
278
+ }
279
+
280
+ /* KV cache helpers */
281
+ static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
282
+ return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
283
+ (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
284
+ }
285
+
286
+ /* ============================================================
287
+ * ATTENTION
288
+ * ============================================================ */
289
/* Single-position causal self-attention for one layer.
 * Reads the pre-normed hidden state from m->hidden2 and writes the
 * o_proj output back into m->hidden2 (caller adds the residual).
 * Appends this position's K/V to the cache and attends over 0..pos. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;   /* GQA group size */

    /* Q/K/V projections via the unary bitplane matvec. */
    unary_matvec(&layer->q_proj, m->hidden2, m->q);
    unary_matvec(&layer->k_proj, m->hidden2, m->k);
    unary_matvec(&layer->v_proj, m->hidden2, m->v);

    /* Additive attention biases (Qwen2-style); pointers may be NULL. */
    if (c->has_attn_bias) {
        if (layer->q_bias) vec_add(m->q, layer->q_bias, c->n_heads * c->head_dim);
        if (layer->k_bias) vec_add(m->k, layer->k_bias, c->n_kv_heads * c->head_dim);
        if (layer->v_bias) vec_add(m->v, layer->v_bias, c->n_kv_heads * c->head_dim);
    }

    /* QK-Norm (Qwen3): RMSNorm each head's Q and K before RoPE */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q + h * c->head_dim, layer->q_norm, m->q + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k + h * c->head_dim, layer->k_norm, m->k + h * c->head_dim, c->head_dim);
    }

    /* Rotary embedding per head. */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Append this position's K/V to the cache. */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v + h * c->head_dim, c->head_dim * sizeof(float));
    }

    float scale = 1.0f / sqrtf((float)c->head_dim);   /* 1/sqrt(d) scaling */
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        /* GQA: heads_per_kv query heads share one KV head. */
        int kv_h = h / heads_per_kv;
        float *q_head = m->q + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        /* Scores: q . k_t for every cached position t <= pos. */
        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 qv = _mm512_loadu_ps(q_head + d);
                __m512 kv = _mm512_loadu_ps(k_cached + d);
                acc = _mm512_fmadd_ps(qv, kv, acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < c->head_dim; d++) dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        /* Weighted sum of cached V vectors; near-zero weights skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 ov = _mm512_loadu_ps(out_head + d);
                __m512 vv = _mm512_loadu_ps(v_cached + d);
                _mm512_storeu_ps(out_head + d, _mm512_fmadd_ps(wv, vv, ov));
            }
            for (; d < c->head_dim; d++) out_head[d] += w * v_cached[d];
        }
    }

    /* Output projection back to the model width. */
    unary_matvec(&layer->o_proj, m->attn_out, m->hidden2);
}
367
+
368
+ /* ============================================================
369
+ * MLP - SwiGLU
370
+ * ============================================================ */
371
+ static void mlp(Model *m, int layer_idx) {
372
+ Layer *layer = &m->layers[layer_idx];
373
+ int inter = m->cfg.inter;
374
+
375
+ unary_matvec(&layer->gate_proj, m->hidden2, m->gate);
376
+ unary_matvec(&layer->up_proj, m->hidden2, m->up);
377
+
378
+ silu_inplace(m->gate, inter);
379
+ elemwise_mul(m->gate, m->up, m->down_in, inter);
380
+
381
+ unary_matvec(&layer->down_proj, m->down_in, m->hidden2);
382
+ }
383
+
384
+ /* ============================================================
385
+ * FORWARD ONE TOKEN
386
+ * ============================================================ */
387
+ float* forward_token(Model *m, int token_id, int pos) {
388
+ Config *c = &m->cfg;
389
+
390
+ embed_token(m, token_id, m->hidden);
391
+
392
+ for (int l = 0; l < c->n_layers; l++) {
393
+ rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, c->hidden);
394
+ attention(m, l, pos);
395
+ vec_add(m->hidden, m->hidden2, c->hidden);
396
+ rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, c->hidden);
397
+ mlp(m, l);
398
+ vec_add(m->hidden, m->hidden2, c->hidden);
399
+ }
400
+
401
+ rmsnorm(m->hidden, m->final_norm, m->hidden2, c->hidden);
402
+
403
+ /* LM head - either tied embeddings or separate FP16 */
404
+ if (c->tie_embeddings) {
405
+ /* Use embed weights as lm_head (FP16 matvec) */
406
+ FP16Linear tied;
407
+ tied.weight = m->embed;
408
+ tied.out_dim = c->vocab;
409
+ tied.in_dim = c->hidden;
410
+ fp16_matvec(&tied, m->hidden2, m->logits);
411
+ } else {
412
+ fp16_matvec(&m->lm_head, m->hidden2, m->logits);
413
+ }
414
+
415
+ return m->logits;
416
+ }
417
+
418
+ /* ============================================================
419
+ * TOP-P SAMPLING
420
+ * ============================================================ */
421
/* Temperature + nucleus (top-p) sampling. The logits are mutated in
 * place: scaled by 1/temperature, then softmaxed. The candidate set is
 * the highest-probability tokens until their cumulative mass reaches
 * top_p, hard-capped at 40 entries. Returns the sampled token id. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        const float inv_t = 1.0f / temperature;
        for (int k = 0; k < vocab; k++) logits[k] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *order = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int k = 0; k < vocab; k++) order[k] = k;

    /* Partial selection sort: move the next-largest prob into slot
     * `kept`, tracking cumulative mass. */
    int kept = 0;
    float cum = 0.0f;
    while (cum < top_p && kept < vocab) {
        int best = kept;
        for (int k = kept + 1; k < vocab; k++)
            if (probs[k] > probs[best]) best = k;
        float pswap = probs[kept]; probs[kept] = probs[best]; probs[best] = pswap;
        int iswap = order[kept]; order[kept] = order[best]; order[best] = iswap;
        cum += probs[kept];
        kept++;
        if (kept >= 40) break;   /* hard cap on nucleus size */
    }

    /* Draw proportionally from the kept mass. */
    float mass = 0.0f;
    for (int k = 0; k < kept; k++) mass += probs[k];
    float r = (float)rand() / RAND_MAX * mass;
    float run = 0.0f;
    int chosen = order[0];
    for (int k = 0; k < kept; k++) {
        run += probs[k];
        if (run >= r) { chosen = order[k]; break; }
    }

    free(probs);
    free(order);
    return chosen;
}
460
+
461
+ /* ============================================================
462
+ * GENERATE
463
+ * ============================================================ */
464
+ int generate(
465
+ Model *m,
466
+ const int *prompt_ids, int prompt_len,
467
+ int *out_tokens, int max_new_tokens,
468
+ float temperature, float top_p,
469
+ int eos_token
470
+ ) {
471
+ srand(time(NULL));
472
+
473
+ for (int i = 0; i < prompt_len; i++) {
474
+ forward_token(m, prompt_ids[i], i);
475
+ }
476
+
477
+ int pos = prompt_len;
478
+ int generated = 0;
479
+
480
+ for (int t = 0; t < max_new_tokens; t++) {
481
+ float *logits = m->logits;
482
+
483
+ int next_token;
484
+ if (temperature <= 0) {
485
+ next_token = 0;
486
+ for (int i = 1; i < m->cfg.vocab; i++)
487
+ if (logits[i] > logits[next_token]) next_token = i;
488
+ } else {
489
+ next_token = sample_top_p(logits, m->cfg.vocab, temperature, top_p);
490
+ }
491
+
492
+ out_tokens[t] = next_token;
493
+ generated++;
494
+
495
+ if (next_token == eos_token) break;
496
+
497
+ forward_token(m, next_token, pos);
498
+ pos++;
499
+ }
500
+
501
+ return generated;
502
+ }
503
+
504
+ /* ============================================================
505
+ * MODEL ALLOCATION with config
506
+ * ============================================================ */
507
+ Model* model_alloc(
508
+ int n_planes,
509
+ int hidden, int inter, int n_heads, int n_kv_heads,
510
+ int head_dim, int n_layers, int vocab,
511
+ float rope_theta, int has_attn_bias, int tie_embeddings
512
+ ) {
513
+ Model *m = (Model *)calloc(1, sizeof(Model));
514
+ m->n_planes = n_planes;
515
+
516
+ Config *c = &m->cfg;
517
+ c->hidden = hidden;
518
+ c->inter = inter;
519
+ c->n_heads = n_heads;
520
+ c->n_kv_heads = n_kv_heads;
521
+ c->head_dim = head_dim;
522
+ c->n_layers = n_layers;
523
+ c->vocab = vocab;
524
+ c->rope_theta = rope_theta;
525
+ c->has_attn_bias = has_attn_bias;
526
+ c->tie_embeddings = tie_embeddings;
527
+
528
+ m->layers = (Layer *)calloc(n_layers, sizeof(Layer));
529
+
530
+ size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
531
+ m->k_cache = (float *)calloc(kv_size, sizeof(float));
532
+ m->v_cache = (float *)calloc(kv_size, sizeof(float));
533
+
534
+ m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
535
+ m->hidden2 = (float *)aligned_alloc(64, hidden * sizeof(float));
536
+ m->q = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
537
+ m->k = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
538
+ m->v = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
539
+ m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
540
+ m->gate = (float *)aligned_alloc(64, inter * sizeof(float));
541
+ m->up = (float *)aligned_alloc(64, inter * sizeof(float));
542
+ m->down_in = (float *)aligned_alloc(64, inter * sizeof(float));
543
+ m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
544
+ m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
545
+ m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));
546
+
547
+ size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);
548
+ printf("Model config: hidden=%d inter=%d heads=%d kv_heads=%d layers=%d vocab=%d\n",
549
+ hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
550
+ printf("KV cache: %zu MB, tied_embed=%d, attn_bias=%d\n",
551
+ kv_mb, tie_embeddings, has_attn_bias);
552
+
553
+ return m;
554
+ }
555
+
556
+ /* Weight setters */
557
+ void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
558
+ void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
559
+ void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
560
+ m->lm_head.weight = data;
561
+ m->lm_head.out_dim = out_dim;
562
+ m->lm_head.in_dim = in_dim;
563
+ }
564
+
565
+ void layer_set_norms(Model *m, int l, float *input_norm, float *post_norm) {
566
+ m->layers[l].input_norm = input_norm;
567
+ m->layers[l].post_norm = post_norm;
568
+ }
569
+
570
+ void layer_set_bias(Model *m, int l, float *q_bias, float *k_bias, float *v_bias) {
571
+ m->layers[l].q_bias = q_bias;
572
+ m->layers[l].k_bias = k_bias;
573
+ m->layers[l].v_bias = v_bias;
574
+ }
575
+
576
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
577
+ m->layers[l].q_norm = q_norm;
578
+ m->layers[l].k_norm = k_norm;
579
+ }
580
+
581
+ void layer_set_unary(
582
+ UnaryLinear *ul,
583
+ uint64_t *sign, uint64_t *planes, float *scales,
584
+ int out_dim, int in_dim, int n_planes
585
+ ) {
586
+ ul->sign_bits = sign;
587
+ ul->mag_planes = planes;
588
+ ul->scales = scales;
589
+ ul->out_dim = out_dim;
590
+ ul->in_dim = in_dim;
591
+ ul->n_planes = n_planes;
592
+ ul->bias = NULL;
593
+ }
594
+
595
+ void layer_set_linears(
596
+ Model *m, int l,
597
+ uint64_t *q_sign, uint64_t *q_planes, float *q_scales, int q_out, int q_in,
598
+ uint64_t *k_sign, uint64_t *k_planes, float *k_scales, int k_out, int k_in,
599
+ uint64_t *v_sign, uint64_t *v_planes, float *v_scales, int v_out, int v_in,
600
+ uint64_t *o_sign, uint64_t *o_planes, float *o_scales, int o_out, int o_in,
601
+ uint64_t *g_sign, uint64_t *g_planes, float *g_scales, int g_out, int g_in,
602
+ uint64_t *u_sign, uint64_t *u_planes, float *u_scales, int u_out, int u_in,
603
+ uint64_t *d_sign, uint64_t *d_planes, float *d_scales, int d_out, int d_in,
604
+ int n_planes
605
+ ) {
606
+ layer_set_unary(&m->layers[l].q_proj, q_sign, q_planes, q_scales, q_out, q_in, n_planes);
607
+ layer_set_unary(&m->layers[l].k_proj, k_sign, k_planes, k_scales, k_out, k_in, n_planes);
608
+ layer_set_unary(&m->layers[l].v_proj, v_sign, v_planes, v_scales, v_out, v_in, n_planes);
609
+ layer_set_unary(&m->layers[l].o_proj, o_sign, o_planes, o_scales, o_out, o_in, n_planes);
610
+ layer_set_unary(&m->layers[l].gate_proj, g_sign, g_planes, g_scales, g_out, g_in, n_planes);
611
+ layer_set_unary(&m->layers[l].up_proj, u_sign, u_planes, u_scales, u_out, u_in, n_planes);
612
+ layer_set_unary(&m->layers[l].down_proj, d_sign, d_planes, d_scales, d_out, d_in, n_planes);
613
+ }
614
+
615
+ void model_reset_cache(Model *m) {
616
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
617
+ memset(m->k_cache, 0, kv_size * sizeof(float));
618
+ memset(m->v_cache, 0, kv_size * sizeof(float));
619
+ }
620
+
621
+ void model_free(Model *m) {
622
+ free(m->k_cache); free(m->v_cache);
623
+ free(m->hidden); free(m->hidden2);
624
+ free(m->q); free(m->k); free(m->v);
625
+ free(m->attn_out); free(m->gate); free(m->up); free(m->down_in);
626
+ free(m->logits); free(m->attn_scores); free(m->final_norm);
627
+ free(m->layers);
628
+ free(m);
629
+ }
unary_full.c ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * FULL UNARY ENGINE - Weights AND Activations in Base-1
3
+ *
4
+ * True unary: the entire matmul is popcount of ANDed bitplanes.
5
+ * No floating point in the inner loop. No multiplication anywhere.
6
+ *
7
+ * Weight w with magnitude M_w (thermometer: M_w planes with bit set)
8
+ * Activation x with magnitude M_x (thermometer: M_x planes with bit set)
9
+ *
10
+ * dot(w, x) for row i:
11
+ * For each weight plane p (0..W-1) and act plane q (0..A-1):
12
+ * contribution = popcount( w_plane_p[i] AND act_plane_q AND same_sign )
13
+ * - popcount( w_plane_p[i] AND act_plane_q AND diff_sign )
14
+ * y[i] = sum_of_contributions * w_scale[i] * act_scale
15
+ *
16
+ * The outer sum has W*A terms, each is a popcount over 64 elements.
17
+ * With W=4, A=4: 16 popcounts per 64 elements = insanely fast.
18
+ *
19
+ * AVX-512 VPOPCNTDQ: one instruction for 8x64-bit popcounts.
20
+ * On Skylake (no VPOPCNTDQ): use Harley-Seal or scalar POPCNT.
21
+ *
22
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
23
+ */
24
+
25
+ #include <immintrin.h>
26
+ #include <stdint.h>
27
+ #include <stdlib.h>
28
+ #include <string.h>
29
+ #include <math.h>
30
+ #include <stdio.h>
31
+ #include <time.h>
32
+ #include <omp.h>
33
+ #include <x86intrin.h>
34
+
35
+ /* ============================================================
36
+ * Config (DeepSeek-R1-Distill-Qwen-1.5B)
37
+ * ============================================================ */
38
+ #define HIDDEN 1536
39
+ #define INTER 8960
40
+ #define N_HEADS 12
41
+ #define N_KV_HEADS 2
42
+ #define HEAD_DIM 128
43
+ #define N_LAYERS 28
44
+ #define VOCAB 151936
45
+ #define RMS_EPS 1e-6f
46
+ #define ROPE_THETA 1000000.0f
47
+ #define MAX_SEQ 4096
48
+ #define HEADS_PER_KV (N_HEADS / N_KV_HEADS)
49
+
50
+ /* Unary config */
51
+ #define W_PLANES 4 /* weight magnitude planes (5 levels: 0-4) */
52
+ #define A_PLANES 8 /* activation magnitude planes (9 levels: 0-8) */
53
+
54
+ /* ============================================================
55
+ * Portable popcount for 64-bit
56
+ * Uses hardware POPCNT (available on Skylake)
57
+ * ============================================================ */
58
/* 64-bit population count.  __builtin_popcountll compiles to a single
 * hardware POPCNT when the target supports it (the file is built for
 * Skylake-class CPUs per the header comment). */
static inline int popcnt64(uint64_t x) {
    return __builtin_popcountll(x);
}
61
+
62
+ /* ============================================================
63
+ * Unary Linear Layer (weight storage)
64
+ * ============================================================ */
65
typedef struct {
    /* Bit-packed quantized weight rows.  For output row i, chunk c covers
     * input columns [64c, 64c+64).  A set sign bit means negative; magnitude
     * planes are thermometer coded (plane p set <=> integer magnitude > p),
     * so a weight's magnitude equals its number of set planes. */
    uint64_t *sign_bits;  /* [out_dim * chunks] - 1=negative */
    uint64_t *mag_planes; /* [n_planes * out_dim * chunks], plane-major */
    float *scales;        /* [out_dim] per-row dequantization scale */
    float *bias;          /* [out_dim] or NULL */
    int out_dim;
    int in_dim;
    int n_planes;         /* number of weight magnitude planes stored */
} UnaryLinear;

/* Dense FP16 weights - kept only where quantization would hurt most
 * (lm_head; token embeddings use the same uint16 half-float storage). */
typedef struct {
    uint16_t *weight;     /* [out_dim * in_dim] IEEE half floats */
    int out_dim;
    int in_dim;
} FP16Linear;

/* ============================================================
 * Quantized Activation Buffer
 * Activations quantized on the fly to the same sign+thermometer format
 * as the weights, with a single per-vector scale.
 * ============================================================ */
typedef struct {
    uint64_t *sign_bits;  /* [chunks] */
    uint64_t *mag_planes; /* [A_PLANES * chunks] */
    float scale;          /* single scale for the entire vector */
    int dim;
    int chunks;           /* ceil(dim / 64) */
} QuantAct;

/* ============================================================
 * Transformer Layer: seven unary projections plus float norm gains and
 * optional attention biases (Qwen-style q/k/v bias).
 * ============================================================ */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;
    UnaryLinear gate_proj, up_proj, down_proj;
    float *input_norm;    /* RMSNorm gain applied before attention */
    float *post_norm;     /* RMSNorm gain applied before the MLP */
    float *q_bias, *k_bias, *v_bias;  /* NULL when the model has none */
} Layer;

/* ============================================================
 * Full Model: weights, KV cache, and reusable scratch buffers so the
 * forward pass performs no per-token allocation.
 * ============================================================ */
typedef struct {
    uint16_t *embed;      /* [VOCAB * HIDDEN] FP16 token embeddings (borrowed) */
    Layer layers[N_LAYERS];
    float *final_norm;
    FP16Linear lm_head;

    /* KV cache (kept as float - only used in attention dot products) */
    float *k_cache;
    float *v_cache;

    /* Float scratch (for between operations) */
    float *hidden;
    float *hidden2;
    float *q_buf;         /* [N_HEADS * HEAD_DIM] */
    float *k_buf;         /* [N_KV_HEADS * HEAD_DIM] */
    float *v_buf;
    float *attn_out;
    float *gate_buf;      /* [INTER] */
    float *up_buf;
    float *mlp_buf;       /* [INTER] holds silu(gate)*up */
    float *logits;
    float *attn_scores;

    /* Quantized activation buffers (reused every projection) */
    QuantAct qa_hidden;   /* for HIDDEN-dim activations */
    QuantAct qa_inter;    /* for INTER-dim activations */

    int n_w_planes;
    int n_a_planes;
} Model;
138
+
139
+
140
+ /* ============================================================
141
+ * QUANTIZE ACTIVATION TO UNARY (on the fly)
142
+ *
143
+ * Takes float vector, produces unary bitplanes.
144
+ * This is the key operation that enables full-unary matmul.
145
+ * ============================================================ */
146
+ static void quantize_activation(const float *x, QuantAct *qa) {
147
+ int dim = qa->dim;
148
+ int chunks = qa->chunks;
149
+ int n_planes = A_PLANES;
150
+
151
+ /* Find absmax for scale */
152
+ float absmax = 0.0f;
153
+ for (int i = 0; i < dim; i++) {
154
+ float a = fabsf(x[i]);
155
+ if (a > absmax) absmax = a;
156
+ }
157
+ if (absmax == 0.0f) absmax = 1.0f;
158
+
159
+ qa->scale = absmax / n_planes;
160
+ float inv_scale = 1.0f / qa->scale;
161
+
162
+ /* Clear bitplanes */
163
+ memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
164
+ memset(qa->mag_planes, 0, n_planes * chunks * sizeof(uint64_t));
165
+
166
+ /* Quantize and pack into bitplanes */
167
+ for (int i = 0; i < dim; i++) {
168
+ int chunk = i / 64;
169
+ int bit = i % 64;
170
+ uint64_t mask = (uint64_t)1 << bit;
171
+
172
+ float val = x[i];
173
+ if (val < 0) {
174
+ qa->sign_bits[chunk] |= mask;
175
+ val = -val;
176
+ }
177
+
178
+ int mag = (int)(val * inv_scale + 0.5f);
179
+ if (mag > n_planes) mag = n_planes;
180
+
181
+ /* Thermometer: set planes 0..mag-1 */
182
+ for (int p = 0; p < mag; p++) {
183
+ qa->mag_planes[p * chunks + chunk] |= mask;
184
+ }
185
+ }
186
+ }
187
+
188
+ /* Vectorized quantize - process 64 elements at a time */
189
+ static void quantize_activation_fast(const float *x, QuantAct *qa) {
190
+ int dim = qa->dim;
191
+ int chunks = qa->chunks;
192
+ int padded = chunks * 64;
193
+ int n_planes = A_PLANES;
194
+
195
+ /* Find absmax with AVX-512 */
196
+ __m512 vmax = _mm512_setzero_ps();
197
+ int i;
198
+ for (i = 0; i + 16 <= dim; i += 16) {
199
+ __m512 xv = _mm512_loadu_ps(x + i);
200
+ __m512 av = _mm512_abs_ps(xv);
201
+ vmax = _mm512_max_ps(vmax, av);
202
+ }
203
+ float absmax = _mm512_reduce_max_ps(vmax);
204
+ for (; i < dim; i++) {
205
+ float a = fabsf(x[i]);
206
+ if (a > absmax) absmax = a;
207
+ }
208
+ if (absmax == 0.0f) absmax = 1.0f;
209
+
210
+ qa->scale = absmax / n_planes;
211
+ float inv_scale = (float)n_planes / absmax;
212
+
213
+ /* Clear */
214
+ memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
215
+ memset(qa->mag_planes, 0, n_planes * chunks * sizeof(uint64_t));
216
+
217
+ /* Process 16 floats at a time, pack bits */
218
+ __m512 v_inv = _mm512_set1_ps(inv_scale);
219
+ __m512 v_half = _mm512_set1_ps(0.5f);
220
+ __m512 v_zero = _mm512_setzero_ps();
221
+
222
+ for (int c = 0; c < chunks; c++) {
223
+ uint64_t sign_word = 0;
224
+ uint64_t plane_words[A_PLANES];
225
+ memset(plane_words, 0, sizeof(plane_words));
226
+
227
+ for (int g = 0; g < 4; g++) {
228
+ int offset = c * 64 + g * 16;
229
+ if (offset >= dim) break;
230
+
231
+ /* Load 16 floats */
232
+ __m512 xv;
233
+ if (offset + 16 <= dim) {
234
+ xv = _mm512_loadu_ps(x + offset);
235
+ } else {
236
+ /* Partial load at end */
237
+ xv = _mm512_setzero_ps();
238
+ for (int j = 0; j < dim - offset; j++) {
239
+ ((float*)&xv)[j] = x[offset + j];
240
+ }
241
+ }
242
+
243
+ /* Sign: negative mask */
244
+ __mmask16 neg_mask = _mm512_cmplt_ps_mask(xv, v_zero);
245
+ sign_word |= ((uint64_t)neg_mask << (g * 16));
246
+
247
+ /* Absolute value and quantize */
248
+ __m512 av = _mm512_abs_ps(xv);
249
+ __m512 qv = _mm512_fmadd_ps(av, v_inv, v_half);
250
+
251
+ /* Convert to int and clamp */
252
+ __m512i iv = _mm512_cvttps_epi32(qv);
253
+ __m512i v_max = _mm512_set1_epi32(n_planes);
254
+ iv = _mm512_min_epi32(iv, v_max);
255
+
256
+ /* Thermometer encode: plane p has bit set if magnitude > p */
257
+ for (int p = 0; p < n_planes; p++) {
258
+ __m512i vp = _mm512_set1_epi32(p + 1);
259
+ __mmask16 active = _mm512_cmpge_epi32_mask(iv, vp);
260
+ plane_words[p] |= ((uint64_t)active << (g * 16));
261
+ }
262
+ }
263
+
264
+ qa->sign_bits[c] = sign_word;
265
+ for (int p = 0; p < n_planes; p++) {
266
+ qa->mag_planes[p * chunks + c] = plane_words[p];
267
+ }
268
+ }
269
+ }
270
+
271
+
272
+ /* ============================================================
273
+ * FULL-UNARY MATVEC via POPCOUNT
274
+ *
275
+ * y[i] = w_scale[i] * act_scale *
276
+ * sum_{p=0}^{W-1} sum_{q=0}^{A-1}
277
+ * ( popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND ~a_sign) // both positive
278
+ * + popcount(w_plane_p[i] AND a_plane_q AND w_sign AND a_sign) // both negative (neg*neg=pos)
279
+ * - popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND a_sign) // pos weight * neg act
280
+ * - popcount(w_plane_p[i] AND a_plane_q AND w_sign AND ~a_sign) ) // neg weight * pos act
281
+ *
282
+ * Simplification: same_sign = ~(w_sign XOR a_sign), diff_sign = w_sign XOR a_sign
283
+ * contribution = popcount(w_plane AND a_plane AND same_sign)
284
+ * - popcount(w_plane AND a_plane AND diff_sign)
285
+ * ============================================================ */
286
/* y = W x computed entirely in the bit domain: for each output row, AND
 * every weight magnitude plane with every activation plane and popcount,
 * adding where the operand signs agree and subtracting where they differ
 * (derivation in the banner comment above).  The inner accumulator is an
 * integer; floats only appear in the final per-row scaling.  Rows are
 * distributed across OpenMP threads. */
static void unary_matvec_popcount(
    const UnaryLinear *layer, const QuantAct *qa, float *y
) {
    int out_dim = layer->out_dim;
    int chunks = qa->chunks;
    int n_w = layer->n_planes;
    int n_a = A_PLANES;
    float act_scale = qa->scale;

    #pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign = layer->sign_bits + (size_t)i * chunks;
        long total = 0; /* integer accumulator! */

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign[c];
            uint64_t as = qa->sign_bits[c];
            uint64_t same_sign = ~(ws ^ as); /* bits where signs agree */
            uint64_t diff_sign = ws ^ as;    /* bits where signs differ */

            for (int p = 0; p < n_w; p++) {
                uint64_t wp = layer->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < n_a; q++) {
                    uint64_t aq = qa->mag_planes[q * chunks + c];
                    uint64_t active = wp & aq; /* both have magnitude at this level */

                    total += popcnt64(active & same_sign);
                    total -= popcnt64(active & diff_sign);
                }
            }
        }

        /* product of the two quantization steps restores the real scale */
        y[i] = (float)total * layer->scales[i] * act_scale;
        if (layer->bias) y[i] += layer->bias[i];
    }
}
323
+
324
+
325
+ /* ============================================================
326
+ * FP16 matvec for lm_head (final projection to vocab)
327
+ * ============================================================ */
328
/* y = W x with FP16 weights - used for the final lm_head projection.
 * Rows run in parallel; each row converts 16 half floats at a time
 * (VCVTPH2PS) and FMAs against x.  The scalar tail broadcasts one half
 * into a vector purely to reuse the hardware half->float converter. */
static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
    int out_dim = layer->out_dim;
    int in_dim = layer->in_dim;
    const uint16_t *w = layer->weight;

    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        for (; j < in_dim; j++) {
            /* scalar tail: convert one half via the SSE path */
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
354
+
355
+ /* ============================================================
356
+ * Basic ops (still float for norms, residuals, attention)
357
+ * ============================================================ */
358
/* RMSNorm: y[i] = x[i] * rsqrt(mean(x^2) + RMS_EPS) * weight[i].
 * Two passes: a 16-wide sum-of-squares reduction (scalar tail), then a
 * 16-wide scale-and-store pass that restarts from index 0. */
static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];
    float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
376
+
377
+ static void silu_inplace(float *x, int n) {
378
+ int i;
379
+ __m512 one = _mm512_set1_ps(1.0f);
380
+ /* SiLU vectorized: x / (1 + exp(-x)) */
381
+ for (i = 0; i < n; i++) {
382
+ x[i] = x[i] / (1.0f + expf(-x[i]));
383
+ }
384
+ }
385
+
386
/* c[i] = a[i] * b[i] for i in [0, n): 16-wide AVX-512 main loop followed
 * by a scalar tail for the remaining elements. */
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
    int idx = 0;
    for (; idx + 16 <= n; idx += 16) {
        const __m512 lhs = _mm512_loadu_ps(a + idx);
        const __m512 rhs = _mm512_loadu_ps(b + idx);
        _mm512_storeu_ps(c + idx, _mm512_mul_ps(lhs, rhs));
    }
    while (idx < n) {
        c[idx] = a[idx] * b[idx];
        idx++;
    }
}
395
+
396
/* In-place accumulate: y[i] += x[i] for i in [0, n).
 * 16-wide AVX-512 body with a scalar remainder loop. */
static void vec_add(float *y, const float *x, int n) {
    int idx = 0;
    for (; idx + 16 <= n; idx += 16) {
        const __m512 acc = _mm512_loadu_ps(y + idx);
        const __m512 inc = _mm512_loadu_ps(x + idx);
        _mm512_storeu_ps(y + idx, _mm512_add_ps(acc, inc));
    }
    while (idx < n) {
        y[idx] += x[idx];
        idx++;
    }
}
405
+
406
+ static void apply_rope(float *vec, int pos, int dim) {
407
+ for (int i = 0; i < dim; i += 2) {
408
+ float freq = 1.0f / powf(ROPE_THETA, (float)i / dim);
409
+ float angle = pos * freq;
410
+ float c = cosf(angle), s = sinf(angle);
411
+ float v0 = vec[i], v1 = vec[i + 1];
412
+ vec[i] = v0 * c - v1 * s;
413
+ vec[i + 1] = v0 * s + v1 * c;
414
+ }
415
+ }
416
+
417
+ static void softmax(float *x, int n) {
418
+ float max_val = x[0];
419
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
420
+ float sum = 0.0f;
421
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
422
+ float inv = 1.0f / sum;
423
+ for (int i = 0; i < n; i++) x[i] *= inv;
424
+ }
425
+
426
/* Copy the FP16 embedding row for `token_id` into `out` as float.
 * The main loop converts 16 halfs per iteration; the tail broadcasts a
 * single half through _mm_cvtph_ps to reuse the hardware converter. */
static void embed_token(const Model *m, int token_id, float *out) {
    const uint16_t *row = m->embed + (size_t)token_id * HIDDEN;
    int i;
    for (i = 0; i + 16 <= HIDDEN; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        _mm512_storeu_ps(out + i, _mm512_cvtph_ps(h));
    }
    for (; i < HIDDEN; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        _mm_store_ss(out + i, _mm_cvtph_ps(hv));
    }
}
438
+
439
+ static float* kv_ptr(float *cache, int layer, int pos, int kv_head) {
440
+ return cache + ((size_t)layer * MAX_SEQ * N_KV_HEADS +
441
+ (size_t)pos * N_KV_HEADS + kv_head) * HEAD_DIM;
442
+ }
443
+
444
+ /* ============================================================
445
+ * ATTENTION
446
+ *
447
+ * Q/K/V projections use full-unary popcount matmul.
448
+ * Attention scores and value accumulation stay float
449
+ * (these are O(seq_len) not O(dim²), not the bottleneck).
450
+ * ============================================================ */
451
/* One attention step for `layer_idx` at sequence position `pos`.
 * Q/K/V and the output projection go through the unary popcount matmul
 * (m->hidden2 is quantized to bitplanes first); score/softmax/value-mix
 * stays in float since it is O(seq) work, not O(dim^2).
 * Reads m->hidden2 as input and overwrites m->hidden2 with the projected
 * attention output; the residual add is the caller's job. */
static void attention(Model *m, int layer_idx, int pos) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 to unary for projections */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* Project Q, K, V via popcount matvec */
    unary_matvec_popcount(&layer->q_proj, &m->qa_hidden, m->q_buf);
    unary_matvec_popcount(&layer->k_proj, &m->qa_hidden, m->k_buf);
    unary_matvec_popcount(&layer->v_proj, &m->qa_hidden, m->v_buf);

    /* Optional attention biases (Qwen-style) */
    if (layer->q_bias) vec_add(m->q_buf, layer->q_bias, N_HEADS * HEAD_DIM);
    if (layer->k_bias) vec_add(m->k_buf, layer->k_bias, N_KV_HEADS * HEAD_DIM);
    if (layer->v_bias) vec_add(m->v_buf, layer->v_bias, N_KV_HEADS * HEAD_DIM);

    /* RoPE on Q and K only */
    for (int h = 0; h < N_HEADS; h++)
        apply_rope(m->q_buf + h * HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++)
        apply_rope(m->k_buf + h * HEAD_DIM, pos, HEAD_DIM);

    /* Append this position's K/V to the cache */
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kv_ptr(m->k_cache, layer_idx, pos, h), m->k_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
        memcpy(kv_ptr(m->v_cache, layer_idx, pos, h), m->v_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
    }

    /* Scaled dot-product attention with GQA: HEADS_PER_KV query heads share
     * each KV head. */
    float scale = 1.0f / sqrtf((float)HEAD_DIM);
    memset(m->attn_out, 0, N_HEADS * HEAD_DIM * sizeof(float));

    for (int h = 0; h < N_HEADS; h++) {
        int kv_h = h / HEADS_PER_KV;
        float *qh = m->q_buf + h * HEAD_DIM;
        float *oh = m->attn_out + h * HEAD_DIM;

        /* scores[t] = (q . k_t) / sqrt(d) over the causal window 0..pos */
        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                acc = _mm512_fmadd_ps(_mm512_loadu_ps(qh + d), _mm512_loadu_ps(kc + d), acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < HEAD_DIM; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        /* Weighted sum of V; near-zero weights skipped as a cheap cutoff */
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;
            float *vc = kv_ptr(m->v_cache, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                __m512 ov = _mm512_loadu_ps(oh + d);
                _mm512_storeu_ps(oh + d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vc + d), ov));
            }
            for (; d < HEAD_DIM; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection: quantize attn_out, then popcount matvec */
    quantize_activation_fast(m->attn_out, &m->qa_hidden);
    unary_matvec_popcount(&layer->o_proj, &m->qa_hidden, m->hidden2);
}
520
+
521
+ /* ============================================================
522
+ * MLP - SwiGLU with unary matmuls
523
+ * ============================================================ */
524
/* SwiGLU MLP: hidden2 <- down( silu(gate(hidden2)) * up(hidden2) ).
 * Both HIDDEN->INTER projections share a single quantization of hidden2;
 * the INTER-wide intermediate is re-quantized before the down projection.
 * Residual addition is the caller's responsibility. */
static void mlp(Model *m, int layer_idx) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 once for both gate and up */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* gate and up projections via popcount */
    unary_matvec_popcount(&layer->gate_proj, &m->qa_hidden, m->gate_buf);
    unary_matvec_popcount(&layer->up_proj, &m->qa_hidden, m->up_buf);

    /* SwiGLU: silu(gate) * up */
    silu_inplace(m->gate_buf, INTER);
    elemwise_mul(m->gate_buf, m->up_buf, m->mlp_buf, INTER);

    /* Down projection back to HIDDEN: quantize intermediate, popcount matvec */
    quantize_activation_fast(m->mlp_buf, &m->qa_inter);
    unary_matvec_popcount(&layer->down_proj, &m->qa_inter, m->hidden2);
}
542
+
543
+ /* ============================================================
544
+ * FORWARD ONE TOKEN
545
+ * ============================================================ */
546
/* Run one token through all layers and return the logits buffer (owned by
 * the model, overwritten on every call).  Standard pre-norm transformer:
 * x += attn(norm(x)); x += mlp(norm(x)).  `pos` indexes the KV cache and
 * must advance by 1 per call within a sequence. */
float* forward_token(Model *m, int token_id, int pos) {
    embed_token(m, token_id, m->hidden);

    for (int l = 0; l < N_LAYERS; l++) {
        rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, HIDDEN);
        attention(m, l, pos);                    /* hidden2 <- attention output */
        vec_add(m->hidden, m->hidden2, HIDDEN);  /* residual */

        rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, HIDDEN);
        mlp(m, l);                               /* hidden2 <- MLP output */
        vec_add(m->hidden, m->hidden2, HIDDEN);  /* residual */
    }

    rmsnorm(m->hidden, m->final_norm, m->hidden2, HIDDEN);
    fp16_matvec(&m->lm_head, m->hidden2, m->logits);

    return m->logits;
}
564
+
565
+ /* ============================================================
566
+ * SAMPLING
567
+ * ============================================================ */
568
/* Temperature + nucleus (top-p) sampling.  `logits` is modified in place:
 * scaled by 1/temperature, then softmaxed into probabilities.
 * Candidates are extracted by partial selection sort until the cumulative
 * probability reaches top_p.
 * NOTE(review): the `n_keep < 40` bound doubles as a hard top-k=40 cap and
 * makes the scan O(40 * vocab); looks like a deliberate trade-off but is
 * undocumented - confirm.  Uses rand(), so the caller controls seeding. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: move the next-largest prob into slot n_keep */
    int n_keep = 0;
    float cum = 0.0f;
    while (cum < top_p && n_keep < vocab && n_keep < 40) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp_p = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp_p;
        int tmp_i = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = tmp_i;
        cum += probs[n_keep];
        n_keep++;
    }

    /* Renormalize over the kept set and draw uniformly within it */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0];
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i]; if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
603
+
604
+ /* ============================================================
605
+ * GENERATE
606
+ * ============================================================ */
607
/* Autoregressive decode: prefill the prompt, then emit up to max_new_tokens
 * tokens into out_tokens, stopping after producing eos_token (which is
 * counted and stored).  Returns the number of tokens written.
 * temperature <= 0 selects greedy argmax; the sample_top_p arm of the
 * ternary only runs when temperature > 0.
 * NOTE(review): srand(time(NULL)) reseeds on every call, so two calls in
 * the same second draw identical samples - confirm that is acceptable. */
int generate(
    Model *m,
    const int *prompt_ids, int prompt_len,
    int *out_tokens, int max_new_tokens,
    float temperature, float top_p,
    int eos_token
) {
    srand(time(NULL));

    /* Prefill: after this loop m->logits belongs to the last prompt token */
    for (int i = 0; i < prompt_len; i++)
        forward_token(m, prompt_ids[i], i);

    int pos = prompt_len;
    int generated = 0;

    for (int t = 0; t < max_new_tokens; t++) {
        float *logits = m->logits;
        int next = (temperature <= 0) ? 0 : sample_top_p(logits, VOCAB, temperature, top_p);
        if (temperature <= 0) {
            /* Greedy: argmax over the raw logits */
            for (int i = 1; i < VOCAB; i++)
                if (logits[i] > logits[next]) next = i;
        }
        out_tokens[t] = next;
        generated++;
        if (next == eos_token) break;
        forward_token(m, next, pos);
        pos++;
    }
    return generated;
}
637
+
638
+ /* ============================================================
639
+ * ALLOCATE QUANTIZED ACTIVATION BUFFER
640
+ * ============================================================ */
641
/* Allocate bitplane storage for a dim-wide quantized activation buffer;
 * chunks = ceil(dim / 64).
 * NOTE(review): C11 aligned_alloc requires the size to be a multiple of
 * the alignment; chunks * 8 is not always a multiple of 64 (e.g. INTER ->
 * 140 chunks -> 1120 bytes), which is UB pre-C17 - confirm the target.
 * Allocation failures are not checked. */
static void qa_alloc(QuantAct *qa, int dim) {
    qa->dim = dim;
    qa->chunks = (dim + 63) / 64;
    qa->sign_bits = (uint64_t *)aligned_alloc(64, qa->chunks * sizeof(uint64_t));
    qa->mag_planes = (uint64_t *)aligned_alloc(64, A_PLANES * qa->chunks * sizeof(uint64_t));
    qa->scale = 1.0f;
}
648
+
649
+ /* ============================================================
650
+ * MODEL ALLOC
651
+ * ============================================================ */
652
/* Allocate a Model plus every scratch and cache buffer.  Weight pointers
 * stay NULL - they are wired in later via model_set_* / layer_set_* and
 * remain owned by the caller.
 * NOTE(review): allocation results are not checked for NULL - confirm the
 * loader treats OOM as fatal anyway. */
Model* model_alloc(int n_w_planes) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    m->n_w_planes = n_w_planes;
    m->n_a_planes = A_PLANES;

    /* KV cache: [N_LAYERS][MAX_SEQ][N_KV_HEADS][HEAD_DIM], zero-initialized */
    size_t kv_size = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* 64-byte alignment suits the AVX-512 loads used throughout */
    m->hidden = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->hidden2 = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->q_buf = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->k_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->v_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->gate_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->up_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->mlp_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->logits = (float *)aligned_alloc(64, VOCAB * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, HIDDEN * sizeof(float));

    qa_alloc(&m->qa_hidden, HIDDEN);
    qa_alloc(&m->qa_inter, INTER);

    printf("Model allocated: KV=%zuMB, W_PLANES=%d, A_PLANES=%d\n",
           kv_size * 2 * sizeof(float) / (1024*1024), n_w_planes, A_PLANES);

    return m;
}
682
+
683
/* Setters for the FP16 tensors.  embed and lm_head pointers are borrowed
 * (caller keeps ownership); final_norm is copied into the model's own
 * buffer allocated by model_alloc. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, HIDDEN * sizeof(float)); }
void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
    m->lm_head.weight = data; m->lm_head.out_dim = out_dim; m->lm_head.in_dim = in_dim;
}
688
+
689
/* Install the two RMSNorm gain vectors of layer `l` (pointers borrowed). */
void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
    m->layers[l].input_norm = in_norm;
    m->layers[l].post_norm = post_norm;
}
693
+
694
/* Install the optional Q/K/V attention biases of layer `l`; any of the
 * pointers may be NULL to disable that bias (pointers borrowed). */
void layer_set_bias(Model *m, int l, float *qb, float *kb, float *vb) {
    m->layers[l].q_bias = qb; m->layers[l].k_bias = kb; m->layers[l].v_bias = vb;
}
697
+
698
/* Fill one UnaryLinear with borrowed sign/plane/scale buffers.
 * The projection bias is cleared; attention biases are handled separately
 * via layer_set_bias(). */
void layer_set_unary(
    UnaryLinear *ul, uint64_t *sign, uint64_t *planes, float *scales,
    int out_dim, int in_dim, int n_planes
) {
    ul->sign_bits = sign; ul->mag_planes = planes; ul->scales = scales;
    ul->out_dim = out_dim; ul->in_dim = in_dim; ul->n_planes = n_planes;
    ul->bias = NULL;
}
706
+
707
/* Wire all seven unary projections of layer `l` in one call.  Argument
 * groups are (sign, planes, scales, out_dim, in_dim) for q/k/v/o/gate/up/
 * down in that order; all pointers are borrowed. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *qs, uint64_t *qp, float *qsc, int qo, int qi,
    uint64_t *ks, uint64_t *kp, float *ksc, int ko, int ki,
    uint64_t *vs, uint64_t *vp, float *vsc, int vo, int vi,
    uint64_t *os, uint64_t *op, float *osc, int oo, int oi,
    uint64_t *gs, uint64_t *gp, float *gsc, int go, int gi,
    uint64_t *us, uint64_t *up, float *usc, int uo, int ui,
    uint64_t *ds, uint64_t *dp, float *dsc, int doo, int di,
    int n_planes
) {
    layer_set_unary(&m->layers[l].q_proj, qs, qp, qsc, qo, qi, n_planes);
    layer_set_unary(&m->layers[l].k_proj, ks, kp, ksc, ko, ki, n_planes);
    layer_set_unary(&m->layers[l].v_proj, vs, vp, vsc, vo, vi, n_planes);
    layer_set_unary(&m->layers[l].o_proj, os, op, osc, oo, oi, n_planes);
    layer_set_unary(&m->layers[l].gate_proj, gs, gp, gsc, go, gi, n_planes);
    layer_set_unary(&m->layers[l].up_proj, us, up, usc, uo, ui, n_planes);
    layer_set_unary(&m->layers[l].down_proj, ds, dp, dsc, doo, di, n_planes);
}
726
+
727
/* Zero the full KV cache so generation can restart from position 0. */
void model_reset_cache(Model *m) {
    size_t kv_size = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    memset(m->k_cache, 0, kv_size * sizeof(float));
    memset(m->v_cache, 0, kv_size * sizeof(float));
}
732
+
733
/* Free everything model_alloc/qa_alloc allocated, then the Model itself.
 * Borrowed weight buffers (embed, lm_head, per-layer unary arrays, norms,
 * biases) are intentionally NOT freed - they belong to the loader. */
void model_free(Model *m) {
    free(m->k_cache); free(m->v_cache);
    free(m->hidden); free(m->hidden2);
    free(m->q_buf); free(m->k_buf); free(m->v_buf);
    free(m->attn_out); free(m->gate_buf); free(m->up_buf); free(m->mlp_buf);
    free(m->logits); free(m->attn_scores); free(m->final_norm);
    free(m->qa_hidden.sign_bits); free(m->qa_hidden.mag_planes);
    free(m->qa_inter.sign_bits); free(m->qa_inter.mag_planes);
    free(m);
}
unary_group_convert.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert model to UNARY with GROUP quantization.
4
+ Each group of 32 weights gets its own scale factor.
5
+ This dramatically improves accuracy vs per-row scaling.
6
+
7
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ """
9
+ import os, json, sys, time
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+ GROUP_SIZE = 32
14
+
15
def load_safetensors(model_dir):
    """Load every ``*.safetensors`` shard under *model_dir* into one dict.

    Returns a mapping of tensor name -> float32 numpy array; every tensor
    is upcast via ``.float()`` regardless of on-disk dtype.  torch and
    safetensors are imported lazily so the rest of the module works
    without them installed.
    """
    import torch
    from safetensors.torch import load_file
    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {f.name}...")
        for key, val in load_file(str(f)).items():
            tensors[key] = val.float().numpy()
    return tensors
24
+
25
+
26
def quantize_group_unary(weight, n_planes=7, group_size=None):
    """Quantize a 2-D weight matrix to sign + thermometer magnitude planes.

    Each contiguous run of ``group_size`` input-dim weights shares one scale
    (per-group absmax / n_planes), which is much more accurate than a single
    per-row scale.

    Args:
        weight: ``[out_dim, in_dim]`` float array.
        n_planes: number of magnitude planes (integer levels 0..n_planes).
        group_size: weights per scale group; defaults to the module-level
            ``GROUP_SIZE`` when None (backward compatible with the old
            hard-coded constant).

    Returns:
        Tuple ``(sign_bits, mag_planes, group_scales, sparsity)`` where
        ``sign_bits`` is ``[out_dim, chunks]`` uint64 (bit set = negative),
        ``mag_planes`` is ``[n_planes, out_dim, chunks]`` uint64 with plane
        p set where the integer magnitude >= p+1, ``group_scales`` is
        ``[out_dim, n_groups]`` float32, and ``sparsity`` is the fraction
        of zero magnitudes over the 64-padded matrix.
    """
    if group_size is None:
        group_size = GROUP_SIZE

    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    n_groups = (in_dim + group_size - 1) // group_size
    chunks = (in_dim + 63) // 64
    padded = chunks * 64

    # Pad the input dim so it divides evenly into groups.
    if in_dim % group_size != 0:
        pad_w = group_size - (in_dim % group_size)
        w = np.concatenate([w, np.zeros((out_dim, pad_w), dtype=np.float32)], axis=1)

    # [out_dim, n_groups, group_size]
    w_grouped = w[:, :n_groups * group_size].reshape(out_dim, n_groups, group_size)

    # Per-group absmax -> per-group scale; guard all-zero groups.
    group_max = np.max(np.abs(w_grouped), axis=2)
    group_max = np.where(group_max == 0, 1.0, group_max)
    group_scales = (group_max / n_planes).astype(np.float32)

    # Integer magnitude per weight, clipped to the plane count.
    # NB: np.round uses round-half-to-even (banker's rounding).
    w_scaled = w_grouped / group_scales[:, :, None]
    magnitudes = np.round(np.abs(w_scaled)).astype(np.int32)
    magnitudes = np.clip(magnitudes, 0, n_planes)
    signs = (w_grouped < 0)

    # Flatten back to [out_dim, n_groups * group_size]
    magnitudes = magnitudes.reshape(out_dim, -1)
    signs = signs.reshape(out_dim, -1)

    # Pad to a multiple of 64 so rows pack cleanly into uint64 words.
    if magnitudes.shape[1] < padded:
        extra = padded - magnitudes.shape[1]
        magnitudes = np.concatenate([magnitudes, np.zeros((out_dim, extra), dtype=np.int32)], axis=1)
        signs = np.concatenate([signs, np.zeros((out_dim, extra), dtype=bool)], axis=1)

    sparsity = np.mean(magnitudes == 0)

    # Bit packing: column 64c+b maps to bit b of word c.
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))

    signs_r = signs.reshape(out_dim, chunks, 64).astype(np.uint64)
    sign_bits = np.bitwise_or.reduce(signs_r * bit_positions, axis=2)

    # Thermometer planes: plane p set where magnitude >= p+1.
    mag_planes = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        active = (magnitudes >= (p + 1)).reshape(out_dim, chunks, 64).astype(np.uint64)
        mag_planes[p] = np.bitwise_or.reduce(active * bit_positions, axis=2)

    return sign_bits, mag_planes, group_scales, sparsity
79
+
80
+
81
+ def test_accuracy(weight, sign_bits, mag_planes, group_scales, n_planes):
82
+ """Test reconstruction accuracy of a single layer."""
83
+ out_dim, in_dim = weight.shape
84
+ n_groups = group_scales.shape[1]
85
+ chunks = (in_dim + 63) // 64
86
+
87
+ np.random.seed(42)
88
+ x = np.random.randn(in_dim).astype(np.float32)
89
+ y_orig = weight @ x
90
+
91
+ # Reconstruct weights from unary format
92
+ w_recon = np.zeros((out_dim, chunks * 64), dtype=np.float32)
93
+ for p in range(n_planes):
94
+ for i in range(out_dim):
95
+ for c in range(chunks):
96
+ mbits = mag_planes[p, i, c]
97
+ sbits = sign_bits[i, c]
98
+ for b in range(64):
99
+ if mbits & (1 << b):
100
+ col = c * 64 + b
101
+ g = col // GROUP_SIZE
102
+ if g < n_groups:
103
+ sign = -1.0 if (sbits & (1 << b)) else 1.0
104
+ w_recon[i, col] += sign * group_scales[i, g]
105
+
106
+ y_recon = w_recon[:, :in_dim] @ x
107
+ cosim = np.dot(y_orig, y_recon) / (np.linalg.norm(y_orig) * np.linalg.norm(y_recon))
108
+ return cosim
109
+
110
+
111
def convert(model_dir, output_dir, n_planes=7):
    """Convert a HF safetensors checkpoint to the group-unary on-disk format.

    Linear projection weights are quantized with `quantize_group_unary` and
    written as three raw files per tensor (.sign / .planes / .gscales);
    everything else (embeddings, norms, biases, lm_head) is dumped as raw
    fp16.  Also writes config.json and manifest.json describing shapes.

    Args:
        model_dir: directory containing *.safetensors shards.
        output_dir: destination directory (created if missing).
        n_planes: number of unary magnitude bitplanes.
    """
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    # Any tensor whose key contains one of the projection names is quantized;
    # the rest are stored as fp16.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight', 'k_proj.weight', 'v_proj.weight', 'o_proj.weight',
         'gate_proj.weight', 'up_proj.weight', 'down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]

    print(f"\nGroup-unary: {len(linear_keys)} layers, n_planes={n_planes}, group_size={GROUP_SIZE}")

    # NOTE(review): these dimensions are hard-coded for
    # DeepSeek-R1-Distill-Qwen-1.5B -- confirm against the source config.json
    # before converting any other checkpoint.
    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0, "rms_norm_eps": 1e-6,
        "n_planes": n_planes, "group_size": GROUP_SIZE,
        "quant_type": "unary_group",
    }
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    total_unary = 0
    total_orig = 0

    # Test accuracy on first layer
    test_key = linear_keys[0]

    for key in linear_keys:
        w = tensors[key]
        total_orig += w.nbytes

        t0 = time.time()
        sign_bits, mag_planes, group_scales, sparsity = quantize_group_unary(w, n_planes)
        dt = time.time() - t0

        # NOTE(review): scales are written with a .gscales extension here,
        # while unary_loader.py reads .scales -- that loader pairs with the
        # non-grouped converter; verify which loader consumes this output.
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(prefix + ".sign")
        mag_planes.tofile(prefix + ".planes")
        group_scales.tofile(prefix + ".gscales")

        nbytes = sign_bits.nbytes + mag_planes.nbytes + group_scales.nbytes
        total_unary += nbytes

        print(f" {key}: {w.shape} -> {nbytes/1024:.0f}KB ({dt:.1f}s, {sparsity:.0%} sparse)")

    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16 += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Manifest records original shapes; raw files carry no shape metadata.
    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total = total_unary + total_fp16
    print(f"\n=== Summary ===")
    print(f"Original FP32: {total_orig/1e6:.0f} MB")
    print(f"Unary+group: {total_unary/1e6:.0f} MB")
    print(f"FP16 other: {total_fp16/1e6:.0f} MB")
    print(f"Total: {total/1e6:.0f} MB")

    # Quick accuracy test (re-quantizes the first linear layer).
    print(f"\nAccuracy test on {test_key}...")
    w = tensors[test_key]
    sign_bits, mag_planes, group_scales, _ = quantize_group_unary(w, n_planes)
    cosim = test_accuracy(w, sign_bits, mag_planes, group_scales, n_planes)
    print(f" Cosine similarity: {cosim:.4f}")
186
+
187
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    src_dir = "deepseek-r1-1.5b-hf" if len(argv) <= 1 else argv[1]
    dst_dir = "deepseek-r1-1.5b-gunary" if len(argv) <= 2 else argv[2]
    planes = 7 if len(argv) <= 3 else int(argv[3])
    convert(src_dir, dst_dir, planes)
    print("Done!")
unary_kernel.c ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * UNARY (Base-1) Neural Network Kernel - AVX-512
3
+ *
4
+ * Weights quantized to signed integers [-N..+N], stored as:
5
+ * sign_bits[row][chunks] - 1 = negative, 0 = positive
6
+ * mag_planes[plane][row][chunks] - unary thermometer bitplanes
7
+ * scales[row] - per-row float32 scale
8
+ *
9
+ * For magnitude M, the first M bitplanes have bit=1 at that position.
10
+ * E.g. magnitude 3 with max_planes=7: planes 0,1,2 have bit set.
11
+ *
12
+ * TRUE UNARY: each plane contributes equally (value 1 per plane).
13
+ * NOT binary (where plane p contributes 2^p).
14
+ *
15
+ * y[i] = scale[i] * sum_planes( signed_masked_sum(x, plane, sign) )
16
+ *
17
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+ */
19
+
20
+ #include <immintrin.h>
21
+ #include <stdint.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #include <stdio.h>
26
+
27
/* Matrix-vector product y = W x for a unary-quantized W.
 *
 * sign_bits  [out_dim][chunks]          bit = 1 -> negative weight
 * mag_planes [n_planes][out_dim][chunks] thermometer bitplanes (plane p set
 *                                        iff |w| magnitude >= p+1)
 * scales     [out_dim]                  per-row float scale
 *
 * Each plane contributes value 1 per set bit (true unary, not 2^p), so the
 * per-plane masked sums are simply accumulated and scaled once per row.
 */
void unary_matvec_avx512(
    const uint64_t *sign_bits,
    const uint64_t *mag_planes,
    const float *scales,
    const float *x,
    float *y,
    int out_dim,
    int in_dim,
    int n_planes
) {
    int chunks = (in_dim + 63) / 64;
    /* Round input length up to 16 floats (one ZMM register) and copy into a
       64-byte aligned, zero-tailed buffer so _mm512_load_ps below is legal.
       in_padded is a multiple of 16 floats = 64 bytes, satisfying the
       aligned_alloc size/alignment contract. */
    int in_padded = (in_dim + 15) & ~15;
    float *x_pad = (float *)aligned_alloc(64, in_padded * sizeof(float));
    memcpy(x_pad, x, in_dim * sizeof(float));
    memset(x_pad + in_dim, 0, (in_padded - in_dim) * sizeof(float));

    for (int i = 0; i < out_dim; i++) {
        const uint64_t *row_sign = sign_bits + (size_t)i * chunks;
        float total = 0.0f;

        for (int p = 0; p < n_planes; p++) {
            const uint64_t *plane_row = mag_planes +
                ((size_t)p * out_dim + i) * chunks;

            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < chunks; c++) {
                uint64_t mbits = plane_row[c];
                uint64_t sbits = row_sign[c];
                uint64_t pos = mbits & ~sbits;   /* active positive weights */
                uint64_t neg = mbits & sbits;    /* active negative weights */

                /* Four 16-lane groups cover one 64-bit chunk; the second
                   guard condition stops at the padded end of x on the last
                   chunk. Bits set past in_dim (if any) hit zeroed x lanes,
                   so they cannot change the result. */
                for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
                    int offset = c * 64 + g * 16;
                    __m512 xv = _mm512_load_ps(x_pad + offset);
                    __mmask16 pmask = (__mmask16)((pos >> (g * 16)) & 0xFFFF);
                    __mmask16 nmask = (__mmask16)((neg >> (g * 16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
                }
            }
            total += _mm512_reduce_add_ps(acc);
        }
        y[i] = total * scales[i];
    }
    free(x_pad);
}
74
+
75
/* RMSNorm: y[i] = x[i] * rsqrt(mean(x^2) + eps) * weight[i].
 * Vectorized main loops with scalar tails for the dim % 16 leftovers. */
void rmsnorm_avx512(
    const float *x, const float *weight, float *y, int dim, float eps
) {
    /* Sum of squares: 16-lane FMA accumulation, then horizontal reduce. */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];   /* scalar tail */
    float rms = 1.0f / sqrtf(ss / dim + eps);
    /* Apply normalization and the learned per-channel weight. */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];   /* scalar tail */
}
95
+
96
+ void silu_avx512(float *x, int n) {
97
+ for (int i = 0; i < n; i++) {
98
+ float v = x[i];
99
+ x[i] = v / (1.0f + expf(-v));
100
+ }
101
+ }
102
+
103
/* Element-wise product c[i] = a[i] * b[i] (used for SwiGLU gate * up).
 * 16-lane vector loop with a scalar tail for n % 16 leftovers. */
void elemwise_mul_avx512(const float *a, const float *b, float *c, int n) {
    int i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m512 av = _mm512_loadu_ps(a + i);
        __m512 bv = _mm512_loadu_ps(b + i);
        _mm512_storeu_ps(c + i, _mm512_mul_ps(av, bv));
    }
    for (; i < n; i++) c[i] = a[i] * b[i];   /* scalar tail */
}
112
+
113
+ void softmax_avx512(float *x, int n) {
114
+ float max_val = x[0];
115
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
116
+ float sum = 0.0f;
117
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
118
+ float inv = 1.0f / sum;
119
+ for (int i = 0; i < n; i++) x[i] *= inv;
120
+ }
unary_loader.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Thin Python loader for the Unary C Engine.
4
+ Loads weights from disk, passes pointers to C, calls C generate().
5
+ ZERO Python in the inference hot path.
6
+
7
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ """
9
+
10
+ import ctypes
11
+ import numpy as np
12
+ import json
13
+ import os
14
+ import time
15
+ from pathlib import Path
16
+ from transformers import AutoTokenizer
17
+
18
+
19
class UnaryEngine:
    """ctypes wrapper around the unary C inference engine.

    Loads the converted weight files from `model_dir`, hands raw numpy
    pointers to the shared library, and exposes a single `generate()` entry
    point.  All inference runs in C; Python only does file I/O and pointer
    plumbing.  numpy arrays are kept alive in `self._refs` for the lifetime
    of the engine because C holds bare pointers into them.
    """

    def __init__(self, model_dir, so_path="unary_engine.so"):
        self.model_dir = Path(model_dir)
        self.lib = ctypes.CDLL(so_path)
        self._setup_ctypes()

        # Load config
        with open(self.model_dir / "config.json") as f:
            self.config = json.load(f)
        self.n_planes = self.config["n_planes"]

        # Load manifest
        with open(self.model_dir / "manifest.json") as f:
            self.manifest = json.load(f)

        # Allocate model in C
        self.model = self.lib.model_alloc(self.n_planes)

        # Keep references so GC doesn't free numpy arrays
        self._refs = []

        # Load all weights
        self._load_weights()

    def _setup_ctypes(self):
        """Declare arg/return types for every C symbol used below.

        Declaring restype/argtypes before the first call is essential:
        without it ctypes defaults to c_int and 64-bit pointers get
        silently truncated.
        """
        L = self.lib
        L.model_alloc.restype = ctypes.c_void_p
        L.model_alloc.argtypes = [ctypes.c_int]

        L.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
        L.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

        # layer_set_linears: model, layer_idx, then 7x (sign, planes, scales, out, in), plus n_planes
        args = [ctypes.c_void_p, ctypes.c_int]
        for _ in range(7):
            args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
        args.append(ctypes.c_int)
        L.layer_set_linears.argtypes = args

        L.model_reset_cache.argtypes = [ctypes.c_void_p]
        L.model_free.argtypes = [ctypes.c_void_p]

        L.forward_token.restype = ctypes.POINTER(ctypes.c_float)
        L.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.generate.restype = ctypes.c_int
        L.generate.argtypes = [
            ctypes.c_void_p,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_float, ctypes.c_float,
            ctypes.c_int
        ]

    def _keep(self, arr):
        """Keep reference to prevent GC; return the raw data pointer."""
        self._refs.append(arr)
        return arr.ctypes.data

    def _load_fp16(self, key):
        """Load a .fp16 dump as a flat float16 array (passed to C as raw fp16)."""
        path = self.model_dir / (key.replace(".", "_") + ".fp16")
        arr = np.fromfile(str(path), dtype=np.float16)
        return arr

    def _load_fp16_as_f32(self, key):
        """Load a .fp16 dump, widened to float32 (for norms/biases the C side reads as float)."""
        arr = self._load_fp16(key).astype(np.float32)
        self._refs.append(arr)
        return arr

    def _load_unary(self, key):
        """Load sign/planes/scales files for one quantized linear layer.

        Returns (sign, planes, scales, out_dim, in_dim); dims come from the
        manifest since the raw files carry no shape metadata.
        NOTE(review): reads a .scales file -- the group converter writes
        .gscales; this loader pairs with the non-grouped converter. Confirm.
        """
        prefix = str(self.model_dir / key.replace(".", "_"))
        sign = np.fromfile(prefix + ".sign", dtype=np.uint64)
        planes = np.fromfile(prefix + ".planes", dtype=np.uint64)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)
        self._refs.extend([sign, planes, scales])
        shape = self.manifest["unary"][key]
        return sign, planes, scales, shape[0], shape[1]

    def _load_weights(self):
        """Load every tensor from disk and register its pointer with C."""
        t0 = time.time()

        # Embeddings (kept fp16; the C side reads them as fp16)
        embed = self._load_fp16("model.embed_tokens.weight")
        self._refs.append(embed)
        self.lib.model_set_embed(self.model, embed.ctypes.data)
        print(f" Embeddings: {embed.nbytes/1024/1024:.1f} MB")

        # Final norm
        fnorm = self._load_fp16_as_f32("model.norm.weight")
        self.lib.model_set_final_norm(self.model, fnorm.ctypes.data)

        # LM head
        lm = self._load_fp16("lm_head.weight")
        self._refs.append(lm)
        shape = self.manifest["fp16"]["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, lm.ctypes.data, shape[0], shape[1])
        print(f" LM head: {lm.nbytes/1024/1024:.1f} MB")

        # Layers (28 is fixed for this checkpoint -- see config.json)
        for l in range(28):
            prefix = f"model.layers.{l}"

            # Norms
            in_norm = self._load_fp16_as_f32(f"{prefix}.input_layernorm.weight")
            post_norm = self._load_fp16_as_f32(f"{prefix}.post_attention_layernorm.weight")
            self.lib.layer_set_norms(self.model, l, in_norm.ctypes.data, post_norm.ctypes.data)

            # Biases
            q_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.q_proj.bias")
            k_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.k_proj.bias")
            v_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.v_proj.bias")
            self.lib.layer_set_bias(self.model, l,
                q_bias.ctypes.data, k_bias.ctypes.data, v_bias.ctypes.data)

            # Unary linear layers -- order must match the C signature
            projs = ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj',
                     'self_attn.o_proj', 'mlp.gate_proj', 'mlp.up_proj', 'mlp.down_proj']

            linear_args = []
            for proj in projs:
                key = f"{prefix}.{proj}.weight"
                sign, planes, scales, out_d, in_d = self._load_unary(key)
                linear_args.extend([sign.ctypes.data, planes.ctypes.data,
                                    scales.ctypes.data, out_d, in_d])

            self.lib.layer_set_linears(self.model, l, *linear_args, self.n_planes)

            if (l + 1) % 7 == 0:
                print(f" Loaded {l+1}/28 layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self._refs) / 1024 / 1024
        print(f"\nModel loaded in {dt:.1f}s, {total:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95, eos_token=151643):
        """Run C-side autoregressive generation.

        Args:
            token_ids: prompt token ids (iterable of int).
            max_new_tokens: generation cap.
            temperature / top_p: sampling parameters forwarded to C.
            eos_token: id that terminates generation.

        Returns:
            (generated_token_ids, n_generated, elapsed_seconds).
        """
        self.lib.model_reset_cache(self.model)

        prompt = np.array(token_ids, dtype=np.int32)
        output = np.zeros(max_new_tokens, dtype=np.int32)

        t0 = time.time()
        n_gen = self.lib.generate(
            self.model,
            prompt.ctypes.data, len(prompt),
            output.ctypes.data, max_new_tokens,
            ctypes.c_float(temperature), ctypes.c_float(top_p),
            eos_token
        )
        dt = time.time() - t0

        return output[:n_gen].tolist(), n_gen, dt
174
+
175
+
176
def main():
    """CLI smoke test: load the engine and decode one short chat prompt."""
    import sys
    cli = sys.argv[1:]
    model_dir = cli[0] if cli else "deepseek-r1-1.5b-unary"
    hf_dir = cli[1] if len(cli) > 1 else "deepseek-r1-1.5b-hf"

    print("Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(hf_dir, trust_remote_code=True)

    print("Loading unary engine...")
    engine = UnaryEngine(model_dir, "./unary_engine.so")

    # Single-turn chat prompt; apply_chat_template returns token ids here.
    chat = [{"role": "user", "content": "What is 2+2?"}]
    prompt_ids = tok.apply_chat_template(chat, add_generation_prompt=True)
    print(f"\nPrompt: {len(prompt_ids)} tokens")
    print("Generating...")

    tokens, n_gen, dt = engine.generate(prompt_ids, max_new_tokens=60, temperature=0.6)
    decoded = tok.decode(tokens)

    print(f"\n--- Output ({n_gen} tokens in {dt:.2f}s = {n_gen/dt:.1f} tok/s) ---")
    print(decoded)
    print("---")


if __name__ == "__main__":
    main()
unary_run.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Unary Engine Runner - Loads weights into the C engine and generates text.
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import ctypes, numpy as np, os, sys, time, struct

# Hard-coded paths and model geometry for DeepSeek-R1-Distill 1.5B.
# NOTE(review): these must agree with config.json in MODEL_DIR -- confirm
# before pointing at a different checkpoint.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"
ENGINE = "/root/ternary_engine/unary_engine.so"
N_PLANES = 7
N_LAYERS = 28
HIDDEN = 1536
VOCAB = 151936

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures (restype/argtypes must be set before first use,
# otherwise ctypes truncates 64-bit pointers to c_int).
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,  # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float,  # temperature, top_p
    ctypes.c_int  # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]

def load_fp16_as_uint16(path):
    """Load FP16 file as raw uint16 array (for passing to C as FP16)"""
    return np.fromfile(path, dtype=np.uint16)

def load_fp16_as_f32(path):
    """Load FP16 file and convert to FP32"""
    raw = np.fromfile(path, dtype=np.float16)
    return raw.astype(np.float32)

def load_unary(name):
    """Load sign, planes, scales for a unary layer"""
    base = os.path.join(MODEL_DIR, name)
    sign = np.fromfile(base + ".sign", dtype=np.uint64)
    planes = np.fromfile(base + ".planes", dtype=np.uint64)
    scales = np.fromfile(base + ".scales", dtype=np.float32)
    return sign, planes, scales

# Keep references to prevent GC -- C holds bare pointers into these arrays.
_refs = []

def keep(arr):
    """Keep numpy array alive and return its ctypes pointer"""
    _refs.append(arr)
    return arr.ctypes.data

print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f" embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers -- projection order must match layer_set_linears' signature.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim]
# NOTE(review): hard-coded DeepSeek-R1-Distill-Qwen-1.5B shapes -- confirm
# against manifest.json in MODEL_DIR.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32)
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 256
out_arr = np.zeros(max_new, dtype=np.int32)

lib.model_reset_cache(model)

print("Generating...")
t0 = time.time()
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)
unary_run16.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Unary Engine Runner - Loads weights into the C engine and generates text.
(c) 2026 OpenTransformers Ltd / Scott Bisset

NOTE(review): this file is a copy of unary_run.py with generation capped at
16 tokens (quick smoke test); keep the two in sync or factor out the shared
loader code.
"""
import ctypes, numpy as np, os, sys, time, struct

# Hard-coded paths and model geometry for DeepSeek-R1-Distill 1.5B.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"
ENGINE = "/root/ternary_engine/unary_engine.so"
N_PLANES = 7
N_LAYERS = 28
HIDDEN = 1536
VOCAB = 151936

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures (must be set before first use, otherwise
# ctypes truncates 64-bit pointers to c_int).
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,  # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float,  # temperature, top_p
    ctypes.c_int  # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]

def load_fp16_as_uint16(path):
    """Load FP16 file as raw uint16 array (for passing to C as FP16)"""
    return np.fromfile(path, dtype=np.uint16)

def load_fp16_as_f32(path):
    """Load FP16 file and convert to FP32"""
    raw = np.fromfile(path, dtype=np.float16)
    return raw.astype(np.float32)

def load_unary(name):
    """Load sign, planes, scales for a unary layer"""
    base = os.path.join(MODEL_DIR, name)
    sign = np.fromfile(base + ".sign", dtype=np.uint64)
    planes = np.fromfile(base + ".planes", dtype=np.uint64)
    scales = np.fromfile(base + ".scales", dtype=np.float32)
    return sign, planes, scales

# Keep references to prevent GC -- C holds bare pointers into these arrays.
_refs = []

def keep(arr):
    """Keep numpy array alive and return its ctypes pointer"""
    _refs.append(arr)
    return arr.ctypes.data

print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f" embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers -- projection order must match layer_set_linears' signature.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim]
# NOTE(review): hard-coded DeepSeek-R1-Distill-Qwen-1.5B shapes -- confirm
# against manifest.json in MODEL_DIR.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32)
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 16  # short cap: this variant is a 16-token smoke test
out_arr = np.zeros(max_new, dtype=np.int32)

lib.model_reset_cache(model)

print("Generating...")
t0 = time.time()
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)