Add files using upload-large-folder tool
Browse files- README.md +66 -0
- bench_fwd.py +70 -0
- bench_gen.py +71 -0
- bench_prompt.py +86 -0
- build.sh +82 -0
- concat_unary +0 -0
- concat_unary.c +608 -0
- convert.py +205 -0
- convert_fast.py +226 -0
- convert_log_unary.py +159 -0
- convert_proper_unary.py +164 -0
- convert_proper_unary_v2.py +247 -0
- convert_qwen3.py +149 -0
- convert_qwen3_v2.py +161 -0
- deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales +0 -0
- deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16 +0 -0
- deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales +0 -0
- deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg +0 -0
- deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales +3 -0
- deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos +0 -0
- inference.py +503 -0
- log_unary_engine.c +598 -0
- logunary_tensor.c +534 -0
- packed_convert.py +79 -0
- packed_engine.c +408 -0
- packed_loader.py +134 -0
- proper_unary +0 -0
- proper_unary.c +563 -0
- pure_unary_engine.c +658 -0
- run_convert.py +76 -0
- run_log_unary.py +123 -0
- run_pure_unary.py +176 -0
- run_qwen3_4b.py +221 -0
- server.py +107 -0
- ternary_kernel.c +265 -0
- test_logunary +0 -0
- test_logunary.c +153 -0
- test_popcount.py +99 -0
- true_unary +0 -0
- true_unary.c +552 -0
- unary_convert.py +189 -0
- unary_convert_v2.py +134 -0
- unary_engine.c +381 -0
- unary_engine_v2.c +629 -0
- unary_full.c +742 -0
- unary_group_convert.py +192 -0
- unary_kernel.c +120 -0
- unary_loader.py +202 -0
- unary_run.py +203 -0
- unary_run16.py +203 -0
README.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- quantization
|
| 5 |
+
- unary
|
| 6 |
+
- thermometer-encoding
|
| 7 |
+
- inference-engine
|
| 8 |
+
- low-bit
|
| 9 |
+
language:
|
| 10 |
+
- en
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# Unary Quantization Research
|
| 14 |
+
|
| 15 |
+
True unary (base-1) quantization for neural network weights. NOT binary.
|
| 16 |
+
|
| 17 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 18 |
+
|
| 19 |
+
## Overview
|
| 20 |
+
|
| 21 |
+
Unary means magnitude N = N consecutive 1-bits across N bitplanes. Each bitplane contributes value=1, not binary powers. This eliminates multiplication from inference — only addition and popcount.
|
| 22 |
+
|
| 23 |
+
7-plane unary gives 8 magnitude levels (15 distinct values with sign), achieving 0.97 cosine similarity per layer against FP32 originals.
|
| 24 |
+
|
| 25 |
+
## Contents
|
| 26 |
+
|
| 27 |
+
### Converters (Python)
|
| 28 |
+
- `unary_convert.py` / `unary_convert_v2.py` — Base unary thermometer conversion
|
| 29 |
+
- `convert_proper_unary.py` / `convert_proper_unary_v2.py` — Proper unary with group quantization
|
| 30 |
+
- `convert_log_unary.py` — Log-spaced unary variant
|
| 31 |
+
- `convert_fast.py` — Optimised conversion pipeline
|
| 32 |
+
- `packed_convert.py` / `packed_loader.py` — Packed binary format
|
| 33 |
+
- `convert_qwen3.py` / `convert_qwen3_v2.py` — Qwen3-4B specific converters
|
| 34 |
+
|
| 35 |
+
### C Inference Engines (AVX-512 + POPCNT)
|
| 36 |
+
- `unary_engine.c` / `unary_engine_v2.c` — Core unary inference
|
| 37 |
+
- `pure_unary_engine.c` — Pure unary (no FP in linear layers)
|
| 38 |
+
- `log_unary_engine.c` — Log-unary engine
|
| 39 |
+
- `proper_unary.c` — Proper unary with group scales
|
| 40 |
+
- `true_unary.c` — True base-1 unary engine
|
| 41 |
+
- `concat_unary.c` — Concatenated unary engine
|
| 42 |
+
- `packed_engine.c` — Packed bitplane engine
|
| 43 |
+
- `unary_full.c` — Full forward pass engine
|
| 44 |
+
|
| 45 |
+
### Converted Models
|
| 46 |
+
- `deepseek-r1-1.5b-*` — DeepSeek-R1-1.5B in multiple unary variants (4-plane, 7-plane, 31-plane, grouped, packed, ternary baseline)
|
| 47 |
+
- `qwen3-4b-*` — Qwen3-4B-Thinking in unary, log-unary, and proper-unary variants
|
| 48 |
+
|
| 49 |
+
### Benchmarks and Runners
|
| 50 |
+
- `bench_fwd.py` / `bench_gen.py` / `bench_prompt.py` — Performance benchmarks
|
| 51 |
+
- `inference.py` / `server.py` — Python inference and API server
|
| 52 |
+
- Various `run_*.py` — Model-specific runners
|
| 53 |
+
|
| 54 |
+
## Key Insight
|
| 55 |
+
|
| 56 |
+
Unary quantization trades bits-per-weight for computational simplicity. All multiply-accumulate operations become popcount + addition, making this particularly suited for edge/CPU inference where SIMD popcount is fast.
|
| 57 |
+
|
| 58 |
+
## Building
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
gcc -O3 -mavx512f -mavx512bw -mpopcnt -o unary_engine unary_engine.c -lm
|
| 62 |
+
```
|
| 63 |
+
|
| 64 |
+
## License
|
| 65 |
+
|
| 66 |
+
Apache 2.0
|
bench_fwd.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark a single forward_token() call of the unary C inference engine.

Loads the DeepSeek-R1 1.5B unary-quantized model through ./unary_engine.so
and times three cold-cache forward passes of one token.
"""
import ctypes
import os
import time

import numpy as np

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"  # unused here; kept for parity with sibling bench scripts
lib = ctypes.CDLL("./unary_engine.so")

# Declare the C API signatures so ctypes marshals pointers/ints correctly.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
# model, layer_idx, then (sign, planes, scales, out_dim, in_dim) for each of 7 projections, then n_planes
lib.layer_set_linears.argtypes = (
    [ctypes.c_void_p, ctypes.c_int]
    + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] * 7
    + [ctypes.c_int]
)
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

_refs = []  # pins arrays for process lifetime: the engine stores raw pointers into them


def keep(a):
    """Pin numpy array *a* so it outlives the C engine's use; return its data pointer."""
    _refs.append(a)
    return a.ctypes.data


N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj', 'self_attn_k_proj', 'self_attn_v_proj', 'self_attn_o_proj',
         'mlp_gate_proj', 'mlp_up_proj', 'mlp_down_proj']
# (out_dim, in_dim) per projection for the 1.5B config
DIMS = {'self_attn_q_proj': (1536, 1536), 'self_attn_k_proj': (256, 1536),
        'self_attn_v_proj': (256, 1536), 'self_attn_o_proj': (1536, 1536),
        'mlp_gate_proj': (8960, 1536), 'mlp_up_proj': (8960, 1536),
        'mlp_down_proj': (1536, 8960)}


def _load_f32(name):
    """Read an .fp16 tensor file from MODEL_DIR and widen to float32."""
    return np.fromfile(os.path.join(MODEL_DIR, name), dtype=np.float16).astype(np.float32)


print("Loading model...")
m = lib.model_alloc(N_PLANES)
# Embedding and LM head stay raw fp16; passed through as uint16 bit patterns.
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
lib.model_set_final_norm(m, keep(_load_f32('model_norm_weight.fp16')))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)

for l in range(N_LAYERS):
    inorm = _load_f32(f'model_layers_{l}_input_layernorm_weight.fp16')
    pnorm = _load_f32(f'model_layers_{l}_post_attention_layernorm_weight.fp16')
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = _load_f32(f'model_layers_{l}_self_attn_q_proj_bias.fp16')
    kb = _load_f32(f'model_layers_{l}_self_attn_k_proj_bias.fp16')
    vb = _load_f32(f'model_layers_{l}_self_attn_v_proj_bias.fp16')
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed: original shadowed builtin `id`
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Model loaded, benchmarking single forward pass...")
lib.model_reset_cache(m)

# Time single forward pass (token_id=1, pos=0).
# perf_counter is monotonic and high-resolution; time.time can jump.
times = []
for i in range(3):
    lib.model_reset_cache(m)
    t0 = time.perf_counter()
    lib.forward_token(m, 1, 0)
    dt = time.perf_counter() - t0
    times.append(dt)
    print(f"  forward_token run {i}: {dt:.3f}s")

avg = sum(times) / len(times)
print(f"\nAvg: {avg:.3f}s per token = {1/avg:.1f} tok/s")
print(f"OMP threads: {os.environ.get('OMP_NUM_THREADS', 'default')}")
|
bench_gen.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Smoke-test greedy generation through the unary C inference engine.

Loads the DeepSeek-R1 1.5B unary model, generates up to 8 tokens from a
single BOS token with temperature 0, and decodes them with the HF tokenizer.
"""
import ctypes
import os
import time

import numpy as np

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# Declare the C API signatures so ctypes marshals pointers/ints correctly.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = (
    [ctypes.c_void_p, ctypes.c_int]
    + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] * 7
    + [ctypes.c_int]
)
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
# generate(model, in_ids, n_in, out_ids, max_out, temperature, top_p (assumed — confirm in C source), eos_id)
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p,
                         ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

_refs = []  # pins arrays for process lifetime: the engine stores raw pointers into them


def keep(a):
    """Pin numpy array *a* so it outlives the C engine's use; return its data pointer."""
    _refs.append(a)
    return a.ctypes.data


N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj', 'self_attn_k_proj', 'self_attn_v_proj', 'self_attn_o_proj',
         'mlp_gate_proj', 'mlp_up_proj', 'mlp_down_proj']
# (out_dim, in_dim) per projection for the 1.5B config
DIMS = {'self_attn_q_proj': (1536, 1536), 'self_attn_k_proj': (256, 1536),
        'self_attn_v_proj': (256, 1536), 'self_attn_o_proj': (1536, 1536),
        'mlp_gate_proj': (8960, 1536), 'mlp_up_proj': (8960, 1536),
        'mlp_down_proj': (1536, 8960)}


def _load_f32(name):
    """Read an .fp16 tensor file from MODEL_DIR and widen to float32."""
    return np.fromfile(os.path.join(MODEL_DIR, name), dtype=np.float16).astype(np.float32)


m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
lib.model_set_final_norm(m, keep(_load_f32('model_norm_weight.fp16')))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)

for l in range(N_LAYERS):
    inorm = _load_f32(f'model_layers_{l}_input_layernorm_weight.fp16')
    pnorm = _load_f32(f'model_layers_{l}_post_attention_layernorm_weight.fp16')
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = _load_f32(f'model_layers_{l}_self_attn_q_proj_bias.fp16')
    kb = _load_f32(f'model_layers_{l}_self_attn_k_proj_bias.fp16')
    vb = _load_f32(f'model_layers_{l}_self_attn_v_proj_bias.fp16')
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed: original shadowed builtin `id`
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Loaded. Testing generate with greedy (temp=0)...")
lib.model_reset_cache(m)

inp = np.array([1], dtype=np.int32)  # just BOS token
out = np.zeros(8, dtype=np.int32)

# perf_counter is monotonic; time.time can jump during the run.
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, 1, out.ctypes.data, 8,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), 151643)
dt = time.perf_counter() - t0
print(f"Generated {ng} tokens in {dt:.1f}s = {ng/dt:.1f} tok/s")
print(f"Token IDs: {out[:ng].tolist()}")

# Deferred import: only needed for decoding, after the engine has run.
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
print(f"Text: {tok.decode(out[:ng].tolist())}")
|
bench_prompt.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Benchmark prompted generation through the unary C inference engine.

Loads the DeepSeek-R1 1.5B unary model, runs a short prompt twice
(greedy, then temperature 0.6) and reports tokens/sec and decoded text.
"""
import ctypes
import os
import time

import numpy as np

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# Declare the C API signatures so ctypes marshals pointers/ints correctly.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = (
    [ctypes.c_void_p, ctypes.c_int]
    + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int] * 7
    + [ctypes.c_int]
)
# generate(model, in_ids, n_in, out_ids, max_out, temperature, top_p (assumed — confirm in C source), eos_id)
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p,
                         ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

_refs = []  # pins arrays for process lifetime: the engine stores raw pointers into them


def keep(a):
    """Pin numpy array *a* so it outlives the C engine's use; return its data pointer."""
    _refs.append(a)
    return a.ctypes.data


N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj', 'self_attn_k_proj', 'self_attn_v_proj', 'self_attn_o_proj',
         'mlp_gate_proj', 'mlp_up_proj', 'mlp_down_proj']
# (out_dim, in_dim) per projection for the 1.5B config
DIMS = {'self_attn_q_proj': (1536, 1536), 'self_attn_k_proj': (256, 1536),
        'self_attn_v_proj': (256, 1536), 'self_attn_o_proj': (1536, 1536),
        'mlp_gate_proj': (8960, 1536), 'mlp_up_proj': (8960, 1536),
        'mlp_down_proj': (1536, 8960)}


def _load_f32(name):
    """Read an .fp16 tensor file from MODEL_DIR and widen to float32."""
    return np.fromfile(os.path.join(MODEL_DIR, name), dtype=np.float16).astype(np.float32)


m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
lib.model_set_final_norm(m, keep(_load_f32('model_norm_weight.fp16')))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)

for l in range(N_LAYERS):
    inorm = _load_f32(f'model_layers_{l}_input_layernorm_weight.fp16')
    pnorm = _load_f32(f'model_layers_{l}_post_attention_layernorm_weight.fp16')
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = _load_f32(f'model_layers_{l}_self_attn_q_proj_bias.fp16')
    kb = _load_f32(f'model_layers_{l}_self_attn_k_proj_bias.fp16')
    vb = _load_f32(f'model_layers_{l}_self_attn_v_proj_bias.fp16')
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed: original shadowed builtin `id`
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)

# Test with actual prompt
prompt = "What is 2+2? Think step by step."
ids = tok.encode(prompt)
inp = np.array(ids, dtype=np.int32)
out = np.zeros(64, dtype=np.int32)
lib.model_reset_cache(m)

print(f"Prompt: {prompt} ({len(ids)} tokens)")

# Test greedy first; perf_counter is monotonic, unlike time.time.
print("\n--- Greedy ---")
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, len(ids), out.ctypes.data, 64,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), tok.eos_token_id)
dt = time.perf_counter() - t0
text = tok.decode(out[:ng].tolist(), skip_special_tokens=False)
print(f"{ng} tokens, {dt:.1f}s, {ng/dt:.1f} tok/s")
print(f"Output: {text}")

# Test with temperature
print("\n--- Temperature=0.6 ---")
lib.model_reset_cache(m)
out2 = np.zeros(64, dtype=np.int32)
t0 = time.perf_counter()
ng2 = lib.generate(m, inp.ctypes.data, len(ids), out2.ctypes.data, 64,
                   ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
dt2 = time.perf_counter() - t0
text2 = tok.decode(out2[:ng2].tolist(), skip_special_tokens=False)
print(f"{ng2} tokens, {dt2:.1f}s, {ng2/dt2:.1f} tok/s")
print(f"Output: {text2}")
|
build.sh
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Build and deploy ternary inference engine
# (c) 2026 OpenTransformers Ltd / Scott Bisset
#
# Pipeline: compile AVX-512 kernel -> download HF weights -> convert to
# ternary -> verify on disk -> quick single-token speed test.

set -e

WORKDIR=/root/ternary_engine
MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_HF_DIR=$WORKDIR/deepseek-r1-1.5b-hf
TERNARY_DIR=$WORKDIR/deepseek-r1-1.5b-ternary

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

mkdir -p "$WORKDIR"
cd "$WORKDIR"

# Step 1: Compile C kernel with AVX-512.
# Library flags (-lm) must come AFTER the source file: with --as-needed
# linkers, listing -lm first drops the math symbols the object needs.
echo "[1/4] Compiling AVX-512 kernel..."
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
    -shared -fPIC \
    -o ternary_kernel.so ternary_kernel.c -lm
echo "  -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# Step 2: Download model from HuggingFace
echo ""
echo "[2/4] Downloading model weights..."
pip install --break-system-packages -q safetensors tokenizers 2>/dev/null
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download('$MODEL_HF', local_dir='$MODEL_HF_DIR',
                  ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
"

# Step 3: Convert to ternary (threshold fraction 0.7 — see convert.py)
echo ""
echo "[3/4] Converting to ternary format..."
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# Step 4: Verify
echo ""
echo "[4/4] Verifying..."
ls -lh "$TERNARY_DIR"/ | head -20
echo ""
du -sh "$TERNARY_DIR"/
echo ""

# Quick test: one warm-up forward pass, then five timed single-token passes.
echo "Running speed test..."
python3 -c "
from inference import TernaryQwen, load_kernel
import time
import os

kernel = load_kernel('$WORKDIR/ternary_kernel.so')
model = TernaryQwen('$TERNARY_DIR', kernel)

# Warm up
import numpy as np
cache_module = __import__('inference')
cache = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
hidden = model.forward_token(9707, cache, 0) # 'Hello'

# Benchmark single token
times = []
for i in range(5):
    cache2 = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
    t0 = time.time()
    h = model.forward_token(9707, cache2, 0)
    times.append(time.time() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f\"{t*1000:.1f}ms\" for t in times]}')
"

echo ""
echo "=== Build complete ==="
echo "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"
|
concat_unary
ADDED
|
Binary file (26.1 kB). View file
|
|
|
concat_unary.c
ADDED
|
@@ -0,0 +1,608 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* CONCATENATIVE UNARY ENGINE
|
| 3 |
+
*
|
| 4 |
+
* In base-1, the value IS the count of ones.
|
| 5 |
+
* Addition = concatenation of bitstreams.
|
| 6 |
+
* Multiplication = AND + count.
|
| 7 |
+
*
|
| 8 |
+
* REPRESENTATION:
|
| 9 |
+
* Each element of a vector has:
|
| 10 |
+
* - A sign bit (positive/negative)
|
| 11 |
+
* - A magnitude = number of 1-bits across K "slots"
|
| 12 |
+
*
|
| 13 |
+
* But crucially, when we ADD two unary vectors (residual connection),
|
| 14 |
+
* we DON'T dequantize-add-requantize. We CONCATENATE the slots.
|
| 15 |
+
*
|
| 16 |
+
* If vector A has K_a slots and vector B has K_b slots,
|
| 17 |
+
* A + B has K_a + K_b slots. The magnitude of element j is
|
| 18 |
+
* just the total count of 1-bits at position j across ALL slots.
|
| 19 |
+
*
|
| 20 |
+
* This means the residual stream GROWS through the network:
|
| 21 |
+
* After embed: K_0 slots
|
| 22 |
+
* After layer 1: K_0 + K_attn + K_mlp slots
|
| 23 |
+
* After layer L: K_0 + L*(K_attn + K_mlp) slots
|
| 24 |
+
*
|
| 25 |
+
* No information is ever destroyed by requantization.
|
| 26 |
+
*
|
| 27 |
+
* MATMUL:
|
| 28 |
+
* y = W @ x where W has K_w slots and x has K_x slots.
|
| 29 |
+
* For each output element y[i]:
|
| 30 |
+
* For each slot pair (p from W, q from x):
|
| 31 |
+
* count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
|
| 32 |
+
* - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
|
| 33 |
+
* Output gets K_out = some fixed number of slots (requantized)
|
| 34 |
+
* because matmul output magnitude is in a different scale.
|
| 35 |
+
*
|
| 36 |
+
* SAME-SIGN ADD (residual):
|
| 37 |
+
* Just append slots. Zero compute.
|
| 38 |
+
* For different signs: need cancellation.
|
| 39 |
+
* In practice residual connections are same-sign-dominant,
|
| 40 |
+
* so we track sign separately and concat magnitudes,
|
| 41 |
+
* deferring cancellation to the next norm.
|
| 42 |
+
*
|
| 43 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 44 |
+
*/
|
| 45 |
+
|
| 46 |
+
#define _POSIX_C_SOURCE 199309L
|
| 47 |
+
#include <immintrin.h>
|
| 48 |
+
#include <omp.h>
|
| 49 |
+
#include <stdint.h>
|
| 50 |
+
#include <stdlib.h>
|
| 51 |
+
#include <string.h>
|
| 52 |
+
#include <math.h>
|
| 53 |
+
#include <stdio.h>
|
| 54 |
+
#include <time.h>
|
| 55 |
+
|
| 56 |
+
/* ============================================================
|
| 57 |
+
* GROWABLE UNARY VECTOR
|
| 58 |
+
*
|
| 59 |
+
* The key data structure. Slots can be appended (concat = add).
|
| 60 |
+
* Each slot is a bitplane of dim bits packed into uint64 chunks.
|
| 61 |
+
*
|
| 62 |
+
* sign: uint64[chunks] — per-element sign
|
| 63 |
+
* slots: uint64[n_slots * chunks] — each slot is chunks uint64s
|
| 64 |
+
* n_slots: current number of slots (grows via concat)
|
| 65 |
+
* max_slots: allocated capacity
|
| 66 |
+
*
|
| 67 |
+
* For element j:
|
| 68 |
+
* magnitude = number of slots where bit j is set
|
| 69 |
+
* value = sign * magnitude * scale
|
| 70 |
+
*
|
| 71 |
+
* ============================================================ */
|
| 72 |
+
/* Growable unary vector.
 * Element j's magnitude is the number of slots whose bit j is set;
 * its value is sign(j) * magnitude * scale (see header comment above). */
typedef struct {
    uint64_t *sign;      /* [chunks] packed per-element sign bits */
    uint64_t *slots;     /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
    float scale;         /* per-vector scale factor */
    int dim;             /* logical number of elements */
    int chunks;          /* (dim+63)/64 */
    int n_slots;         /* current slot count */
    int max_slots;       /* allocated capacity */
} GrowVec;
|
| 81 |
+
|
| 82 |
+
/* Fixed-size unary matrix (weights don't grow) */
|
| 83 |
+
/* Fixed-size unary weight matrix (weights never grow, unlike GrowVec). */
typedef struct {
    uint64_t *sign;      /* [rows * chunks] packed per-weight sign bits */
    uint64_t *slots;     /* [K * rows * chunks] K bitplanes of packed magnitudes */
    float *scales;       /* [rows] per-row scale */
    int rows, cols, chunks, K;   /* chunks = (cols+63)/64; K = slots (bitplanes) per weight */
} FixedMat;
|
| 89 |
+
|
| 90 |
+
/* ============================================================
|
| 91 |
+
* ALLOCATION
|
| 92 |
+
* ============================================================ */
|
| 93 |
+
GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
|
| 94 |
+
GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec));
|
| 95 |
+
v->dim = dim;
|
| 96 |
+
v->chunks = (dim + 63) / 64;
|
| 97 |
+
v->n_slots = 0;
|
| 98 |
+
v->max_slots = max_slots;
|
| 99 |
+
v->scale = 1.0f;
|
| 100 |
+
v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
|
| 101 |
+
v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t));
|
| 102 |
+
memset(v->sign, 0, v->chunks * sizeof(uint64_t));
|
| 103 |
+
memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
|
| 104 |
+
return v;
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
/* Release a GrowVec and both of its bit buffers. NULL is a no-op. */
void gv_free(GrowVec *v) {
    if (!v)
        return;
    free(v->sign);
    free(v->slots);
    free(v);
}
|
| 110 |
+
|
| 111 |
+
FixedMat* fm_alloc(int rows, int cols, int K) {
|
| 112 |
+
FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat));
|
| 113 |
+
m->rows = rows; m->cols = cols; m->K = K;
|
| 114 |
+
m->chunks = (cols + 63) / 64;
|
| 115 |
+
m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 116 |
+
m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 117 |
+
m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
|
| 118 |
+
memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 119 |
+
memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 120 |
+
return m;
|
| 121 |
+
}
|
| 122 |
+
|
| 123 |
+
/* Release a FixedMat and its three buffers. NULL is a no-op. */
void fm_free(FixedMat *m) {
    if (!m)
        return;
    free(m->sign);
    free(m->slots);
    free(m->scales);
    free(m);
}
|
| 126 |
+
|
| 127 |
+
/* ============================================================
|
| 128 |
+
* FLOAT → UNARY CONVERSION (only at boundaries)
|
| 129 |
+
* ============================================================ */
|
| 130 |
+
/* Quantize a float vector into K unary bitplanes (thermometer code).
 * Element j's magnitude is rounded to an integer count in [0, K]; that
 * many planes carry bit j. Sign lives in a separate bitmask, and
 * scale = amax / K so that count * scale recovers the magnitude.
 * An all-zero input leaves every plane empty with scale = 1. */
void gv_from_float(GrowVec *v, const float *x, int K) {
    const int n = v->dim;
    const int chunks = v->chunks;

    v->n_slots = K;
    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    /* The largest magnitude fixes the quantization step. */
    float peak = 0.0f;
    for (int j = 0; j < n; j++) {
        const float a = fabsf(x[j]);
        if (a > peak) peak = a;
    }
    if (peak == 0.0f) { v->scale = 1.0f; return; }

    v->scale = peak / K;
    const float step_inv = K / peak;

    for (int j = 0; j < n; j++) {
        const int c = j >> 6;
        const uint64_t bit = 1ULL << (j & 63);

        if (x[j] < 0.0f) v->sign[c] |= bit;

        /* Thermometer encode: set bit j in the first `count` planes. */
        int count = (int)(fabsf(x[j]) * step_inv + 0.5f);
        if (count > K) count = K;
        uint64_t *plane = v->slots + c;
        for (int s = 0; s < count; s++, plane += chunks)
            *plane |= bit;
    }
}
|
| 158 |
+
|
| 159 |
+
void gv_to_float(const GrowVec *v, float *out) {
|
| 160 |
+
int dim = v->dim, chunks = v->chunks;
|
| 161 |
+
|
| 162 |
+
for (int i = 0; i < dim; i++) {
|
| 163 |
+
int c = i / 64;
|
| 164 |
+
uint64_t bit = 1ULL << (i % 64);
|
| 165 |
+
|
| 166 |
+
int mag = 0;
|
| 167 |
+
for (int s = 0; s < v->n_slots; s++) {
|
| 168 |
+
if (v->slots[(size_t)s * chunks + c] & bit)
|
| 169 |
+
mag++;
|
| 170 |
+
}
|
| 171 |
+
|
| 172 |
+
float val = (float)mag * v->scale;
|
| 173 |
+
out[i] = (v->sign[c] & bit) ? -val : val;
|
| 174 |
+
}
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
/* Quantize a dense [rows x cols] float matrix into K unary bitplanes
 * per row (thermometer code, as in gv_from_float).
 *
 * Slot layout is slot-major: plane s of row r lives at
 * slots[(s*rows + r)*chunks .. +chunks-1], so gv_matmul can walk all
 * planes of one row with a fixed stride.
 * Per-row scale = amax/K; an all-zero row gets scale 1 and empty planes. */
void fm_from_float(FixedMat *m, const float *data) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        /* Row max fixes this row's quantization step. */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
        m->scales[r] = amax / K;
        float inv = K / amax;

        uint64_t *rs = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;  /* sign bit: 1 = negative */

            /* Thermometer encode: bit j set in the first `mag` planes. */
            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
|
| 207 |
+
|
| 208 |
+
/* ============================================================
|
| 209 |
+
* CONCATENATION = ADDITION
|
| 210 |
+
*
|
| 211 |
+
* gv_concat(dst, src):
|
| 212 |
+
* Appends src's slots to dst.
|
| 213 |
+
* Same-sign: just append.
|
| 214 |
+
* Different-sign: cancel bits (remove from both).
|
| 215 |
+
*
|
| 216 |
+
* For efficiency with residual connections where scales differ:
|
| 217 |
+
* We track a "slot_scales" or use a single scale with normalization.
|
| 218 |
+
*
|
| 219 |
+
* SIMPLE VERSION: assumes same scale (works after norm).
|
| 220 |
+
* ============================================================ */
|
| 221 |
+
|
| 222 |
+
/* Simple concat: append src slots to dst. Handles sign cancellation. */
|
| 223 |
+
/* Simple concat: append src slots to dst. Handles sign cancellation.
 *
 * dst += src, expressed as bitplane concatenation. Assumes both vectors
 * share the same scale (see notes at the call sites). For each source
 * plane:
 *   - elements whose signs agree are simply appended to a fresh dst plane;
 *   - elements whose signs differ cancel one set bit out of an existing
 *     dst plane instead;
 *   - if src's magnitude exceeds dst's (cancellation runs out of dst
 *     bits), dst's sign flips for that element and the leftover bit is
 *     appended.
 * The per-plane/per-chunk order below is load-bearing: the sign flip in
 * one source plane changes the agree/disagree split for the next one. */
void gv_concat(GrowVec *dst, const GrowVec *src) {
    int chunks = dst->chunks;

    /* For each source slot, process element-wise:
     * Where signs agree: copy bit to new dst slot
     * Where signs differ: cancel - find a dst slot with that bit set and clear it
     *
     * Optimization: for most transformer residuals, signs mostly agree.
     * So we do the simple thing: compute per-element sign agreement,
     * then for agreeing elements just append, for disagreeing elements cancel.
     */

    /* Sign agreement mask */
    /* agree[c] = ~(dst_sign[c] ^ src_sign[c]) — bits where signs match */

    for (int s = 0; s < src->n_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

        /* Split into agree and disagree portions */
        int new_slot = dst->n_slots;
        if (new_slot >= dst->max_slots) {
            /* Out of room — would need realloc in production */
            printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
            return;
        }
        uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t src_bits = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: just append to new slot */
            uint64_t to_add = src_bits & agree;

            /* Different sign: cancel from existing dst slots */
            uint64_t to_cancel = src_bits & disagree;

            /* Cancel by walking backwards through dst slots */
            for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
                uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *dslot & to_cancel;
                *dslot &= ~overlap; /* clear cancelled bits in dst */
                to_cancel &= ~overlap; /* mark as cancelled */
            }

            /* Any remaining to_cancel means src > dst for those elements
             * — flip the sign and add to new slot */
            if (to_cancel) {
                dst->sign[c] ^= to_cancel; /* flip sign for these elements */
                to_add |= to_cancel;
            }

            dst_new[c] = to_add;
        }

        /* Only increment if new slot is non-empty (cancellation may have
         * consumed the whole source plane) */
        int non_empty = 0;
        for (int c = 0; c < chunks && !non_empty; c++)
            if (dst_new[c]) non_empty = 1;
        if (non_empty)
            dst->n_slots++;
    }
}
|
| 287 |
+
|
| 288 |
+
/* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */
|
| 289 |
+
/* Fast concat for the SAME SCALE, SAME SIGN pattern (most common in
 * residuals). Appending bitplanes is addition: per-element counts simply
 * accumulate. Opposite-sign cancellation is NOT performed here — it is
 * deferred to the next norm. On capacity overflow the copy is clamped
 * and a warning printed. */
void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
    const int chunks = dst->chunks;
    int take = src->n_slots;

    /* Clamp to the remaining capacity rather than writing past the end. */
    if (dst->n_slots + take > dst->max_slots) {
        printf("WARNING: GrowVec overflow\n");
        take = dst->max_slots - dst->n_slots;
    }

    uint64_t *tail = dst->slots + (size_t)dst->n_slots * chunks;
    memcpy(tail, src->slots, (size_t)take * chunks * sizeof(uint64_t));
    dst->n_slots += take;
}
|
| 305 |
+
|
| 306 |
+
/* ============================================================
|
| 307 |
+
* MATMUL: y = M @ x
|
| 308 |
+
*
|
| 309 |
+
* M is fixed (K_w slots), x is growable (n_slots slots).
|
| 310 |
+
* Output is a NEW GrowVec with K_out slots.
|
| 311 |
+
*
|
| 312 |
+
* Core: for each output element i, accumulate:
|
| 313 |
+
* acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
|
| 314 |
+
* - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
|
| 315 |
+
*
|
| 316 |
+
* Then quantize acc to K_out unary slots.
|
| 317 |
+
* ============================================================ */
|
| 318 |
+
void gv_matmul(
|
| 319 |
+
const FixedMat *M,
|
| 320 |
+
const GrowVec *x,
|
| 321 |
+
GrowVec *y, /* output — gets filled with K_out slots */
|
| 322 |
+
int K_out /* how many output slots */
|
| 323 |
+
) {
|
| 324 |
+
int out_dim = M->rows;
|
| 325 |
+
int chunks = M->chunks;
|
| 326 |
+
int wK = M->K;
|
| 327 |
+
int xK = x->n_slots;
|
| 328 |
+
|
| 329 |
+
float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));
|
| 330 |
+
|
| 331 |
+
#pragma omp parallel for schedule(dynamic, 32)
|
| 332 |
+
for (int i = 0; i < out_dim; i++) {
|
| 333 |
+
const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
|
| 334 |
+
long long acc = 0;
|
| 335 |
+
|
| 336 |
+
for (int c = 0; c < chunks; c++) {
|
| 337 |
+
uint64_t ws = w_sign_row[c];
|
| 338 |
+
uint64_t xs = x->sign[c];
|
| 339 |
+
uint64_t same = ~(ws ^ xs);
|
| 340 |
+
uint64_t diff = ws ^ xs;
|
| 341 |
+
|
| 342 |
+
for (int p = 0; p < wK; p++) {
|
| 343 |
+
uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
|
| 344 |
+
|
| 345 |
+
for (int q = 0; q < xK; q++) {
|
| 346 |
+
uint64_t xq = x->slots[(size_t)q * chunks + c];
|
| 347 |
+
uint64_t active = wp & xq;
|
| 348 |
+
acc += __builtin_popcountll(active & same)
|
| 349 |
+
- __builtin_popcountll(active & diff);
|
| 350 |
+
}
|
| 351 |
+
}
|
| 352 |
+
}
|
| 353 |
+
|
| 354 |
+
y_float[i] = (float)acc * M->scales[i] * x->scale;
|
| 355 |
+
}
|
| 356 |
+
|
| 357 |
+
/* Quantize to K_out slots */
|
| 358 |
+
gv_from_float(y, y_float, K_out);
|
| 359 |
+
free(y_float);
|
| 360 |
+
}
|
| 361 |
+
|
| 362 |
+
/* ============================================================
|
| 363 |
+
* NORM: GrowVec → GrowVec with controlled slot count
|
| 364 |
+
*
|
| 365 |
+
* RMSNorm dequantizes (counting), normalizes (float),
|
| 366 |
+
* then requantizes to a fixed K.
|
| 367 |
+
* This is where slot count gets reset.
|
| 368 |
+
* ============================================================ */
|
| 369 |
+
/* RMSNorm in float space: decode x, normalize by RMS, scale by the
 * per-element weight, then requantize into `out` with exactly K_out
 * slots. This is where a grown residual's slot count gets reset.
 *
 * FIX: C11 aligned_alloc requires the size to be a multiple of the
 * alignment; dim * sizeof(float) is not guaranteed to be — round up. */
void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
    int dim = x->dim;
    size_t bytes = ((size_t)dim * sizeof(float) + 63) & ~(size_t)63;
    float *xf = (float *)aligned_alloc(64, bytes);
    gv_to_float(x, xf);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);
    for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];

    gv_from_float(out, xf, K_out);
    free(xf);
}
|
| 382 |
+
|
| 383 |
+
/* ============================================================
|
| 384 |
+
* SILU_MUL: out = SiLU(gate) * up
|
| 385 |
+
* Dequant, compute, requant. O(dim).
|
| 386 |
+
* ============================================================ */
|
| 387 |
+
/* out = SiLU(gate) * up, elementwise. Dequant, compute in float,
 * requant to K_out slots. O(dim).
 *
 * FIX: C11 aligned_alloc requires the size to be a multiple of the
 * alignment; dim * sizeof(float) is not guaranteed to be — round up. */
void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
    int dim = gate->dim;
    size_t bytes = ((size_t)dim * sizeof(float) + 63) & ~(size_t)63;
    float *gf = (float *)aligned_alloc(64, bytes);
    float *uf = (float *)aligned_alloc(64, bytes);
    gv_to_float(gate, gf);
    gv_to_float(up, uf);

    /* SiLU(x) = x * sigmoid(x) = x / (1 + e^-x) */
    for (int i = 0; i < dim; i++)
        gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];

    gv_from_float(out, gf, K_out);
    free(gf); free(uf);
}
|
| 400 |
+
|
| 401 |
+
/* ============================================================
|
| 402 |
+
* TEST: demonstrate growing residual stream
|
| 403 |
+
* ============================================================ */
|
| 404 |
+
/* Demonstrates that bitplane concatenation behaves like vector addition:
 * quantize two float vectors, concat their planes, decode, and compare
 * against the float sum. */
void test_concat_add() {
    printf("=== CONCATENATION = ADDITION TEST ===\n\n");

    int dim = 16;

    /* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */
    float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
    float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};

    GrowVec *a = gv_alloc(dim, 8, 64);
    GrowVec *b = gv_alloc(dim, 8, 64);
    gv_from_float(a, a_vals, 8);
    gv_from_float(b, b_vals, 8);

    /* Round-trip check: quantize then decode each input. */
    printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
    float af[16], bf[16];
    gv_to_float(a, af);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n");

    printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
    gv_to_float(b, bf);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n");

    /* Concatenate (= add) */
    printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);

    /* Need same scale for concat to work correctly */
    /* In a real network, both come from norm so they have comparable scale */
    /* This test uses the full concat (with sign cancellation), not the fast path */
    gv_concat(a, b);
    printf(" -> %d):\n", a->n_slots);

    float result[16], ref[16];
    gv_to_float(a, result);
    for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i];

    /* NOTE: concat addition only works correctly when scales match.
     * When scales differ, we'd need to adjust. In a transformer,
     * the norm before each sublayer ensures comparable scales. */

    printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n");
    printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n");

    gv_free(a); gv_free(b);
}
|
| 451 |
+
|
| 452 |
+
/* Simulates a transformer residual stream: the embedding is quantized
 * once, then each of 6 layers appends attention and MLP bitplanes via
 * gv_concat_fast, so the slot count (and memory) grows monotonically.
 * Random values use the Box–Muller transform for a Gaussian shape. */
void test_growing_residual() {
    printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
    printf("Simulating 6 transformer layers with concat residuals\n\n");

    int dim = 2560;
    int K_embed = 16; /* initial embedding quantization */
    int K_sublayer = 8; /* each sublayer output */
    int n_layers = 6;

    /* Create random embedding (Box–Muller: standard normal) */
    float *embed = (float *)malloc(dim * sizeof(float));
    srand(42);
    for (int i = 0; i < dim; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */
    int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;
    GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
    gv_from_float(residual, embed, K_embed);

    printf("After embedding: %d slots (%.1f KB)\n",
           residual->n_slots,
           (float)residual->n_slots * residual->chunks * 8 / 1024);

    for (int l = 0; l < n_layers; l++) {
        /* Simulate attention output (Gaussian, scaled down 10x) */
        GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer);
        float *fake_attn = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
        }
        gv_from_float(attn_out, fake_attn, K_sublayer);
        /* Scale must match for concat to work — in real net, norm handles this */
        attn_out->scale = residual->scale;

        /* RESIDUAL ADD = CONCATENATION */
        gv_concat_fast(residual, attn_out);

        /* Simulate MLP output */
        GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer);
        float *fake_mlp = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
        }
        gv_from_float(mlp_out, fake_mlp, K_sublayer);
        mlp_out->scale = residual->scale;

        /* RESIDUAL ADD = CONCATENATION */
        gv_concat_fast(residual, mlp_out);

        printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
               l + 1, residual->n_slots,
               (float)residual->n_slots * residual->chunks * 8 / 1024,
               K_sublayer, K_sublayer);

        gv_free(attn_out); gv_free(mlp_out);
        free(fake_attn); free(fake_mlp);
    }

    printf("\nResidual grew from %d to %d slots through %d layers\n",
           K_embed, residual->n_slots, n_layers);
    printf("Information accumulated, never lost to requantization\n");

    gv_free(residual);
    free(embed);
}
|
| 525 |
+
|
| 526 |
+
/* Measures unary matmul fidelity against a float reference as the input
 * slot count grows (simulating a growing residual). Reports cosine
 * similarity, SNR in dB, and wall-clock time per slot count. */
void test_matmul_accuracy() {
    printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");

    int rows = 512, cols = 2560;
    int wK = 32;

    printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
    printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");

    srand(42);
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));

    /* Gaussian weights and input via Box–Muller */
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    /* Float reference: y_ref = Mf @ xf */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    FixedMat *M = fm_alloc(rows, cols, wK);
    fm_from_float(M, Mf);

    /* Test with different x slot counts (simulating growing residual) */
    int x_slots[] = {8, 16, 32, 48, 64, 96};
    for (int t = 0; t < 6; t++) {
        int xK = x_slots[t];
        GrowVec *x = gv_alloc(cols, xK, xK);
        GrowVec *y = gv_alloc(rows, xK, xK);
        gv_from_float(x, xf, xK);

        struct timespec t0, t1;
        float *yf = (float *)malloc(rows * sizeof(float));

        clock_gettime(CLOCK_MONOTONIC, &t0);
        gv_matmul(M, x, y, xK);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

        gv_to_float(y, yf);

        /* Cosine similarity and SNR vs the float reference */
        float dot = 0, na = 0, nb = 0, noise = 0;
        for (int i = 0; i < rows; i++) {
            dot += y_ref[i] * yf[i];
            na += y_ref[i] * y_ref[i];
            nb += yf[i] * yf[i];
            float e = y_ref[i] - yf[i];
            noise += e * e;
        }
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
        float snr = 10.0f * log10f(na / (noise + 1e-10f));

        printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);

        gv_free(x); gv_free(y); free(yf);
    }

    fm_free(M);
    free(Mf); free(xf); free(y_ref);
}
|
| 594 |
+
|
| 595 |
+
/* Entry point: run the three demo/benchmark suites in order. */
int main() {
    static const char *rule = "========================================\n";

    printf("%s", rule);
    printf(" CONCATENATIVE UNARY ENGINE TESTS\n");
    printf(" Addition = Concatenation\n");
    printf(" Value = Count of Ones\n");
    printf("%s", rule);

    test_concat_add();
    test_growing_residual();
    test_matmul_accuracy();

    printf("\n=== ALL TESTS DONE ===\n");
    return 0;
}
|
convert.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert DeepSeek-R1-Distill-Qwen-1.5B to ternary format.
|
| 4 |
+
|
| 5 |
+
Stores linear weights as bitplanes (pos_mask, neg_mask) + per-row scale.
|
| 6 |
+
Embeddings and layernorms stay FP16. LM head stays FP16.
|
| 7 |
+
|
| 8 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
import struct
|
| 14 |
+
import numpy as np
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
import time
|
| 17 |
+
|
| 18 |
+
def load_safetensors(model_dir):
    """Load every tensor from all *.safetensors files under model_dir.

    Returns a dict mapping tensor name -> float32 numpy array.
    """
    import torch  # noqa: F401 — safetensors.torch requires torch at import time
    from safetensors.torch import load_file

    out = {}
    for path in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {path.name}...")
        for name, tensor in load_file(str(path)).items():
            out[name] = tensor.float().numpy()
    return out
|
| 29 |
+
|
| 30 |
+
def quantize_row_ternary(row, alpha=0.7):
    """Quantize a single row to ternary {-1, 0, +1} with vectorized bitpacking.

    Threshold = alpha * mean(|row|): values >= threshold map to +1,
    values <= -threshold map to -1, everything else to 0. The scale is
    the mean magnitude of the surviving (non-zero) entries, or 1.0 if
    the whole row was zeroed out.

    Fix: removed a duplicated, unreachable ``return`` statement that
    followed the real one.

    Args:
        row: 1-D array of weights.
        alpha: threshold multiplier on the row's mean absolute value.

    Returns:
        (pos_bits, neg_bits, scale) — two uint64 arrays of packed
        bitmasks (64 elements per word, zero-padded) and a float32 scale.
    """
    row = row.astype(np.float32)
    mean_abs = np.mean(np.abs(row))
    threshold = alpha * mean_abs

    pos = row >= threshold
    neg = row <= -threshold

    nz_mask = pos | neg
    scale = np.mean(np.abs(row[nz_mask])) if nz_mask.any() else np.float32(1.0)

    # Pad to a multiple of 64 so the masks pack evenly into uint64 words.
    in_dim = len(row)
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros(pad, dtype=bool)])
        neg = np.concatenate([neg, np.zeros(pad, dtype=bool)])

    # Vectorized bitpack: reshape to [chunks, 64], multiply by bit positions, OR-reduce.
    pos_r = pos.reshape(-1, 64).astype(np.uint64)
    neg_r = neg.reshape(-1, 64).astype(np.uint64)
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))
    pos_bits = np.bitwise_or.reduce(pos_r * bit_positions, axis=1)
    neg_bits = np.bitwise_or.reduce(neg_r * bit_positions, axis=1)

    return pos_bits, neg_bits, np.float32(scale)
|
| 59 |
+
|
| 60 |
+
def quantize_weight_matrix(weight, alpha=0.7):
    """Quantize an entire weight matrix [out_dim, in_dim] to ternary. Fully vectorized.

    Per-row threshold = alpha * mean(|row|); entries beyond it become
    +1/-1, the rest 0. Per-row scale = mean |w| over the surviving
    entries (or the row mean when nothing survives on an all-zero row).

    Fix: the per-row scale computation was a Python-level loop over
    out_dim despite the "fully vectorized" contract; it is now a masked
    sum / count, with identical results.

    Args:
        weight: 2-D array [out_dim, in_dim].
        alpha: threshold multiplier.

    Returns:
        (all_pos, all_neg, scales, sparsity) — packed uint64 bitmasks
        [out_dim, chunks] for +1 and -1 positions, float32 per-row
        scales, and the fraction of zeroed entries.
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    abs_w = np.abs(w)

    # Per-row thresholds
    row_means = np.mean(abs_w, axis=1, keepdims=True)
    thresholds = alpha * row_means

    pos = w >= thresholds  # [out_dim, in_dim]
    neg = w <= -thresholds

    # Per-row scales: mean |w| over non-zero entries, vectorized.
    nz = pos | neg
    counts = nz.sum(axis=1)
    sums = np.where(nz, abs_w, 0.0).sum(axis=1)
    scales = np.ones(out_dim, dtype=np.float32)
    has_nz = counts > 0
    scales[has_nz] = (sums[has_nz] / counts[has_nz]).astype(np.float32)

    # Sparsity
    total = out_dim * in_dim
    sparsity = 1.0 - np.sum(nz) / total

    # Pad to multiple of 64
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros((out_dim, pad), dtype=bool)], axis=1)
        neg = np.concatenate([neg, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    padded_dim = pos.shape[1]
    chunks = padded_dim // 64

    # Vectorized bitpacking for the entire matrix at once
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))  # [64]

    pos_r = pos.reshape(out_dim, chunks, 64).astype(np.uint64)  # [out, chunks, 64]
    neg_r = neg.reshape(out_dim, chunks, 64).astype(np.uint64)

    all_pos = np.bitwise_or.reduce(pos_r * bit_positions, axis=2)  # [out, chunks]
    all_neg = np.bitwise_or.reduce(neg_r * bit_positions, axis=2)

    return all_pos, all_neg, scales, sparsity
|
| 105 |
+
|
| 106 |
+
def save_ternary_model(tensors, output_dir, alpha=0.7):
    """Convert and save full model to ternary format.

    Linear projection weights (q/k/v/o/gate/up/down) are ternarized and
    written as .pos/.neg bitmask files plus per-row .scales; everything
    else (embeddings, norms, biases, lm head) is written as raw .fp16.
    Also writes config.json (hard-coded DeepSeek-R1-Distill-Qwen-1.5B
    geometry) and manifest.json (tensor name -> shape).

    Args:
        tensors: dict of tensor name -> numpy array (from load_safetensors).
        output_dir: directory to create/populate.
        alpha: ternarization threshold multiplier (see quantize_weight_matrix).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Model geometry for the inference engine; hard-coded for this model.
    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "alpha": alpha,
    }

    # Identify which tensors to ternarize vs keep as-is
    ternary_keys = []  # Linear weights to ternarize
    keep_keys = []  # Embeddings, norms, biases to keep as FP16

    for key in tensors:
        if any(p in key for p in ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                                  'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                                  'down_proj.weight']):
            ternary_keys.append(key)
        else:
            keep_keys.append(key)

    print(f"\nTernary layers: {len(ternary_keys)}")
    print(f"FP16 layers: {len(keep_keys)}")

    # Save config
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    # Save ternary weights (tracking byte totals for the summary)
    total_ternary_bytes = 0
    total_original_bytes = 0

    for key in ternary_keys:
        w = tensors[key].astype(np.float32)
        out_dim, in_dim = w.shape
        total_original_bytes += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        # Save as binary: dots in tensor names become underscores in filenames
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        ternary_bytes = pos.nbytes + neg.nbytes + scales.nbytes
        total_ternary_bytes += ternary_bytes
        ratio = w.nbytes / ternary_bytes

        print(f" {key}: {w.shape} -> ternary ({ternary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compression, {sparsity:.1%} sparse, {dt:.1f}s)")

    # Save FP16 weights
    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Save tensor manifest (name -> shape, needed to reload the raw files)
    manifest = {
        "ternary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_ternary_bytes + total_fp16_bytes
    orig_bytes = total_original_bytes + total_fp16_bytes
    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Ternary linear weights: {total_ternary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Compression vs FP32: {orig_bytes/total_bytes:.1f}x")
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [alpha]
    import sys

    args = sys.argv[1:]
    model_dir = args[0] if len(args) > 0 else "deepseek-r1-1.5b-hf"
    output_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-ternary"
    alpha = float(args[2]) if len(args) > 2 else 0.7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to ternary (alpha={alpha})...")
    save_ternary_model(tensors, output_dir, alpha)
    print("Done!")
|
convert_fast.py
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
FAST proper unary converter — vectorized bitpacking via numpy.
|
| 4 |
+
|
| 5 |
+
Instead of iterating columns one at a time, processes plane-by-plane
|
| 6 |
+
with vectorized comparisons, then packs to uint64 using np.packbits.
|
| 7 |
+
|
| 8 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import torch, json, os, sys, gc, shutil
|
| 12 |
+
from safetensors import safe_open
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def pack_bits_to_uint64(bool_matrix):
    """
    Pack [rows, cols] boolean → [rows, chunks] uint64
    where chunks = ceil(cols/64).

    Bit j of chunk c in row r corresponds to column c*64 + j
    (little-endian bit ordering within each uint64 word).
    """
    rows, cols = bool_matrix.shape
    chunks = (cols + 63) // 64

    # Pad columns up to a multiple of 64 so each chunk is exactly 64 bits.
    if cols % 64:
        padded = np.zeros((rows, chunks * 64), dtype=np.uint8)
        padded[:, :cols] = bool_matrix.astype(np.uint8)
    else:
        padded = np.ascontiguousarray(bool_matrix, dtype=np.uint8)

    # np.packbits(bitorder='little') packs 8 columns per byte in one C pass;
    # viewing the [rows, chunks*8] byte matrix as uint64 yields the same
    # words the previous 64-iteration shift loop produced, without 64 full
    # passes over the matrix.
    # NOTE(review): the view assumes a little-endian host (x86/ARM), same as
    # the downstream .tofile consumers — confirm if big-endian matters.
    packed_bytes = np.packbits(padded, axis=1, bitorder="little")
    return packed_bytes.view(np.uint64)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def encode_fast(weight_f32_np, quantum, K):
    """
    Vectorized proper-unary (thermometer) encoder.

    weight_f32_np: [rows, cols] numpy float32 matrix.
    Returns (sign, slots, clip_count): sign is [rows, chunks] uint64,
    slots is [K, rows, chunks] uint64, clip_count is the number of
    weights whose magnitude saturated above K.
    """
    n_rows, n_cols = weight_f32_np.shape
    n_chunks = (n_cols + 63) // 64

    # Integer magnitude per weight, saturating at K.
    levels = np.round(np.abs(weight_f32_np) * (1.0 / quantum)).astype(np.int32)
    clip_count = int((levels > K).sum())
    levels = np.minimum(np.maximum(levels, 0), K)

    # Sign plane: one bit per negative weight.
    sign_packed = pack_bits_to_uint64(weight_f32_np < 0)

    # Thermometer planes: plane p is set wherever the magnitude exceeds p.
    slots_packed = np.zeros((K, n_rows, n_chunks), dtype=np.uint64)
    for plane in range(K):
        slots_packed[plane] = pack_bits_to_uint64(levels > plane)
        if (plane + 1) % 8 == 0 or plane == K - 1:
            print(f" plane {plane+1}/{K}", end="\r", flush=True)

    print(f" {K}/{K} planes done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert a safetensors checkpoint to the proper-unary on-disk format.

    model_dir:  HF-style checkpoint directory (config.json + *.safetensors).
    output_dir: destination; one .usign/.uslots pair per linear weight,
                .fp16 files for everything else, plus manifest.json.
    K:          number of unary slot planes (magnitudes saturate at K).
    clip_pct:   percentile of sampled |w| used as the clip point; the
                single global quantum is clip_val / K.
    Side effects: writes files under output_dir and prints progress.
    Already-existing output files are skipped (resumable).
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('model_type', '?')}")
    print(f" Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Index: sharded checkpoints carry a tensor-name -> shard-file map.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shards = sorted(set(index["weight_map"].values()))
        weight_map = index["weight_map"]
    else:
        shards = ["model.safetensors"]
        weight_map = None

    # Scan for quantum: sample 2000 |w| values per linear layer so the clip
    # percentile can be estimated without holding every weight in memory.
    print("\nScanning weights...")
    all_abs = []
    linear_names = []
    global_max = 0.0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        print(f" {shard}...")
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                t = f.get_tensor(name).float()
                # Heuristic: 2-D tensors that are not norms/embeddings are linears.
                if t.dim() == 2 and "norm" not in name and "embed" not in name:
                    linear_names.append(name)
                    am = t.abs().max().item()
                    if am > global_max: global_max = am
                    idx = torch.randint(0, t.numel(), (2000,))
                    all_abs.append(t.flatten()[idx].abs())

    all_abs_t = torch.cat(all_abs)
    clip_val = torch.quantile(all_abs_t, clip_pct / 100.0).item()
    quantum = clip_val / K  # one global step size shared by every layer

    print(f"\n Absmax={global_max:.6f} P{clip_pct}={clip_val:.6f}")
    print(f" K={K} quantum={quantum:.8f}")

    # Report the magnitude distribution implied by the chosen quantum.
    mags = (all_abs_t / quantum).round().clamp(0, K)
    print(f" Mean mag={mags.mean():.1f} Median={mags.median():.1f} Zero={100*(mags==0).float().mean():.1f}% Clipped={100*(mags==K).float().mean():.1f}%")

    del all_abs, all_abs_t, mags
    gc.collect()

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    total_unary = 0
    total_fp16 = 0
    total_clip = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)

        # Get linear names in this shard
        shard_lins = [n for n in linear_names if (weight_map or {}).get(n, "model.safetensors") == shard]
        print(f"\n{shard}: {len(shard_lins)} linear layers")

        with safe_open(path, framework="pt") as f:
            # Non-linear → FP16
            for name in f.keys():
                if name in linear_names:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half().numpy()
                    t.view(np.uint16).tofile(out_path)
                    # NOTE(review): on a resumed run an already-written fp16
                    # file is skipped here, so it is never re-added to the
                    # manifest or size totals — confirm this is intended.
                    total_fp16 += os.path.getsize(out_path)
                    manifest["fp16"].append(name)
                    print(f" FP16: {name} {t.shape}")

            # Linear → proper unary
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                # Resume path: both outputs exist, just account for them.
                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    t_shape = list(f.get_tensor(name).shape)
                    manifest["unary"][name] = t_shape
                    total_unary += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f" Skip: {name}")
                    continue

                t = f.get_tensor(name).float().numpy()
                rows, cols = t.shape
                print(f" {name} [{rows}x{cols}]", flush=True)

                sign_p, slots_p, clip_c = encode_fast(t, quantum, K)
                total_clip += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                mb = (s_sz + sl_sz) / 1e6
                print(f" → {mb:.1f} MB ({s_sz//1024}KB sign + {sl_sz//1024}KB slots)")

                # Free the full-precision matrix before loading the next one.
                del t, sign_p, slots_p
                gc.collect()

    # Copy tokenizer/config files
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    total = total_unary + total_fp16
    print(f"\n{'='*60}")
    print(f"DONE: {done} layers, quantum={quantum:.8f}, K={K}")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e6:.1f} MB")
    print(f" Total: {total/1e9:.2f} GB (vs ~7.6 GB BF16 = {total/7.6e9:.1f}x)")
    print(f" Clipped: {total_clip} values")
    print(f"{'='*60}")
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
if __name__ == "__main__":
    # CLI: convert_fast.py [model_dir] [output_dir] [K] [clip_pct]
    defaults = ["qwen3-4b-thinking-hf", "qwen3-4b-proper-unary", "32", "99.9"]
    argv = sys.argv[1:] + defaults[len(sys.argv) - 1:]
    convert(argv[0], argv[1], K=int(argv[2]), clip_pct=float(argv[3]))
|
convert_log_unary.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Log-unary converter.
|
| 4 |
+
Instead of thermometer (plane p = mag > p), uses binary decomposition
|
| 5 |
+
(plane p = bit p of magnitude). Fewer planes, wider dynamic range.
|
| 6 |
+
|
| 7 |
+
3 log-planes: 9 levels (-4 to +4), storage = 3 bitplanes
|
| 8 |
+
vs 7 linear planes: 15 levels (-7 to +7), storage = 7 bitplanes
|
| 9 |
+
|
| 10 |
+
4 log-planes: 17 levels (-8 to +8), storage = 4 bitplanes <-- sweet spot
|
| 11 |
+
5 log-planes: 33 levels (-16 to +16), storage = 5 bitplanes
|
| 12 |
+
|
| 13 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 14 |
+
"""
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os, sys, json, time, gc
|
| 17 |
+
|
| 18 |
+
def quantize_log_unary(w_fp32, n_planes):
    """Quantize a weight matrix into log-unary form.

    Each magnitude is stored as its binary digits spread across n_planes
    bitplanes (plane p holds bit p of |q|), plus one sign bitplane and a
    per-row float32 scale. Returns (sign_u64, plane_u64, scales).
    """
    n_rows, n_cols = w_fp32.shape
    top = (1 << n_planes) - 1  # largest representable magnitude

    # Per-row scale so each row's absmax maps onto `top`.
    row_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.flatten() / top).astype(np.float32)

    # Round to signed integer levels in [-top, top].
    q = np.clip(np.round(w_fp32 / row_max * top), -top, top).astype(np.int32)
    neg = q < 0
    mag = np.abs(q)

    # Pad the column axis to a whole number of 64-bit chunks.
    n_chunks = (n_cols + 63) // 64
    width = n_chunks * 64
    if width > n_cols:
        pad = ((0, 0), (0, width - n_cols))
        neg = np.pad(neg, pad, constant_values=False)
        mag = np.pad(mag, pad, constant_values=0)

    def to_u64(bits):
        # Pack one bitplane into little-endian uint64 chunks.
        packed = np.packbits(bits.astype(np.uint8), axis=1, bitorder='little')
        return packed.view(np.uint64)[:, :n_chunks]

    sign_u64 = to_u64(neg)
    plane_u64 = np.zeros((n_planes, n_rows, n_chunks), dtype=np.uint64)
    for p in range(n_planes):
        plane_u64[p] = to_u64((mag >> p) & 1)

    return sign_u64, plane_u64, scales
|
| 54 |
+
|
| 55 |
+
def convert_model(model_dir, output_dir, n_planes=4):
    """Convert a safetensors checkpoint to log-unary bitplane files.

    Linear projection weights ("proj" in the name, 2-D) become one
    .sign/.planes/.scales triple each; every other tensor is dumped as
    raw FP16 bytes. A manifest.json describing both sets is written at
    the end. Existing output files are skipped (resumable).
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    max_level = (1 << n_planes) - 1  # largest magnitude n_planes bits can hold

    # Resolve shard list: use the index when sharded, otherwise glob.
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"LOG-UNARY CONVERSION")
    print(f" Model: {n_layers} layers, hidden={hidden}")
    print(f" Log-planes: {n_planes} -> {2*max_level+1} levels (range -{max_level}..+{max_level})")
    print(f" Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers,
                "encoding": "log_unary", "config": config}

    # Progress denominator only; empty when no index exists.
    total_linear = sum(1 for k in (weight_map or {}) if k.endswith(".weight") and "proj" in k)
    converted = 0

    # Deferred imports keep module import light when only helpers are used.
    import torch
    from safetensors import safe_open

    for si, shard in enumerate(shards):
        path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {si+1}/{len(shards)}: {shard} ===")

        with safe_open(path, framework="pt") as f:
            for key in sorted(f.keys()):
                fname = key.replace(".", "_")
                # NOTE(review): get_tensor here loads the tensor just to test
                # its rank, and again below — doubles shard I/O per key.
                is_linear = key.endswith(".weight") and "proj" in key and f.get_tensor(key).dim() == 2

                if is_linear:
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        manifest["unary"][key] = list(f.get_tensor(key).shape)
                        converted += 1
                        print(f" [SKIP] {key}")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_log_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    manifest["unary"][key] = list(w.shape)
                    converted += 1
                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    print(f" [{converted}/{total_linear}] {key}: {list(w.shape)} "
                          f"-> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")
                    del w, sign, planes, scales
                else:
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        manifest["fp16"][key] = list(f.get_tensor(key).shape)
                        print(f" [SKIP] {key}")
                        continue

                    # Raw IEEE-754 half bits, written via the uint16 view.
                    w = f.get_tensor(key).float().numpy()
                    w_fp16 = w.astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(w.shape)
                    print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del w, w_fp16

                gc.collect()

    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy tokenizer/config alongside the converted weights.
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    # Final size summary computed from what is actually on disk.
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir) if f.endswith((".sign",".planes",".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir) if f.endswith(".fp16"))

    print(f"\n=== LOG-UNARY CONVERSION COMPLETE ===")
    print(f" Encoding: {n_planes} log-planes (binary magnitude)")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e9:.2f} GB")
    print(f" Total: {(total_unary+total_fp16)/1e9:.2f} GB")
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
    # CLI: convert_log_unary.py [model_dir] [output_dir] [n_planes]
    args = sys.argv[1:]
    model_dir = args[0] if len(args) >= 1 else "qwen3-4b-thinking-hf"
    output_dir = args[1] if len(args) >= 2 else "qwen3-4b-log-unary"
    n_planes = int(args[2]) if len(args) >= 3 else 4
    convert_model(model_dir, output_dir, n_planes)
|
convert_proper_unary.py
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert Qwen3-4B BF16 safetensors → Proper Unary.
|
| 4 |
+
Reads safetensors raw bytes (no framework dependency for BF16).
|
| 5 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 6 |
+
"""
|
| 7 |
+
import numpy as np
|
| 8 |
+
import json, os, sys, gc, shutil, struct, time
|
| 9 |
+
|
| 10 |
+
class SafeTensorReader:
    """Read safetensors one tensor at a time (memory efficient).

    Parses the 8-byte little-endian header length, then the JSON header,
    and seeks/reads individual tensor payloads on demand. Supports BF16,
    F16 and F32 tensors, always returning float32 numpy arrays.

    Also usable as a context manager so the file handle is released even
    when conversion raises mid-shard.
    """

    def __init__(self, path):
        self.f = open(path, "rb")
        header_size = struct.unpack("<Q", self.f.read(8))[0]
        self.header = json.loads(self.f.read(header_size).decode("utf-8"))
        self.data_start = 8 + header_size
        # Tensor entries only; "__metadata__" is free-form file metadata.
        self._meta = {k: v for k, v in self.header.items() if k != "__metadata__"}

    def keys(self):
        """Return the list of tensor names in this shard."""
        return list(self._meta.keys())

    def get(self, name):
        """Load tensor `name` and return it as a float32 ndarray.

        Raises KeyError for unknown names and ValueError for dtypes other
        than BF16/F16/F32.
        """
        meta = self._meta[name]
        dtype = meta["dtype"]
        shape = tuple(meta["shape"])
        start, end = meta["data_offsets"]
        self.f.seek(self.data_start + start)
        raw = self.f.read(end - start)

        if dtype == "BF16":
            # BF16 is the top 16 bits of an IEEE float32: widen and shift.
            u16 = np.frombuffer(raw, dtype=np.uint16)
            u32 = u16.astype(np.uint32) << 16
            return u32.view(np.float32).reshape(shape)
        elif dtype == "F16":
            return np.frombuffer(raw, dtype=np.float16).reshape(shape).astype(np.float32)
        elif dtype == "F32":
            # .copy() detaches from the read-only frombuffer view.
            return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
        else:
            raise ValueError(f"Unknown dtype {dtype}")

    def close(self):
        """Close the underlying file handle."""
        self.f.close()

    # Context-manager protocol: `with SafeTensorReader(p) as r: ...`
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False
|
| 43 |
+
|
| 44 |
+
def encode_proper_unary(weight_f32, K):
    """Encode a 2-D float32 matrix to proper unary (thermometer) form.

    Per-row scaling: each row's absmax maps onto magnitude K.
    Returns (sign_bits [rows, chunks] uint64,
             slot_planes [K, rows, chunks] uint64,
             row_scales [rows] float32), chunks = ceil(cols/64).

    Vectorized: the previous per-column Python loop was O(cols*K)
    interpreter iterations; np.packbits now packs each bitplane in a
    single C pass with identical output.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    width = chunks * 64

    row_absmax = np.abs(weight_f32).max(axis=1).astype(np.float32)
    row_absmax = np.maximum(row_absmax, 1e-10)  # guard all-zero rows
    row_scales = (row_absmax / K).astype(np.float32)

    inv_scales = K / row_absmax
    magnitudes = np.clip(
        np.round(np.abs(weight_f32) * inv_scales[:, None]).astype(np.int32), 0, K)

    def _pack(mask):
        # Pad to a multiple of 64 columns, pack little-endian so bit j of
        # chunk c maps to column c*64+j, then view bytes as uint64 words.
        # NOTE(review): the view assumes a little-endian host, matching the
        # raw .tofile consumers.
        buf = np.zeros((rows, width), dtype=np.uint8)
        buf[:, :cols] = mask
        return np.packbits(buf, axis=1, bitorder="little").view(np.uint64)

    sign_bits = _pack(weight_f32 < 0)
    slot_planes = np.empty((K, rows, chunks), dtype=np.uint64)
    for s in range(K):
        # Thermometer plane s: set where magnitude > s.
        slot_planes[s] = _pack(magnitudes > s)

    return sign_bits, slot_planes, row_scales
|
| 79 |
+
|
| 80 |
+
def convert_model(model_dir, output_dir, K=32):
    """Convert a checkpoint to proper-unary files using SafeTensorReader.

    Linear projection weights (q/k/v/o/gate/up/down_proj, 2-D) become one
    .sign/.slots/.scales triple each; everything else is written as raw
    FP16 bytes. Writes manifest.json and copies tokenizer/config files.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = json.load(open(os.path.join(model_dir, "config.json")))

    # Copy auxiliary files so output_dir is a self-contained model dir.
    for f in ["config.json", "tokenizer.json", "tokenizer_config.json",
              "special_tokens_map.json", "generation_config.json"]:
        src = os.path.join(model_dir, f)
        if os.path.exists(src):
            shutil.copy2(src, output_dir)

    # Resolve shard list from the index when the checkpoint is sharded.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shard_files = sorted(set(index["weight_map"].values()))
    else:
        shard_files = ["model.safetensors"]

    # Projection-name suffixes that identify quantizable linear layers.
    linear_names = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]

    manifest = {"K": K, "format": "proper_unary", "unary": {}, "fp16": []}
    total_linear = 0
    total_size = 0

    for shard_name in shard_files:
        shard_path = os.path.join(model_dir, shard_name)
        print(f"\n=== {shard_name} ===", flush=True)

        reader = SafeTensorReader(shard_path)
        print(f" {len(reader.keys())} tensors", flush=True)

        for key in sorted(reader.keys()):
            tensor = reader.get(key)
            fname = key.replace(".", "_")

            is_linear = any(ln + ".weight" in key for ln in linear_names)

            if is_linear and tensor.ndim == 2:
                rows, cols = tensor.shape
                t0 = time.time()
                print(f" {key}: {rows}x{cols} K={K}...", end="", flush=True)

                sign_bits, slot_planes, row_scales = encode_proper_unary(tensor, K)
                dt = time.time() - t0

                sign_bits.tofile(os.path.join(output_dir, fname + ".sign"))
                slot_planes.tofile(os.path.join(output_dir, fname + ".slots"))
                row_scales.tofile(os.path.join(output_dir, fname + ".scales"))

                manifest["unary"][key] = [rows, cols]
                sz = sign_bits.nbytes + slot_planes.nbytes + row_scales.nbytes
                total_size += sz
                total_linear += 1

                # Size relative to the BF16 original (2 bytes per weight).
                ratio = sz / (rows * cols * 2)
                print(f" {sz/1e6:.1f}MB ({ratio:.1f}x) [{dt:.0f}s]", flush=True)

                del sign_bits, slot_planes, row_scales
            else:
                # FP16
                t_f16 = tensor.astype(np.float16)
                out_data = t_f16.view(np.uint16)
                out_data.tofile(os.path.join(output_dir, fname + ".fp16"))
                manifest["fp16"].append(key)
                sz = out_data.nbytes
                total_size += sz
                print(f" {key}: {tensor.shape} -> FP16 ({sz/1e6:.1f}MB)", flush=True)
                del t_f16, out_data

            del tensor

        reader.close()
        gc.collect()

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    print(f"\n{'='*50}", flush=True)
    print(f"DONE: {total_linear} layers, K={K}", flush=True)
    print(f"Total: {total_size/1e9:.2f} GB (orig ~7.6 GB, ratio {total_size/7.6e9:.1f}x)", flush=True)
|
| 159 |
+
|
| 160 |
+
if __name__ == "__main__":
    # CLI: convert_proper_unary.py [model_dir] [output_dir] [K]
    args = sys.argv[1:]
    model_dir = args[0] if len(args) >= 1 else "/root/ternary_engine/qwen3-4b-thinking-hf"
    output_dir = args[1] if len(args) >= 2 else "/root/ternary_engine/qwen3-4b-proper-unary"
    K = int(args[2]) if len(args) >= 3 else 32
    convert_model(model_dir, output_dir, K)
|
convert_proper_unary_v2.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
PROPER UNARY CONVERTER — Global quantum, torch-based, BF16 support
|
| 4 |
+
|
| 5 |
+
Clips at P99.9 of |weights| instead of absmax to avoid wasting
|
| 6 |
+
quantization range on rare outliers. Values above clip point
|
| 7 |
+
saturate at K (still represented, just capped).
|
| 8 |
+
|
| 9 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import torch, json, os, sys, gc, shutil
|
| 13 |
+
from safetensors import safe_open
|
| 14 |
+
import numpy as np
|
| 15 |
+
|
| 16 |
+
def scan_all_linears(model_dir):
    """Walk every shard and collect stats over the 2-D linear weights.

    Returns (global_absmax, sampled_abs_values, linear_tensor_names,
    shard_list); 2000 |w| values are sampled per linear layer.
    """
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        shard_list = sorted(set(json.load(open(index_path))["weight_map"].values()))
    else:
        shard_list = ["model.safetensors"]

    samples = []
    names = []
    absmax = 0.0

    for shard in shard_list:
        print(f" Scanning {shard}...")
        with safe_open(os.path.join(model_dir, shard), framework="pt") as fh:
            for tname in fh.keys():
                w = fh.get_tensor(tname).float()
                # Only 2-D matmul weights; skip norms and embeddings.
                if w.dim() != 2 or "norm" in tname or "embed" in tname:
                    continue
                names.append(tname)
                absmax = max(absmax, w.abs().max().item())
                pick = torch.randint(0, w.numel(), (2000,))
                samples.append(w.flatten()[pick].abs())

    return absmax, torch.cat(samples), names, shard_list
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def encode_to_proper_unary_torch(weight_f32, quantum, K):
    """
    Encode a [rows, cols] float32 torch tensor to proper unary with one
    global quantum.

    Returns (sign_packed [rows, chunks] uint64,
             slots_packed [K, rows, chunks] uint64,
             clip_count — values whose unrounded magnitude exceeded K).

    Vectorized: the previous per-column Python loop was O(cols*K)
    interpreter iterations; np.packbits now packs each bitplane in one
    C pass with identical output (per-column progress printing dropped).
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    width = chunks * 64

    inv_q = 1.0 / quantum
    magnitudes = (weight_f32.abs() * inv_q).round().long().clamp(0, K)
    signs = weight_f32 < 0
    # Clip count is measured before rounding, as in the original.
    clip_count = int((weight_f32.abs() * inv_q > K).sum().item())

    mags_np = magnitudes.numpy()
    signs_np = signs.numpy()

    def _pack_plane(mask):
        # Pad to a multiple of 64 columns, pack little-endian so bit j of
        # chunk c corresponds to column c*64+j, then view bytes as uint64.
        # NOTE(review): the view assumes a little-endian host.
        buf = np.zeros((rows, width), dtype=np.uint8)
        buf[:, :cols] = mask
        return np.packbits(buf, axis=1, bitorder="little").view(np.uint64)

    sign_packed = _pack_plane(signs_np)
    slots_packed = np.empty((K, rows, chunks), dtype=np.uint64)
    for p in range(K):
        # Thermometer plane p: set where magnitude > p.
        slots_packed[p] = _pack_plane(mags_np > p)

    print(f" {cols}/{cols} done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert an HF safetensors checkpoint to the proper-unary on-disk format.

    A single global quantum is chosen from the clip_pct percentile of sampled
    |weight| values; each linear weight is encoded as one sign bitplane plus K
    unary slot bitplanes (.usign / .uslots).  All other tensors (norms,
    embeddings, biases) are dumped as raw FP16.  Existing output files are
    skipped so an interrupted run can resume.

    Args:
        model_dir:  HF model directory with config.json + safetensors shard(s).
        output_dir: destination for converted files + manifest.json.
        K:          unary slots (bits) per weight magnitude.
        clip_pct:   percentile of |weight| that maps to magnitude K.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('_name_or_path', config.get('model_type', '?'))}")
    print(f"  Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Pass 1: scan all shards for the global absmax and a sample of |weights|.
    print("\nScanning weights...")
    global_max, all_abs, linear_names, shards = scan_all_linears(model_dir)
    linear_set = set(linear_names)  # O(1) membership below (was a list scan)

    # Quantum: the clip_pct percentile of |w| maps to exactly K slots.
    clip_val = torch.quantile(all_abs, clip_pct / 100.0).item()
    quantum = clip_val / K

    print(f"\n  Global absmax: {global_max:.6f}")
    print(f"  P{clip_pct} clip: {clip_val:.6f}")
    print(f"  K = {K}")
    print(f"  Quantum = {quantum:.8f}")
    print(f"  Values > clip ({clip_pct}%): saturate at K={K}")

    # Magnitude distribution (computed from the sample) with the chosen quantum.
    mags = (all_abs / quantum).round().clamp(0, K)
    print(f"\n  Mean magnitude: {mags.mean():.1f} slots")
    print(f"  Median: {mags.median():.1f} slots")
    print(f"  Zero fraction: {100*(mags==0).float().mean():.1f}%")
    print(f"  At K (clipped): {100*(mags==K).float().mean():.1f}%")
    print(f"  Unique levels: {len(mags.unique())} / {K+1}")

    # Memory estimate: K slot bits + 1 sign bit per element vs 16-bit BF16.
    bits_per_elem = K + 1
    ratio = bits_per_elem / 16.0
    print(f"\n  Bits per weight: {bits_per_elem}")
    print(f"  vs BF16 (16 bit): {ratio:.1f}x")
    print(f"  Original: ~7.6 GB → Estimated: ~{7.6 * ratio:.1f} GB")

    # Map each tensor name to its shard (single-file models have no index).
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        weight_map = json.load(open(index_path))["weight_map"]
    else:
        weight_map = None

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    # Group the linear layers by the shard that holds them.
    shard_linears = {}
    for name in linear_names:
        shard = weight_map[name] if weight_map else "model.safetensors"
        shard_linears.setdefault(shard, []).append(name)

    total_unary_bytes = 0
    total_fp16_bytes = 0
    total_clipped = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        shard_lins = shard_linears.get(shard, [])
        print(f"\nProcessing {shard} ({len(shard_lins)} linear layers)...")

        with safe_open(path, framework="pt") as f:
            all_keys = list(f.keys())

            # Non-linear weights -> FP16.
            for name in all_keys:
                if name in linear_set:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half()
                    t.numpy().view(np.uint16).tofile(out_path)
                    print(f"  FP16: {name} {list(t.shape)} ({os.path.getsize(out_path)//1024}KB)")
                # BUGFIX: record the manifest entry and byte count even when
                # the file already exists from a previous (interrupted) run.
                # The old code only did this on a fresh write, so resumed runs
                # produced a manifest with missing fp16 entries and
                # undercounted total_fp16_bytes.
                total_fp16_bytes += os.path.getsize(out_path)
                manifest["fp16"].append(name)

            # Linear weights -> proper unary.
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    # Already converted: still record shape and size totals.
                    t = f.get_tensor(name)
                    manifest["unary"][name] = list(t.shape)
                    total_unary_bytes += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f"  Skip: {name}")
                    continue

                t = f.get_tensor(name).float()
                rows, cols = t.shape
                print(f"  Converting: {name} [{rows}x{cols}]...", flush=True)

                sign_p, slots_p, clip_c = encode_to_proper_unary_torch(t, quantum, K)
                total_clipped += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary_bytes += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                print(f"    sign={s_sz//1024}KB slots={sl_sz//1024}KB total={(s_sz+sl_sz)//1024//1024}MB")

                # Release the big buffers before the next layer.
                del t, sign_p, slots_p
                gc.collect()

    # Copy config / tokenizer files alongside the converted weights.
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    manifest_path = os.path.join(output_dir, "manifest.json")
    json.dump(manifest, open(manifest_path, "w"), indent=2)

    total = total_unary_bytes + total_fp16_bytes
    print(f"\n{'='*60}")
    print(f"PROPER UNARY CONVERSION COMPLETE")
    print(f"{'='*60}")
    print(f"  Quantum: {quantum:.8f}")
    print(f"  K: {K}")
    print(f"  Clip at P{clip_pct}: {clip_val:.6f}")
    print(f"  Linear layers: {done}")
    print(f"  Clipped vals: {total_clipped}")
    print(f"  Unary: {total_unary_bytes/1e9:.2f} GB")
    print(f"  FP16 (norms): {total_fp16_bytes/1e6:.1f} MB")
    print(f"  Total: {total/1e9:.2f} GB")
    print(f"  Original BF16: ~7.6 GB")
    print(f"  Ratio: {total/7.6e9:.1f}x")
    print(f"  Output dir: {output_dir}")
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [K] [clip_pct]
    argv = sys.argv[1:]
    src_dir = argv[0] if len(argv) >= 1 else "qwen3-4b-thinking-hf"
    dst_dir = argv[1] if len(argv) >= 2 else "qwen3-4b-proper-unary"
    k_slots = int(argv[2]) if len(argv) >= 3 else 32
    clip_percentile = float(argv[3]) if len(argv) >= 4 else 99.9

    convert(src_dir, dst_dir, K=k_slots, clip_pct=clip_percentile)
|
convert_qwen3.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unary converter for Qwen3 models.
|
| 4 |
+
Converts safetensors to unary bitplane format.
|
| 5 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 6 |
+
"""
|
| 7 |
+
import numpy as np
|
| 8 |
+
import os, sys, json, time
|
| 9 |
+
|
| 10 |
+
def load_safetensors_torch(model_dir):
    """Load every .safetensors shard in *model_dir* as float32 numpy arrays.

    Uses the torch backend of safetensors so BF16 tensors can be widened to
    FP32 before conversion to numpy.  Returns a {tensor_name: ndarray} dict.
    """
    import torch
    from safetensors import safe_open

    weights = {}
    shard_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.safetensors'))
    print(f"Loading {len(shard_files)} shard(s)...")

    for shard_name in shard_files:
        print(f"  {shard_name}...")
        shard_path = os.path.join(model_dir, shard_name)
        with safe_open(shard_path, framework="pt") as fh:
            for key in fh.keys():
                # Widen (BF16 -> FP32) inside torch, then hand off to numpy.
                weights[key] = fh.get_tensor(key).float().numpy()
    return weights
|
| 27 |
+
|
| 28 |
+
def quantize_unary_vectorized(w_fp32, n_planes):
    """Quantize a weight matrix to sign + unary bitplanes (per-row scaling).

    Each row is scaled so its absolute maximum maps to n_planes, rounded to an
    integer in [-n_planes, n_planes], and stored as a negative-sign bitmask
    plus n_planes "magnitude > p" bitplanes packed into uint64 chunks.

    Returns:
        (sign_u64 [out_dim, chunks], plane_bits [n_planes, out_dim, chunks],
         scales [out_dim]) -- dequant per element is
         sign * popcount-over-planes * row scale.
    """
    out_dim, in_dim = w_fp32.shape

    # Per-row scale: row abs-max maps to n_planes (all-zero rows guarded).
    row_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    q = np.clip(np.round(w_fp32 / row_max * n_planes), -n_planes, n_planes).astype(np.int32)
    scales = (row_max.flatten() / n_planes).astype(np.float32)

    neg_mask = q < 0          # True where the quantized value is negative
    mag = np.abs(q)           # 0 .. n_planes

    # Pad the column axis up to a whole number of 64-bit chunks.
    chunks = (in_dim + 63) // 64
    pad = chunks * 64 - in_dim
    if pad:
        neg_mask = np.pad(neg_mask, ((0, 0), (0, pad)), constant_values=False)
        mag = np.pad(mag, ((0, 0), (0, pad)), constant_values=0)

    def _to_u64(mask):
        # Little-endian packbits + uint64 view: column j -> bit j%64 of chunk j//64.
        return np.packbits(mask.astype(np.uint8), axis=1, bitorder='little').view(np.uint64)[:, :chunks]

    sign_u64 = _to_u64(neg_mask)
    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        plane_bits[p] = _to_u64(mag > p)

    return sign_u64, plane_bits, scales
|
| 67 |
+
|
| 68 |
+
def convert_model(model_dir, output_dir, n_planes=7):
    """Convert a Qwen3 model to unary format (whole-model-in-memory variant).

    Loads every shard, writes 2-D projection ("proj") weights as
    .sign/.planes/.scales unary files and everything else as raw FP16, then
    emits manifest.json + config.json and prints a size summary.

    Args:
        model_dir:  HF model directory with config.json + safetensors shard(s).
        output_dir: destination directory (created if missing).
        n_planes:   number of unary magnitude bitplanes.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")

    weights = load_safetensors_torch(model_dir)
    print(f"Loaded {len(weights)} tensors")

    # Linear layers = 2-D projection weight matrices; everything else stays FP16.
    linear_keys = [k for k in weights if k.endswith(".weight") and weights[k].ndim == 2
                   and ("proj" in k)]
    linear_set = set(linear_keys)  # O(1) membership for the FP16 pass below

    manifest = {"unary": {}, "fp16": {}}

    # Convert linear layers to unary.
    total = len(linear_keys)
    for idx, key in enumerate(sorted(linear_keys)):
        w = weights[key]
        t0 = time.time()
        sign, planes, scales = quantize_unary_vectorized(w, n_planes)
        dt = time.time() - t0

        # Flatten the dotted tensor name for the filesystem.
        fname = key.replace(".", "_")
        np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
        np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
        np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

        manifest["unary"][key] = list(w.shape)
        # BUGFIX: removed the unused per-layer "sparsity" recomputation -- it
        # made a second full pass over w and divided by an unguarded per-row
        # abs-max (NaN/inf for all-zero rows), and its result was never used.
        orig_mb = w.nbytes / 1e6
        comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
        print(f"  [{idx+1}/{total}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

    # Save FP16 weights (norms, embeddings, QK-norms).
    fp16_keys = [k for k in weights if k not in linear_set]
    for key in sorted(fp16_keys):
        w = weights[key]
        fname = key.replace(".", "_")
        w_fp16 = w.astype(np.float16)
        w_fp16.view(np.uint16).tofile(os.path.join(output_dir, f"{fname}.fp16"))
        manifest["fp16"][key] = list(w.shape)
        print(f"  [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")

    # Save manifest and config.
    manifest["n_planes"] = n_planes
    manifest["n_layers"] = n_layers
    manifest["config"] = config
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    import shutil
    shutil.copy(os.path.join(model_dir, "config.json"), os.path.join(output_dir, "config.json"))

    # Size summary.
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))
    orig_total = sum(w.nbytes for w in weights.values())

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Original FP32: {orig_total/1e9:.2f} GB")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
    print(f"Compression: {orig_total/(total_unary+total_fp16):.1f}x")
|
| 144 |
+
|
| 145 |
+
if __name__ == "__main__":
    # CLI: convert_qwen3.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv[1:]
    src_dir = argv[0] if len(argv) >= 1 else "qwen3-4b-thinking-hf"
    dst_dir = argv[1] if len(argv) >= 2 else "qwen3-4b-thinking-unary"
    planes = int(argv[2]) if len(argv) >= 3 else 7
    convert_model(src_dir, dst_dir, planes)
|
convert_qwen3_v2.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Memory-efficient unary converter for Qwen3.
|
| 4 |
+
Processes one safetensors shard at a time to avoid OOM.
|
| 5 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 6 |
+
"""
|
| 7 |
+
import numpy as np
|
| 8 |
+
import os, sys, json, time, gc
|
| 9 |
+
|
| 10 |
+
def quantize_unary(w_fp32, n_planes):
    """Quantize a weight matrix to sign bits + n_planes unary bitplanes.

    Per-row scaling: each row's absolute max maps to n_planes.  Bitplane p
    holds a set bit wherever |q| > p, so popcount over the planes
    reconstructs the quantized magnitude.
    """
    out_dim, in_dim = w_fp32.shape

    amax = np.abs(w_fp32).max(axis=1, keepdims=True)
    amax = np.where(amax == 0, 1.0, amax)  # guard all-zero rows
    q = np.clip(np.round(w_fp32 / amax * n_planes), -n_planes, n_planes).astype(np.int32)

    scales = (amax.flatten() / n_planes).astype(np.float32)
    neg = q < 0
    mag = np.abs(q)

    # Pad the column axis to a whole number of uint64 chunks.
    chunks = (in_dim + 63) // 64
    tail = chunks * 64 - in_dim
    if tail:
        neg = np.pad(neg, ((0, 0), (0, tail)), constant_values=False)
        mag = np.pad(mag, ((0, 0), (0, tail)), constant_values=0)

    def _pack(mask):
        # Little-endian packing: column j -> bit j%64 of chunk j//64.
        return np.packbits(mask.astype(np.uint8), axis=1, bitorder='little').view(np.uint64)[:, :chunks]

    sign_u64 = _pack(neg)
    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        plane_bits[p] = _pack(mag > p)

    return sign_u64, plane_bits, scales
|
| 41 |
+
|
| 42 |
+
def convert_model(model_dir, output_dir, n_planes=7):
    """Shard-at-a-time unary converter (resumable; low peak memory).

    Projection 2-D weights become .sign/.planes/.scales unary files, all
    other tensors are dumped as raw FP16.  Output files that already exist
    are skipped, so an interrupted run can be restarted.

    Args:
        model_dir:  HF model directory with config.json + safetensors shard(s).
        output_dir: destination directory (created if missing).
        n_planes:   number of unary magnitude bitplanes.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]

    # Shard list: from the index when present, else every .safetensors file.
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        # Single shard
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")
    print(f"Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers, "config": config}
    total_converted = 0
    total_linear = 0

    # Count total linear layers (for progress display only).
    if weight_map:
        total_linear = sum(1 for k in weight_map if k.endswith(".weight") and "proj" in k)
        print(f"Total linear layers to convert: {total_linear}")

    import torch
    from safetensors import safe_open

    for shard_idx, shard in enumerate(shards):
        shard_path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {shard_idx+1}/{len(shards)}: {shard} ===")

        with safe_open(shard_path, framework="pt") as f:
            keys = list(f.keys())
            print(f"  {len(keys)} tensors in shard")

            for key in sorted(keys):
                fname = key.replace(".", "_")
                # BUGFIX: fetch each tensor exactly once.  The old code called
                # f.get_tensor(key) separately for the dim() check, again on
                # the skip path (just for shape), and a third time for the
                # data -- loading every tensor's bytes 2-3x per run.
                t = f.get_tensor(key)
                is_linear = key.endswith(".weight") and "proj" in key and t.dim() == 2

                if is_linear:
                    # Skip if already converted (resume support).
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        manifest["unary"][key] = list(t.shape)
                        total_converted += 1
                        print(f"  [SKIP] {key} already converted")
                        del t
                        continue

                    w = t.float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    total_converted += 1
                    manifest["unary"][key] = list(w.shape)
                    print(f"  [{total_converted}/{total_linear}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

                    del t, w, sign, planes, scales
                else:
                    # FP16 weight (norms, embeddings, etc).
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        manifest["fp16"][key] = list(t.shape)
                        print(f"  [SKIP] {key} already saved")
                        del t
                        continue

                    w_fp16 = t.float().numpy().astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(t.shape)
                    print(f"  [FP16] {key}: {list(t.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del t, w_fp16

        # Force GC between shards to keep peak RSS down.
        gc.collect()
        print(f"  Shard done, memory freed")

    # Save manifest.
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy config + tokenizer files.
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    # Summary.
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
|
| 156 |
+
|
| 157 |
+
if __name__ == "__main__":
    # CLI: convert_qwen3_v2.py [model_dir] [output_dir] [n_planes]
    cli = sys.argv[1:]
    source = cli[0] if len(cli) >= 1 else "qwen3-4b-thinking-hf"
    target = cli[1] if len(cli) >= 2 else "qwen3-4b-thinking-unary"
    plane_count = int(cli[2]) if len(cli) >= 3 else 7
    convert_model(source, target, plane_count)
|
deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales
ADDED
|
Binary file (35.8 kB). View file
|
|
|
deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16
ADDED
|
Binary file (3.07 kB). View file
|
|
|
deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales
ADDED
|
Binary file (1.02 kB). View file
|
|
|
deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg
ADDED
|
Binary file (49.2 kB). View file
|
|
|
deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
E�=�r�=ě�=�6�=e��=)�=ڍ=Q�=룋=�d�=n��=L��=q2�=
|
| 2 |
+
�=D�=��=�=S�=���=�A�=�=\%�=�F�= J�=��=6v�=bՃ=a
|
| 3 |
+
�=���=ƞn=�P�=~j�=��=�*�=ő=�(�=���=J6�=N+�=Cs�=�S�=Uu�=Vԋ=�і=� �=�ɓ=Zړ=^;�=�1�=s�=�=Uj�=�v=ڱ�=���=b�=c@�=W��=C)�=�܋=�,�=ᵓ=Z~�=�`�=��=i��=0W�=�.�=���=�َ=S��=�H�=0�=���=�0�=6'�=��=�<�=沕=��=���=�Љ=ޏ=H5�=f�~=z;�=�u�=���=�"�=��=�i�=+~�<�-�=�g�=�)�=�u�=��=��=ܨ�=�k�=~[�=�=�l�=���=G(�=��=�/�=��=6Y�=1 �=�= Ջ=2�=��=蓏=)G�=.l�=N�=:��=.��=���=���=WA�=�>�=ѡ�=�ӏ=��=U�=㟙=q:�=F�=T��=���=o�y=�Ѝ=+
|
deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos
ADDED
|
Binary file (49.2 kB). View file
|
|
|
inference.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Ternary Transformer Inference Engine
|
| 4 |
+
|
| 5 |
+
Full Qwen2 architecture inference using ternary (1.58-bit) linear layers
|
| 6 |
+
with AVX-512 optimized kernels. Zero multiplications in linear layers.
|
| 7 |
+
|
| 8 |
+
Architecture: DeepSeek-R1-Distill-Qwen-1.5B
|
| 9 |
+
- 28 layers, hidden=1536, intermediate=8960
|
| 10 |
+
- GQA: 12 heads, 2 KV heads, head_dim=128
|
| 11 |
+
- SwiGLU MLP, RoPE, RMSNorm
|
| 12 |
+
|
| 13 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import json
|
| 18 |
+
import ctypes
|
| 19 |
+
import numpy as np
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
import time
|
| 22 |
+
|
| 23 |
+
# ============================================================
|
| 24 |
+
# Load C kernel
|
| 25 |
+
# ============================================================
|
| 26 |
+
def load_kernel(so_path="ternary_kernel.so"):
    """Load the compiled AVX-512 kernel and declare its C call signatures.

    Every exported kernel returns void; only argtypes differ, so the
    declarations are table-driven.
    """
    lib = ctypes.CDLL(so_path)

    # function name -> argtypes (all restype None).
    signatures = {
        "ternary_matvec_avx512": [
            ctypes.c_void_p,  # pos_bits
            ctypes.c_void_p,  # neg_bits
            ctypes.c_void_p,  # scales
            ctypes.c_void_p,  # x
            ctypes.c_void_p,  # y
            ctypes.c_int,     # out_dim
            ctypes.c_int,     # in_dim
        ],
        "rmsnorm_avx512": [
            ctypes.c_void_p,  # x
            ctypes.c_void_p,  # weight
            ctypes.c_void_p,  # y
            ctypes.c_int,     # dim
            ctypes.c_float,   # eps
        ],
        "silu_avx512": [ctypes.c_void_p, ctypes.c_int],
        "elemwise_mul_avx512": [
            ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
        ],
        "softmax": [ctypes.c_void_p, ctypes.c_int],
        "apply_rope": [
            ctypes.c_void_p, ctypes.c_void_p,
            ctypes.c_int, ctypes.c_int, ctypes.c_int,
            ctypes.c_int, ctypes.c_float,
        ],
    }

    for fn_name, argtypes in signatures.items():
        fn = getattr(lib, fn_name)
        fn.restype = None
        fn.argtypes = argtypes

    return lib
|
| 74 |
+
|
| 75 |
+
# ============================================================
|
| 76 |
+
# Ternary Linear Layer
|
| 77 |
+
# ============================================================
|
| 78 |
+
class TernaryLinear:
    """Ternary (-1/0/+1) linear layer backed by the C AVX-512 kernel.

    Weights live in two uint64 bitmask planes (positive / negative hits per
    input column) plus a float32 scale vector; the matvec itself runs in the
    compiled kernel, so this class only marshals pointers.
    """

    def __init__(self, pos_bits, neg_bits, scales, out_dim, in_dim, kernel):
        self.pos = pos_bits       # uint64 contiguous bitplane array
        self.neg = neg_bits       # uint64 contiguous bitplane array
        self.scales = scales      # float32 per-row scales
        self.out_dim = out_dim
        self.in_dim = in_dim
        self.kernel = kernel      # ctypes-loaded library (see load_kernel)

    def forward(self, x):
        """Compute the matvec.  x: float32[in_dim] -> float32[out_dim]."""
        out = np.zeros(self.out_dim, dtype=np.float32)
        call_args = (
            self.pos.ctypes.data,
            self.neg.ctypes.data,
            self.scales.ctypes.data,
            x.ctypes.data,
            out.ctypes.data,
            self.out_dim,
            self.in_dim,
        )
        self.kernel.ternary_matvec_avx512(*call_args)
        return out
|
| 100 |
+
|
| 101 |
+
# ============================================================
|
| 102 |
+
# KV Cache
|
| 103 |
+
# ============================================================
|
| 104 |
+
class KVCache:
    """Preallocated per-layer key/value cache for autoregressive decoding.

    Usage convention (as exercised by the surrounding model code): call
    append(layer, k, v) for every layer at the current position, then
    advance() once per generated token -- get() therefore includes the
    freshly appended entry via seq_len + 1.
    """

    def __init__(self, n_layers, n_kv_heads, head_dim, max_seq=4096):
        self.n_layers = n_layers
        self.max_seq = max_seq
        buf_shape = (max_seq, n_kv_heads, head_dim)
        # One zero-initialized buffer pair per layer, allocated up front.
        self.k = [np.zeros(buf_shape, dtype=np.float32) for _ in range(n_layers)]
        self.v = [np.zeros(buf_shape, dtype=np.float32) for _ in range(n_layers)]
        self.seq_len = 0

    def append(self, layer, k, v):
        """Write k, v ([n_kv_heads, head_dim]) at the current cursor position."""
        cursor = self.seq_len
        self.k[layer][cursor] = k
        self.v[layer][cursor] = v

    def get(self, layer):
        """Return (k, v) views covering positions 0..seq_len inclusive."""
        upto = self.seq_len + 1
        return self.k[layer][:upto], self.v[layer][:upto]

    def advance(self):
        """Move the write cursor forward one token position."""
        self.seq_len += 1
|
| 125 |
+
|
| 126 |
+
# ============================================================
|
| 127 |
+
# Model
|
| 128 |
+
# ============================================================
|
| 129 |
+
class TernaryQwen:
    """Qwen2-family transformer with ternary-packed linear weights.

    Every linear layer is stored on disk as .pos/.neg uint64 bit-planes plus
    per-row .scales (see TernaryLinear); embeddings, norms and attention
    biases are FP16 files promoted to FP32 at load time. Decoding is strictly
    single-token (no batching); all heavy matvecs go through the ctypes
    `kernel`, while O(dim) ops (softmax, residuals) stay in numpy.
    """

    def __init__(self, model_dir, kernel):
        self.kernel = kernel
        self.model_dir = model_dir

        # config.json: HF-style architecture config; manifest.json: maps
        # tensor keys to shapes, split into "ternary" and "fp16" sections.
        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)
        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)

        # Architecture hyper-parameters (trailing comments: 1.5B defaults).
        self.hidden = self.config["hidden_size"]  # 1536
        self.inter = self.config["intermediate_size"]  # 8960
        self.n_heads = self.config["num_attention_heads"]  # 12
        self.n_kv = self.config["num_key_value_heads"]  # 2
        self.head_dim = self.config["head_dim"]  # 128
        self.n_layers = self.config["num_hidden_layers"]  # 28
        self.vocab = self.config["vocab_size"]  # 151936
        self.rope_theta = self.config["rope_theta"]
        self.eps = self.config["rms_norm_eps"]

        print(f"Loading ternary model: {self.n_layers} layers, "
              f"hidden={self.hidden}, heads={self.n_heads}/{self.n_kv}")

        t0 = time.time()
        self._load_weights()
        print(f"Model loaded in {time.time()-t0:.1f}s")

        self._compute_memory()

    def _load_ternary(self, key):
        """Load a ternary linear layer."""
        # Tensor files are named after the key with dots replaced by underscores.
        prefix = os.path.join(self.model_dir, key.replace(".", "_"))
        shape = self.manifest["ternary"][key]
        out_dim, in_dim = shape
        chunks = (in_dim + 63) // 64  # uint64 words per output row

        pos = np.fromfile(prefix + ".pos", dtype=np.uint64).reshape(out_dim, chunks)
        neg = np.fromfile(prefix + ".neg", dtype=np.uint64).reshape(out_dim, chunks)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)

        # Make contiguous — the C kernel indexes the raw buffers directly.
        pos = np.ascontiguousarray(pos)
        neg = np.ascontiguousarray(neg)

        return TernaryLinear(pos, neg, scales, out_dim, in_dim, self.kernel)

    def _load_fp16(self, key):
        """Load an FP16 tensor, promoted to FP32 for compute."""
        prefix = os.path.join(self.model_dir, key.replace(".", "_"))
        shape = self.manifest["fp16"][key]
        return np.fromfile(prefix + ".fp16", dtype=np.float16).reshape(shape).astype(np.float32)

    def _load_weights(self):
        """Load all weights."""
        # Embedding (FP16)
        self.embed = self._load_fp16("model.embed_tokens.weight")  # [vocab, hidden]

        # Final norm
        self.final_norm = self._load_fp16("model.norm.weight")  # [hidden]

        # LM head — check if it exists as ternary or fp16
        if "lm_head.weight" in self.manifest.get("ternary", {}):
            self.lm_head = self._load_ternary("lm_head.weight")
            self.lm_head_ternary = True
        elif "lm_head.weight" in self.manifest.get("fp16", {}):
            self.lm_head_w = self._load_fp16("lm_head.weight")
            self.lm_head_ternary = False
        else:
            # Tied embeddings: reuse the embedding matrix as the output head.
            self.lm_head_w = self.embed
            self.lm_head_ternary = False

        # Layers
        self.layers = []
        for i in range(self.n_layers):
            layer = {}
            prefix = f"model.layers.{i}"

            # Attention
            layer["q_proj"] = self._load_ternary(f"{prefix}.self_attn.q_proj.weight")
            layer["k_proj"] = self._load_ternary(f"{prefix}.self_attn.k_proj.weight")
            layer["v_proj"] = self._load_ternary(f"{prefix}.self_attn.v_proj.weight")
            layer["o_proj"] = self._load_ternary(f"{prefix}.self_attn.o_proj.weight")

            # MLP
            layer["gate_proj"] = self._load_ternary(f"{prefix}.mlp.gate_proj.weight")
            layer["up_proj"] = self._load_ternary(f"{prefix}.mlp.up_proj.weight")
            layer["down_proj"] = self._load_ternary(f"{prefix}.mlp.down_proj.weight")

            # Norms (FP16 -> FP32)
            layer["input_norm"] = self._load_fp16(f"{prefix}.input_layernorm.weight")
            layer["post_norm"] = self._load_fp16(f"{prefix}.post_attention_layernorm.weight")

            # Load biases if they exist (Qwen2 has Q/K/V biases; o_proj has none).
            for proj in ["q_proj", "k_proj", "v_proj"]:
                bias_key = f"{prefix}.self_attn.{proj}.bias"
                if bias_key in self.manifest.get("fp16", {}):
                    layer[f"{proj}_bias"] = self._load_fp16(bias_key)

            self.layers.append(layer)
            if (i + 1) % 7 == 0:
                print(f"  Loaded {i+1}/{self.n_layers} layers")

        print(f"  Loaded {self.n_layers}/{self.n_layers} layers")

    def _compute_memory(self):
        """Report memory usage."""
        ternary_bytes = 0
        fp_bytes = 0

        for layer in self.layers:
            for key in ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"]:
                tl = layer[key]
                ternary_bytes += tl.pos.nbytes + tl.neg.nbytes + tl.scales.nbytes
            for key in ["input_norm", "post_norm"]:
                fp_bytes += layer[key].nbytes

        fp_bytes += self.embed.nbytes + self.final_norm.nbytes
        # Tied embeddings alias self.embed, so this double-counts in that case;
        # accepted as a rough report.
        if not self.lm_head_ternary:
            fp_bytes += self.lm_head_w.nbytes if hasattr(self, 'lm_head_w') else 0

        total = ternary_bytes + fp_bytes
        print(f"\nMemory: ternary={ternary_bytes/1024/1024:.1f}MB, "
              f"fp={fp_bytes/1024/1024:.1f}MB, total={total/1024/1024:.1f}MB")

    def _rmsnorm(self, x, weight):
        """RMSNorm using C kernel."""
        y = np.zeros_like(x)
        self.kernel.rmsnorm_avx512(
            x.ctypes.data, weight.ctypes.data, y.ctypes.data,
            len(x), ctypes.c_float(self.eps)
        )
        return y

    def _attention(self, x, layer, cache, layer_idx, pos):
        """Grouped-Query Attention."""
        h = self.hidden
        n_h = self.n_heads
        n_kv = self.n_kv
        hd = self.head_dim

        # Project Q, K, V
        q = layer["q_proj"].forward(x)  # [n_heads * head_dim]
        k = layer["k_proj"].forward(x)  # [n_kv * head_dim]
        v = layer["v_proj"].forward(x)  # [n_kv * head_dim]

        # Add biases if present
        if "q_proj_bias" in layer:
            q += layer["q_proj_bias"]
        if "k_proj_bias" in layer:
            k += layer["k_proj_bias"]
        if "v_proj_bias" in layer:
            v += layer["v_proj_bias"]

        # Reshape
        q = q.reshape(n_h, hd)
        k = k.reshape(n_kv, hd)
        v = v.reshape(n_kv, hd)

        # RoPE — applied in place by the C kernel to both q and k.
        self.kernel.apply_rope(
            q.ctypes.data, k.ctypes.data,
            n_h, n_kv, hd, pos,
            ctypes.c_float(self.rope_theta)
        )

        # Update KV cache (writes into slot cache.seq_len)
        cache.append(layer_idx, k, v)

        # Get full K, V history
        k_all, v_all = cache.get(layer_idx)  # [seq_len, n_kv, head_dim]
        seq_len = k_all.shape[0]

        # GQA: repeat KV heads to match Q heads
        heads_per_kv = n_h // n_kv

        # Compute attention for each head
        output = np.zeros(n_h * hd, dtype=np.float32)
        scale = 1.0 / np.sqrt(hd)

        for head in range(n_h):
            kv_head = head // heads_per_kv
            q_h = q[head]  # [head_dim]

            # Attention scores: q @ K^T
            scores = np.dot(k_all[:, kv_head, :], q_h) * scale  # [seq_len]

            # Causal mask (all visible for single token generation)
            # Softmax (numerically stabilized by subtracting the max)
            scores_max = np.max(scores)
            scores = np.exp(scores - scores_max)
            scores /= np.sum(scores)

            # Weighted sum of values
            out_h = np.dot(scores, v_all[:, kv_head, :])  # [head_dim]
            output[head * hd:(head + 1) * hd] = out_h

        # Output projection
        return layer["o_proj"].forward(output)

    def _mlp(self, x, layer):
        """SwiGLU MLP."""
        gate = layer["gate_proj"].forward(x)
        up = layer["up_proj"].forward(x)

        # SiLU on gate (in place, via C kernel)
        self.kernel.silu_avx512(gate.ctypes.data, len(gate))

        # gate * up — result written back into `gate`
        self.kernel.elemwise_mul_avx512(
            gate.ctypes.data, up.ctypes.data, gate.ctypes.data, len(gate)
        )

        # Down projection
        return layer["down_proj"].forward(gate)

    def forward_token(self, token_id, cache, pos):
        """Forward pass for a single token.

        Returns the final-normed hidden state [hidden]; caller is expected
        to call cache.advance() between tokens.
        """
        # Embedding lookup (copy so residual adds don't mutate the table)
        x = self.embed[token_id].copy()  # [hidden]

        # Transformer layers: pre-norm residual blocks
        for i, layer in enumerate(self.layers):
            # Pre-attention norm
            normed = self._rmsnorm(x, layer["input_norm"])

            # Self-attention + residual
            attn_out = self._attention(normed, layer, cache, i, pos)
            x = x + attn_out

            # Pre-MLP norm
            normed = self._rmsnorm(x, layer["post_norm"])

            # MLP + residual
            mlp_out = self._mlp(normed, layer)
            x = x + mlp_out

        # Final norm
        x = self._rmsnorm(x, self.final_norm)

        return x

    def logits(self, hidden):
        """Compute logits from hidden state."""
        if self.lm_head_ternary:
            return self.lm_head.forward(hidden)
        else:
            return hidden @ self.lm_head_w.T

    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95):
        """Generate tokens autoregressively.

        token_ids: non-empty prompt token ids (empty prompts are unsupported —
        `hidden` would be unbound after prefill).
        Returns (generated_ids, stats_dict).
        """
        cache = KVCache(self.n_layers, self.n_kv, self.head_dim)

        generated = []
        all_tokens = list(token_ids)

        t_start = time.time()

        # Prefill: process all input tokens. advance() is skipped after the
        # last prompt token so the first decode step writes to the next slot.
        for i, tid in enumerate(token_ids):
            hidden = self.forward_token(tid, cache, i)
            if i < len(token_ids) - 1:
                cache.advance()

        t_prefill = time.time() - t_start

        # Decode
        t_decode_start = time.time()
        for step in range(max_new_tokens):
            # Get logits
            logit_vec = self.logits(hidden)

            # Sample
            if temperature < 0.01:
                # Effectively greedy decoding
                next_token = int(np.argmax(logit_vec))
            else:
                logit_vec = logit_vec / temperature
                # Top-p sampling: sort descending, keep the smallest prefix
                # whose cumulative probability reaches top_p.
                sorted_idx = np.argsort(logit_vec)[::-1]
                sorted_logits = logit_vec[sorted_idx]

                # Softmax
                max_l = sorted_logits[0]
                probs = np.exp(sorted_logits - max_l)
                probs /= probs.sum()

                cumsum = np.cumsum(probs)
                cutoff = np.searchsorted(cumsum, top_p) + 1

                top_probs = probs[:cutoff]
                top_probs /= top_probs.sum()  # renormalize the kept mass
                top_idx = sorted_idx[:cutoff]

                next_token = int(np.random.choice(top_idx, p=top_probs))

            generated.append(next_token)
            all_tokens.append(next_token)

            # Check stop tokens
            if next_token in [151643, 151644, 151645]:  # Qwen EOS tokens
                break

            cache.advance()
            hidden = self.forward_token(next_token, cache, len(all_tokens) - 1)

        t_total = time.time() - t_start
        t_decode = time.time() - t_decode_start
        n_gen = len(generated)

        stats = {
            "prefill_ms": t_prefill * 1000,
            "decode_ms": t_decode * 1000,
            "total_ms": t_total * 1000,
            "tokens_generated": n_gen,
            "tok_per_sec": n_gen / t_decode if t_decode > 0 else 0,
            "prefill_tokens": len(token_ids),
        }

        return generated, stats
+
# ============================================================
|
| 451 |
+
# Tokenizer wrapper
|
| 452 |
+
# ============================================================
|
| 453 |
+
class Tokenizer:
    """Thin wrapper that unifies the `tokenizers` and `transformers` APIs.

    Prefers a local tokenizer.json via the fast `tokenizers` package;
    otherwise falls back to `transformers.AutoTokenizer`.
    """

    def __init__(self, model_dir):
        tok_path = os.path.join(model_dir, "tokenizer.json")
        if os.path.exists(tok_path):
            # Fast path: raw HF tokenizers file. Import lazily so the
            # transformers fallback below works even when the `tokenizers`
            # package is not installed (the original imported it
            # unconditionally and crashed on that path).
            from tokenizers import Tokenizer as HFTokenizer
            self.tok = HFTokenizer.from_file(tok_path)
            self._is_transformers = False
        else:
            # Fallback: let transformers resolve the tokenizer files.
            from transformers import AutoTokenizer
            self.tok = AutoTokenizer.from_pretrained(model_dir)
            self._is_transformers = True

    def encode(self, text):
        """Return the list of token ids for `text`."""
        if self._is_transformers:
            return self.tok.encode(text)
        # `tokenizers` returns an Encoding object; unwrap the ids.
        return self.tok.encode(text).ids

    def decode(self, ids):
        """Return the decoded text for a list of token ids."""
        if self._is_transformers:
            return self.tok.decode(ids, skip_special_tokens=True)
        return self.tok.decode(ids)

    def apply_chat_template(self, messages):
        """Build Qwen chat format."""
        parts = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            parts.append(f"<|im_start|>{role}\n{content}<|im_end|>")
        parts.append("<|im_start|>assistant\n")
        return "".join(parts)
| 487 |
+
if __name__ == "__main__":
    import sys

    # Model directory from argv[1]; defaults to the bundled 1.5B checkpoint.
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-ternary"
    # load_kernel is defined earlier in this file; loads the compiled
    # AVX-512 kernel (.so) via ctypes from next to this script.
    kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

    model = TernaryQwen(model_dir, kernel)

    # Quick test: a pre-tokenized Qwen chat prompt (user message "Hello").
    test_ids = [151644, 8948, 198, 151645, 198, 151644, 872, 198, 9707, 151645, 198, 151644, 77091, 198]

    print("\nGenerating...")
    tokens, stats = model.generate(test_ids, max_new_tokens=50, temperature=0.6)
    print(f"Generated {stats['tokens_generated']} tokens")
    print(f"Speed: {stats['tok_per_sec']:.1f} tok/s")
    print(f"Prefill: {stats['prefill_ms']:.0f}ms, Decode: {stats['decode_ms']:.0f}ms")
    print(f"Token IDs: {tokens}")
|
log_unary_engine.c
ADDED
|
@@ -0,0 +1,598 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* LOG-UNARY TRANSFORMER ENGINE
|
| 3 |
+
*
|
| 4 |
+
* Unary base-1 with logarithmic compression:
|
| 5 |
+
* Linear unary: value 7 = 1111111 (7 planes, each = +1)
|
| 6 |
+
* Log unary: value 8 = 111 (3 planes, plane p = 2^p)
|
| 7 |
+
*
|
| 8 |
+
* Matmul kernel: acc += popcount(w_plane[p] AND x_plane[q]) << (p+q)
|
| 9 |
+
* Still pure AND+popcount+shift, no float in hot path.
|
| 10 |
+
*
|
| 11 |
+
* 3 log-planes = values {0,1,2,4} with sign = {-4..+4} = 9 levels
|
| 12 |
+
* 4 log-planes = values {0,1,2,4,8} with sign = {-8..+8} = 17 levels
|
| 13 |
+
* 5 log-planes = values {0,1,2,4,8,16} with sign = {-16..+16} = 33 levels
|
| 14 |
+
*
|
| 15 |
+
* vs linear 7 planes = {-7..+7} = 15 levels using 7 planes
|
| 16 |
+
*
|
| 17 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
#include <immintrin.h>
|
| 21 |
+
#include <omp.h>
|
| 22 |
+
#include <stdint.h>
|
| 23 |
+
#include <stdlib.h>
|
| 24 |
+
#include <string.h>
|
| 25 |
+
#include <math.h>
|
| 26 |
+
#include <stdio.h>
|
| 27 |
+
#include <time.h>
|
| 28 |
+
|
| 29 |
+
#define MAX_SEQ 4096
|
| 30 |
+
#define RMS_EPS 1e-6f
|
| 31 |
+
|
| 32 |
+
/* ============================================================
|
| 33 |
+
* Config
|
| 34 |
+
* ============================================================ */
|
| 35 |
+
typedef struct {
    int hidden;          /* model / embedding dimension */
    int inter;           /* MLP intermediate dimension */
    int n_heads;         /* query heads */
    int n_kv_heads;      /* key/value heads (GQA: n_heads / n_kv_heads queries share one KV head) */
    int head_dim;        /* per-head dimension */
    int n_layers;
    int vocab;
    float rope_theta;    /* RoPE base frequency */
    int tie_embeddings;  /* presumably: lm_head shares the embedding table — confirm in loader */
    int w_planes; /* weight log-planes */
    int a_planes; /* activation log-planes */
} Config;
|
| 48 |
+
|
| 49 |
+
/* Log-unary weight matrix */
|
| 50 |
+
/* Log-unary weight matrix */
typedef struct {
    uint64_t *sign_bits;  /* [out_dim * chunks] — set bit = negative weight */
    uint64_t *log_planes; /* [n_planes][out_dim * chunks] - plane p = 2^p */
    float *scales;        /* [out_dim] per-row dequantization scale */
    int out_dim;
    int in_dim;
    int n_planes;         /* number of magnitude bit-planes */
    int chunks;           /* ceil(in_dim / 64): uint64 words per row */
} LogUnaryWeight;
|
| 59 |
+
|
| 60 |
+
/* Transformer layer */
|
| 61 |
+
/* Transformer layer */
typedef struct {
    LogUnaryWeight q_proj, k_proj, v_proj, o_proj;  /* attention projections */
    LogUnaryWeight gate_proj, up_proj, down_proj;   /* SwiGLU MLP projections */
    float *input_norm;   /* pre-attention RMSNorm weight [hidden] */
    float *post_norm;    /* pre-MLP RMSNorm weight [hidden] */
    float *q_norm, *k_norm;  /* optional per-head QK-norm weights; NULL when absent */
} Layer;
|
| 68 |
+
|
| 69 |
+
/* Full model */
|
| 70 |
+
/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed;     /* FP16 embedding table [vocab * hidden] */
    Layer *layers;       /* [n_layers] */
    float *final_norm;   /* final RMSNorm weight [hidden] */

    /* KV cache — [n_layers][MAX_SEQ][n_kv_heads][head_dim], see kv_ptr() */
    float *k_cache;
    float *v_cache;

    /* Float scratch (O(dim) ops only) */
    float *hidden;       /* residual stream [hidden] */
    float *normed;       /* RMSNorm output [hidden] */
    float *q_float;      /* [n_heads * head_dim] */
    float *k_float;      /* [n_kv_heads * head_dim] */
    float *v_float;      /* [n_kv_heads * head_dim] */
    float *attn_out;     /* [n_heads * head_dim] */
    float *gate_float;
    float *up_float;
    float *mlp_act;
    float *logits;       /* [vocab] */
    float *attn_scores;  /* [MAX_SEQ] softmax workspace */

    /* Unary scratch for activation quantization (hidden-sized) */
    uint64_t *act_sign;
    uint64_t *act_planes;

    /* Larger scratch for intermediate dim */
    uint64_t *mlp_act_sign;
    uint64_t *mlp_act_planes;
} Model;
|
| 101 |
+
|
| 102 |
+
/* ============================================================
|
| 103 |
+
* LOG-UNARY ACTIVATION QUANTIZATION
|
| 104 |
+
*
|
| 105 |
+
* Encode float value as sign + log-magnitude planes
|
| 106 |
+
* Plane p is set if |x| >= threshold_p
|
| 107 |
+
* threshold_p = scale * 2^p / max_level
|
| 108 |
+
*
|
| 109 |
+
* Effectively: compute integer magnitude = round(|x|/scale * max_level)
|
| 110 |
+
* Then decompose into binary: if bit p is set in magnitude, plane p is set
|
| 111 |
+
*
|
| 112 |
+
* Wait — that's just BINARY encoding of the magnitude!
|
| 113 |
+
* Log-unary IS binary representation stored as separate bitplanes.
|
| 114 |
+
* The magic is that AND+popcount+shift MULTIPLIES them.
|
| 115 |
+
* ============================================================ */
|
| 116 |
+
/* Quantize a float vector into sign + binary magnitude bit-planes.
 *
 * scale = absmax / (2^n_planes - 1); each element's integer magnitude is
 * round(|x| / scale), and plane p stores bit p of that magnitude, so the
 * planes together are the binary representation spread across bitmaps.
 *
 * sign_out:   [chunks]              (bit set = negative element)
 * planes_out: [n_planes][chunks]
 * chunks = ceil(dim / 64); both outputs are fully overwritten.
 */
static void quantize_log_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    int chunks = (dim + 63) / 64;
    int max_level = (1 << n_planes) - 1; /* 2^n - 1 */

    /* Find absmax */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) amax = 1.0f;  /* avoid div-by-zero; all-zero input encodes as all-zero planes */
    *scale_out = amax / max_level;

    memset(sign_out, 0, chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

    float inv_scale = max_level / amax;
    for (int i = 0; i < dim; i++) {
        int chunk = i / 64;
        int bit = i % 64;
        uint64_t mask = 1ULL << bit;

        if (x[i] < 0.0f)
            sign_out[chunk] |= mask;

        /* round-to-nearest, then clamp to the representable maximum */
        int mag = (int)(fabsf(x[i]) * inv_scale + 0.5f);
        if (mag > max_level) mag = max_level;

        /* Binary decomposition: plane p gets bit p of magnitude */
        for (int p = 0; p < n_planes; p++) {
            if (mag & (1 << p))
                planes_out[(size_t)p * chunks + chunk] |= mask;
        }
    }
}
|
| 154 |
+
|
| 155 |
+
/* ============================================================
|
| 156 |
+
* LOG-UNARY MATVEC: y = W @ x
|
| 157 |
+
*
|
| 158 |
+
* W: log-unary (sign + wp log-planes, scales)
|
| 159 |
+
* x: log-unary (sign + xp log-planes, scale)
|
| 160 |
+
*
|
| 161 |
+
* For each output element i:
|
| 162 |
+
* acc = 0
|
| 163 |
+
* for each chunk c:
|
| 164 |
+
* same = ~(w_sign[c] ^ x_sign[c])
|
| 165 |
+
* diff = w_sign[c] ^ x_sign[c]
|
| 166 |
+
* for p in 0..wp-1:
|
| 167 |
+
* for q in 0..xp-1:
|
| 168 |
+
* active = w_plane[p][c] & x_plane[q][c]
|
| 169 |
+
* pos = popcount(active & same)
|
| 170 |
+
* neg = popcount(active & diff)
|
| 171 |
+
* acc += (pos - neg) << (p + q) <-- THE KEY: shift by p+q
|
| 172 |
+
* y[i] = acc * w_scale[i] * x_scale
|
| 173 |
+
* ============================================================ */
|
| 174 |
+
/* y = W @ x where both operands are sign + log-magnitude bit-planes.
 *
 * The cross term of weight plane p and activation plane q contributes
 * popcount(overlap) * 2^(p+q); sign agreement (XNOR of sign bits) decides
 * whether each lane adds or subtracts. The integer accumulator is rescaled
 * by the per-row weight scale and the shared activation scale at the end.
 * Parallelized over output rows with OpenMP.
 */
static void log_unary_matvec(
    const LogUnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            uint64_t same = ~(ws ^ xs);  /* equal signs -> positive product */
            uint64_t diff = ws ^ xs;     /* opposite signs -> negative product */

            for (int p = 0; p < wp; p++) {
                /* plane p of row i: layout is plane-major, then row, then chunk */
                uint64_t w_mag = W->log_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;
                    if (!active) continue; /* skip zero — common with log encoding */

                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    int shift = p + q;  /* 2^p * 2^q = 2^(p+q) */
                    acc += (long long)(__builtin_popcountll(pos) -
                                       __builtin_popcountll(neg)) << shift;
                }
            }
        }

        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
|
| 216 |
+
|
| 217 |
+
/* ============================================================
|
| 218 |
+
* FP16 ops (embedding, lm_head) — not in the critical per-layer path
|
| 219 |
+
* ============================================================ */
|
| 220 |
+
/* Copy one embedding row, widening FP16 -> FP32.
 * Main loop converts 16 halves at a time (AVX-512 VCVTPH2PS);
 * the tail converts one element at a time via the 128-bit form. */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    for (; i < hidden; i++) {
        /* broadcast one half, convert, store only the low lane */
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
|
| 234 |
+
|
| 235 |
+
/* Dense matvec with FP16 weights and FP32 activations: y = w @ x.
 * Used outside the per-layer hot path (e.g. embedding-tied lm_head).
 * Rows are distributed across threads with OpenMP; each row runs a
 * 16-wide FMA loop plus a scalar F16C tail. */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            /* widen 16 FP16 weights, FMA against the activations */
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        for (; j < in_dim; j++) {
            /* scalar tail: convert one half and accumulate */
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf; _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
|
| 256 |
+
|
| 257 |
+
/* ============================================================
|
| 258 |
+
* O(dim) float ops — RMSNorm, SiLU, Softmax, RoPE, residual
|
| 259 |
+
* ============================================================ */
|
| 260 |
+
static void rmsnorm(const float *x, const float *w, float *y, int dim) {
|
| 261 |
+
float ss = 0.0f;
|
| 262 |
+
for (int i = 0; i < dim; i++) ss += x[i] * x[i];
|
| 263 |
+
float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
|
| 264 |
+
for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
|
| 265 |
+
}
|
| 266 |
+
|
| 267 |
+
static void silu_mul(const float *gate, const float *up, float *out, int n) {
|
| 268 |
+
for (int i = 0; i < n; i++)
|
| 269 |
+
out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
/* Elementwise in-place accumulate: y += x. */
static void vec_add(float *y, const float *x, int n) {
    for (int k = 0; k < n; k++) {
        y[k] = y[k] + x[k];
    }
}
|
| 275 |
+
|
| 276 |
+
/* Rotary position embedding: rotate consecutive (even, odd) float pairs by
 * position-dependent angles, in place.
 * NOTE(review): this pairs adjacent elements (i, i+1) — the "interleaved"
 * RoPE layout; confirm the weight converter emits the matching layout
 * rather than the half-split (i, i+dim/2) one. */
static void apply_rope(float *vec, int pos, int dim, float theta) {
    for (int i = 0; i < dim; i += 2) {
        /* frequency decreases geometrically with the pair index */
        float freq = 1.0f / powf(theta, (float)i / dim);
        float angle = pos * freq;
        float co = cosf(angle), si = sinf(angle);
        float v0 = vec[i], v1 = vec[i+1];
        vec[i] = v0*co - v1*si;
        vec[i+1] = v0*si + v1*co;
    }
}
|
| 286 |
+
|
| 287 |
+
static void softmax(float *x, int n) {
|
| 288 |
+
float mx = x[0];
|
| 289 |
+
for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
|
| 290 |
+
float sum = 0.0f;
|
| 291 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
|
| 292 |
+
float inv = 1.0f / sum;
|
| 293 |
+
for (int i = 0; i < n; i++) x[i] *= inv;
|
| 294 |
+
}
|
| 295 |
+
|
| 296 |
+
/* Address of the cached K or V vector for (layer, pos, kv_head).
 * Cache layout: [n_layers][MAX_SEQ][n_kv_heads][head_dim], row-major. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
|
| 300 |
+
|
| 301 |
+
/* ============================================================
|
| 302 |
+
* ATTENTION
|
| 303 |
+
* ============================================================ */
|
| 304 |
+
/* ============================================================
 * ATTENTION
 * ============================================================ */
/* Single-token grouped-query attention for position `pos` of layer
 * `layer_idx`. Reads m->normed (pre-normalized hidden state) and leaves
 * the o_proj result in the first cfg.hidden floats of m->attn_out. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;
    float act_scale;

    /* Quantize normed hidden -> log-unary */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Q, K, V — log-unary matmul */
    log_unary_matvec(&L->q_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->q_float);
    log_unary_matvec(&L->k_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->k_float);
    log_unary_matvec(&L->v_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->v_float);

    /* Optional per-head QK-Norm (NULL when the checkpoint has none) */
    if (L->q_norm)
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q_float + h*c->head_dim, L->q_norm, m->q_float + h*c->head_dim, c->head_dim);
    if (L->k_norm)
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k_float + h*c->head_dim, L->k_norm, m->k_float + h*c->head_dim, c->head_dim);

    /* RoPE on Q and K */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);

    /* KV cache store */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k_float + h*c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v_float + h*c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention dot products + softmax + weighted sum */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv;  /* GQA: queries share KV heads */
        float *qh = m->q_float + h*c->head_dim;
        float *oh = m->attn_out + h*c->head_dim;

        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }
        softmax(m->attn_scores, pos + 1);
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;  /* skip negligible weights */
            float *vc = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection — quantize attn_out, then log-unary matmul.
     * C11 aligned_alloc requires size to be a multiple of the alignment;
     * round the byte counts up to 64 so this is portable (glibc happens to
     * accept unrounded sizes, other libcs may return NULL). */
    int o_dim = c->n_heads * c->head_dim;
    int o_chunks = (o_dim + 63) / 64;
    size_t sign_bytes  = (o_chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    size_t plane_bytes = ((size_t)c->a_planes * o_chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    uint64_t *o_sign   = (uint64_t *)aligned_alloc(64, sign_bytes);
    uint64_t *o_planes = (uint64_t *)aligned_alloc(64, plane_bytes);
    if (!o_sign || !o_planes) {
        fprintf(stderr, "attention: scratch allocation failed\n");
        free(o_sign); free(o_planes);  /* free(NULL) is a no-op */
        return;
    }
    float o_scale;
    quantize_log_unary(m->attn_out, o_dim, c->a_planes, o_sign, o_planes, &o_scale);

    float *o_tmp = m->normed; /* reuse: normed was fully consumed above */
    log_unary_matvec(&L->o_proj, o_sign, o_planes, o_scale, c->a_planes, o_tmp);
    memcpy(m->attn_out, o_tmp, c->hidden * sizeof(float));

    free(o_sign); free(o_planes);
}
|
| 380 |
+
|
| 381 |
+
/* ============================================================
|
| 382 |
+
* MLP
|
| 383 |
+
* ============================================================ */
|
| 384 |
+
/*
 * Gated MLP (SwiGLU) for one layer.
 *
 * Reads the pre-normed input from m->normed and writes the down-projected
 * result back into m->normed; the caller adds it to the residual stream.
 *
 * Fix vs. original: removed unused locals `hidden_chunks` and `inter_chunks`.
 */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    float act_scale, mlp_scale;

    /* Quantize normed input to log-unary */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Gate + Up projections — log-unary matmul */
    log_unary_matvec(&L->gate_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->gate_float);
    log_unary_matvec(&L->up_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->up_float);

    /* SiLU(gate) * up */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Requantize the activation for the down projection */
    quantize_log_unary(m->mlp_act, c->inter, c->a_planes,
                       m->mlp_act_sign, m->mlp_act_planes, &mlp_scale);

    /* Down projection — log-unary; result overwrites m->normed */
    log_unary_matvec(&L->down_proj, m->mlp_act_sign, m->mlp_act_planes, mlp_scale, c->a_planes, m->normed);
}
|
| 409 |
+
|
| 410 |
+
/* ============================================================
|
| 411 |
+
* FORWARD
|
| 412 |
+
* ============================================================ */
|
| 413 |
+
/*
 * Run one token through the full transformer stack at position `pos`.
 *
 * Pipeline per layer (pre-norm residual architecture):
 *   rmsnorm -> attention -> residual add -> rmsnorm -> mlp -> residual add
 *
 * Returns a pointer to m->logits (model-owned; valid until the next call).
 */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    /* Embedding lookup -> m->hidden */
    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Attention block: normed input goes to attention(), which leaves
         * its output in m->attn_out. */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);
        /* MLP block: mlp() reads and overwrites m->normed. */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    /* NOTE(review): logits are only computed when embeddings are tied —
     * with tie_embeddings == 0, m->logits keeps stale values from the
     * previous call.  Presumably only tied-embedding models are supported;
     * confirm against the loaders. */
    if (c->tie_embeddings)
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);

    return m->logits;
}
|
| 434 |
+
|
| 435 |
+
/* ============================================================
|
| 436 |
+
* SAMPLING
|
| 437 |
+
* ============================================================ */
|
| 438 |
+
/*
 * Nucleus (top-p) sampling over a logits vector.
 *
 * Applies temperature scaling, softmaxes `logits` IN PLACE (the caller's
 * buffer is clobbered), then partially selection-sorts tokens by
 * probability until the cumulative mass reaches `top_p` (capped at
 * TOP_K_CAP candidates), and draws one token from that truncated set.
 *
 * Returns the sampled token id.
 *
 * Fixes vs. original: the magic nucleus cap 40 is a named constant, and
 * allocation failure now falls back to greedy argmax instead of
 * dereferencing NULL.
 */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    enum { TOP_K_CAP = 40 };  /* hard cap on nucleus size */

    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    if (!probs || !indices) {
        /* OOM fallback: greedy argmax over the (already softmaxed) probs. */
        free(probs); free(indices);
        int best = 0;
        for (int i = 1; i < vocab; i++)
            if (logits[i] > logits[best]) best = i;
        return best;
    }
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: pull the highest-probability tokens to the
     * front until we cover top_p of the mass (or hit the cap). */
    int n = 0; float cum = 0.0f;
    while (cum < top_p && n < vocab) {
        int best = n;
        for (int i = n+1; i < vocab; i++) if (probs[i] > probs[best]) best = i;
        float t = probs[n]; probs[n] = probs[best]; probs[best] = t;
        int ti = indices[n]; indices[n] = indices[best]; indices[best] = ti;
        cum += probs[n]; n++;
        if (n >= TOP_K_CAP) break;
    }

    /* Draw r in [0, kept mass); renormalization is implicit. */
    float sum = 0; for (int i = 0; i < n; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float a = 0; int ch = indices[0];
    for (int i = 0; i < n; i++) { a += probs[i]; if (a >= r) { ch = indices[i]; break; } }
    free(probs); free(indices);
    return ch;
}
|
| 466 |
+
|
| 467 |
+
/*
 * Autoregressive generation: prefill the prompt, then sample up to
 * `max_new` tokens (greedy argmax when temperature <= 0, otherwise
 * nucleus sampling).  Tokens are written to `out`; generation stops
 * after emitting `eos` (which IS included in the output).
 *
 * Returns the number of tokens written.
 *
 * Fix vs. original: the RNG is seeded once per process instead of on
 * every call — `srand(time(NULL))` per call made two generate() calls
 * within the same second produce identical sample sequences.
 */
int generate(Model *m, const int *prompt, int plen, int *out, int max_new,
             float temperature, float top_p, int eos) {
    static int rng_seeded = 0;
    if (!rng_seeded) { srand((unsigned)time(NULL)); rng_seeded = 1; }

    /* Prefill: logits of the last prompt token remain in m->logits. */
    for (int i = 0; i < plen; i++) forward_token(m, prompt[i], i);

    int pos = plen, gen = 0;
    for (int t = 0; t < max_new; t++) {
        int next;
        if (temperature <= 0) {
            /* Greedy argmax */
            next = 0;
            for (int i = 1; i < m->cfg.vocab; i++)
                if (m->logits[i] > m->logits[next]) next = i;
        } else {
            next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
        }
        out[t] = next; gen++;
        if (next == eos) break;
        forward_token(m, next, pos); pos++;
    }
    return gen;
}
|
| 487 |
+
|
| 488 |
+
/* ============================================================
|
| 489 |
+
* ALLOCATION
|
| 490 |
+
* ============================================================ */
|
| 491 |
+
/*
 * Allocate a Model and all of its runtime scratch buffers.
 *
 * Weight buffers themselves are NOT allocated here; they are attached
 * later via model_set_embed / layer_set_linears / layer_set_norms.
 * Prints a configuration summary to stdout.
 *
 * NOTE(review): allocations are unchecked, and several aligned_alloc
 * sizes are not multiples of the 64-byte alignment (undefined in C11;
 * glibc tolerates it) — confirm on non-glibc targets.
 */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* Full KV cache for MAX_SEQ positions, both K and V. */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* m->normed is reused as scratch by both attention and mlp, so it is
     * sized for the larger of the two dims. */
    int max_dim = inter > hidden ? inter : hidden;
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->normed = (float *)aligned_alloc(64, max_dim * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));

    /* Unary scratch for hidden dim (sign bits + log bit-planes) */
    int h_chunks = (hidden + 63) / 64;
    m->act_sign = (uint64_t *)aligned_alloc(64, h_chunks * sizeof(uint64_t));
    m->act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * h_chunks * sizeof(uint64_t));

    /* Unary scratch for intermediate dim */
    int i_chunks = (inter + 63) / 64;
    m->mlp_act_sign = (uint64_t *)aligned_alloc(64, i_chunks * sizeof(uint64_t));
    m->mlp_act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * i_chunks * sizeof(uint64_t));

    /* Largest representable level with this many log planes. */
    int w_max = (1 << w_planes) - 1;
    int a_max = (1 << a_planes) - 1;

    printf("LOG-UNARY ENGINE\n");
    printf("  Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf("  Weight: %d log-planes -> %d levels (range -%d..+%d)\n",
           w_planes, 2*w_max+1, w_max, w_max);
    printf("  Activation: %d log-planes -> %d levels (range -%d..+%d)\n",
           a_planes, 2*a_max+1, a_max, a_max);
    printf("  Plane pairs per element: %d (vs %d linear)\n",
           w_planes * a_planes, 7 * 4);
    printf("  KV cache: %zu MB\n", kv_size * 2 * sizeof(float) / (1024*1024));

    return m;
}
|
| 552 |
+
|
| 553 |
+
/* Weight setters */
|
| 554 |
+
/* Borrow the fp16 embedding table (pointer stored, not copied; caller keeps ownership). */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
|
| 555 |
+
/* Copy the final RMSNorm weights into the model's own buffer (hidden floats). */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
|
| 556 |
+
|
| 557 |
+
/* Attach the per-layer norm weights for layer `l` (pointers borrowed, not copied). */
void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
    m->layers[l].input_norm = in_norm;
    m->layers[l].post_norm = post_norm;
}
|
| 561 |
+
|
| 562 |
+
/* Attach optional QK-norm weights for layer `l`; attention() skips QK-norm
 * when these stay NULL.  Pointers are borrowed, not copied. */
void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
    m->layers[l].q_norm = q_norm;
    m->layers[l].k_norm = k_norm;
}
|
| 566 |
+
|
| 567 |
+
/* Populate a LogUnaryWeight view over externally owned buffers (no copy). */
static void init_weight(LogUnaryWeight *w, uint64_t *sign, uint64_t *planes, float *scales,
                        int out_dim, int in_dim, int n_planes) {
    w->sign_bits = sign; w->log_planes = planes; w->scales = scales;
    w->out_dim = out_dim; w->in_dim = in_dim; w->n_planes = n_planes;
    w->chunks = (in_dim + 63) / 64;  /* 64 input elements per uint64 chunk */
}
|
| 573 |
+
|
| 574 |
+
/*
 * Attach all seven linear projections of layer `l` in one call.
 * Each triple (sign bits, log planes, per-row scales) plus out/in dims
 * describes one log-unary weight matrix; buffers are borrowed, not copied.
 */
void layer_set_linears(
    Model *m, int l,
    uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
    uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
    uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
    uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
    uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
    uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
    uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
    int n_planes
) {
    init_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
    init_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
    init_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
    init_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
    init_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
    init_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
    init_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
}
|
| 593 |
+
|
| 594 |
+
/* Zero the entire KV cache so a fresh sequence can start at position 0. */
void model_reset_cache(Model *m) {
    Config *c = &m->cfg;
    size_t n_floats = (size_t)c->n_layers * MAX_SEQ * c->n_kv_heads * c->head_dim;
    size_t n_bytes = n_floats * sizeof(float);
    memset(m->k_cache, 0, n_bytes);
    memset(m->v_cache, 0, n_bytes);
}
|
logunary_tensor.c
ADDED
|
@@ -0,0 +1,534 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#define _POSIX_C_SOURCE 199309L
|
| 2 |
+
/*
|
| 3 |
+
* LOG-UNARY TENSOR LIBRARY
|
| 4 |
+
*
|
| 5 |
+
* Native tensor type where values are represented as:
|
| 6 |
+
* sign (1 bit) + log-magnitude bitplanes
|
| 7 |
+
*
|
| 8 |
+
* Plane p is set if |value| >= 2^(p - bias)
|
| 9 |
+
* With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B)
|
| 10 |
+
*
|
| 11 |
+
* ALL arithmetic stays in this representation:
|
| 12 |
+
* - matmul: AND + weighted_popcount (shift by p+q-2*bias)
|
| 13 |
+
* - add: bitwise merge with carry propagation
|
| 14 |
+
* - scale: shift planes up/down
|
| 15 |
+
* - negate: flip sign bits
|
| 16 |
+
*
|
| 17 |
+
* Float conversion only at boundaries (embed lookup, final logits)
|
| 18 |
+
*
|
| 19 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 20 |
+
*/
|
| 21 |
+
|
| 22 |
+
#include <immintrin.h>
|
| 23 |
+
#include <omp.h>
|
| 24 |
+
#include <stdint.h>
|
| 25 |
+
#include <stdlib.h>
|
| 26 |
+
#include <string.h>
|
| 27 |
+
#include <math.h>
|
| 28 |
+
#include <stdio.h>
|
| 29 |
+
#include <time.h>
|
| 30 |
+
|
| 31 |
+
/* ============================================================
|
| 32 |
+
* LOG-UNARY TENSOR
|
| 33 |
+
*
|
| 34 |
+
* For a vector of length `dim`:
|
| 35 |
+
* sign: uint64[chunks] - 1 bit per element
|
| 36 |
+
* planes: uint64[n_planes][chunks] - 1 bit per element per plane
|
| 37 |
+
* chunks = (dim + 63) / 64
|
| 38 |
+
*
|
| 39 |
+
* Plane p is set if |value| >= threshold[p]
|
| 40 |
+
* threshold[p] = base_scale * 2^(p - bias)
|
| 41 |
+
*
|
| 42 |
+
* This is a LOG thermometer code:
|
| 43 |
+
* value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001)
|
| 44 |
+
* value=1.0 with bias=10 -> planes 0-10 set
|
| 45 |
+
* value=64.0 with bias=10 -> planes 0-16 set
|
| 46 |
+
*
|
| 47 |
+
* ============================================================ */
|
| 48 |
+
/* 1-D log-unary tensor: sign bit plus thermometer-coded log-magnitude planes. */
typedef struct {
    uint64_t *sign;    /* [chunks] - 1 bit per element, set means negative */
    uint64_t *planes;  /* [n_planes * chunks] contiguous, plane-major */
    int dim;           /* logical element count */
    int chunks;        /* (dim + 63) / 64 uint64 words per plane */
    int n_planes;
    int bias;          /* log2 offset: threshold[p] = base * 2^(p-bias) */
    float base_scale;  /* per-tensor scale factor */
} LogUnaryTensor;
|
| 57 |
+
|
| 58 |
+
/* 2D tensor (matrix) - row-major */
|
| 59 |
+
/* 2D tensor (matrix) - row-major, same encoding as LogUnaryTensor but
 * with one base scale per row. */
typedef struct {
    uint64_t *sign;    /* [rows * chunks_per_row] - set bit means negative */
    uint64_t *planes;  /* [n_planes * rows * chunks_per_row], plane-major */
    float *row_scales; /* [rows] per-row base scales */
    int rows;
    int cols;
    int chunks;        /* chunks per row = (cols+63)/64 */
    int n_planes;
    int bias;          /* log2 offset: threshold[p] = scale * 2^(p-bias) */
} LogUnaryMatrix;
|
| 69 |
+
|
| 70 |
+
/* ============================================================
|
| 71 |
+
* ALLOCATION
|
| 72 |
+
* ============================================================ */
|
| 73 |
+
LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) {
|
| 74 |
+
LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor));
|
| 75 |
+
t->dim = dim;
|
| 76 |
+
t->n_planes = n_planes;
|
| 77 |
+
t->bias = bias;
|
| 78 |
+
t->chunks = (dim + 63) / 64;
|
| 79 |
+
t->base_scale = 1.0f;
|
| 80 |
+
t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t));
|
| 81 |
+
t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t));
|
| 82 |
+
memset(t->sign, 0, t->chunks * sizeof(uint64_t));
|
| 83 |
+
memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t));
|
| 84 |
+
return t;
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) {
|
| 88 |
+
LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix));
|
| 89 |
+
m->rows = rows;
|
| 90 |
+
m->cols = cols;
|
| 91 |
+
m->n_planes = n_planes;
|
| 92 |
+
m->bias = bias;
|
| 93 |
+
m->chunks = (cols + 63) / 64;
|
| 94 |
+
m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 95 |
+
m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
|
| 96 |
+
m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float));
|
| 97 |
+
memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 98 |
+
memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
|
| 99 |
+
for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f;
|
| 100 |
+
return m;
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
/* Release a tensor allocated by lut_alloc; safe on NULL. */
void lut_free(LogUnaryTensor *t) {
    if (t == NULL) return;
    free(t->planes);
    free(t->sign);
    free(t);
}
|
| 106 |
+
/* Release a matrix allocated by lum_alloc; safe on NULL. */
void lum_free(LogUnaryMatrix *m) {
    if (m == NULL) return;
    free(m->row_scales);
    free(m->planes);
    free(m->sign);
    free(m);
}
|
| 109 |
+
|
| 110 |
+
/* ============================================================
|
| 111 |
+
* FLOAT <-> LOG-UNARY CONVERSION
|
| 112 |
+
* Only used at boundaries (embedding, final output)
|
| 113 |
+
* ============================================================ */
|
| 114 |
+
/*
 * Quantize a float vector into log-unary form (in place into `t`).
 *
 * Picks base_scale from the vector's absmax so the largest magnitude lands
 * on the highest plane, then for each element sets the sign bit (negative
 * values) and a thermometer code of planes: plane p is set iff
 * |x[i]| >= base_scale * 2^(p - bias).  An all-zero input leaves every
 * plane clear with base_scale = 1.
 */
void lut_from_float(LogUnaryTensor *t, const float *x) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(t->sign, 0, chunks * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t));

    /* Find absmax for base_scale */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    /* Zero vector: nothing to encode. */
    if (amax == 0.0f) { t->base_scale = 1.0f; return; }

    /* Set base_scale so that max value uses the highest plane */
    /* threshold[np-1] = base_scale * 2^(np-1-bias) should equal amax */
    t->base_scale = amax / ldexpf(1.0f, np - 1 - bias);

    for (int i = 0; i < dim; i++) {
        int c = i / 64;                       /* chunk index */
        uint64_t bit = 1ULL << (i % 64);      /* bit within the chunk */

        if (x[i] < 0.0f) t->sign[c] |= bit;

        float mag = fabsf(x[i]);
        /* Set planes from low to high: plane p set if mag >= base * 2^(p-bias) */
        for (int p = 0; p < np; p++) {
            float thresh = t->base_scale * ldexpf(1.0f, p - bias);
            if (mag >= thresh)
                t->planes[(size_t)p * chunks + c] |= bit;
            else
                break; /* thermometer: once we stop, all higher planes are 0 */
        }
    }
}
|
| 152 |
+
|
| 153 |
+
/*
 * Dequantize a log-unary tensor back to floats.
 *
 * For each element, finds the highest set plane `p` and reconstructs the
 * magnitude as base_scale * 2^(p - bias); when a next plane exists, the
 * midpoint between the two thresholds is used to reduce truncation error.
 * Elements with no planes set decode to 0.
 */
void lut_to_float(const LogUnaryTensor *t, float *out) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(out, 0, dim * sizeof(float));

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Find highest set plane (thermometer code -> top of the column) */
        int highest = -1;
        for (int p = np - 1; p >= 0; p--) {
            if (t->planes[(size_t)p * chunks + c] & bit) {
                highest = p;
                break;
            }
        }

        if (highest < 0) {
            out[i] = 0.0f;
        } else {
            /* Value is approximately base * 2^(highest - bias) */
            /* More precise: midpoint between this threshold and next */
            float val = t->base_scale * ldexpf(1.0f, highest - bias);
            if (highest < np - 1) {
                float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias);
                val = (val + next) * 0.5f; /* midpoint reconstruction */
            }
            out[i] = (t->sign[c] & bit) ? -val : val;
        }
    }
}
|
| 188 |
+
|
| 189 |
+
/* Convert float matrix to log-unary matrix (per-row scaling) */
|
| 190 |
+
/*
 * Quantize a row-major float matrix into log-unary form, with an
 * independent base scale per row (chosen from that row's absmax, same
 * rule as lut_from_float).  All-zero rows get scale 1 and stay empty.
 *
 * Plane storage is plane-major: planes[((p * rows) + r) * chunks + c],
 * matching the access pattern in lum_matvec.
 */
void lum_from_float(LogUnaryMatrix *m, const float *data) {
    int rows = m->rows, cols = m->cols;
    int np = m->n_planes, bias = m->bias;
    int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;

        /* Per-row absmax */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; }
        /* Largest magnitude in the row maps onto the highest plane. */
        m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias);

        uint64_t *row_sign = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);

            if (row[j] < 0.0f) row_sign[c] |= bit;

            float mag = fabsf(row[j]);
            /* Thermometer code: stop at the first unmet threshold. */
            for (int p = 0; p < np; p++) {
                float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias);
                if (mag >= thresh)
                    m->planes[((size_t)p * rows + r) * chunks + c] |= bit;
                else
                    break;
            }
        }
    }
}
|
| 229 |
+
|
| 230 |
+
/* ============================================================
|
| 231 |
+
* LOG-UNARY MATMUL: y = M @ x
|
| 232 |
+
*
|
| 233 |
+
* Both M (matrix) and x (vector) are log-unary encoded.
|
| 234 |
+
*
|
| 235 |
+
* For each output element y[i]:
|
| 236 |
+
* For each weight plane p, activation plane q:
|
| 237 |
+
* active = M.planes[p][i] AND x.planes[q]
|
| 238 |
+
* same = active AND ~(M.sign[i] XOR x.sign)
|
| 239 |
+
* diff = active AND (M.sign[i] XOR x.sign)
|
| 240 |
+
* contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias)
|
| 241 |
+
*
|
| 242 |
+
* Output is a LogUnaryTensor (converted from integer accumulator)
|
| 243 |
+
* ============================================================ */
|
| 244 |
+
/*
 * Log-unary matrix-vector product: y_out = M @ x.
 *
 * Both operands stay in the bit-plane encoding.  For every output row,
 * each (weight plane p, activation plane q) pair contributes
 * (popcount(same-sign matches) - popcount(opposite-sign matches)) << (p+q)
 * to an int64 fixed-point accumulator; the true magnitude is recovered at
 * the end by multiplying with row_scale * x_scale * 2^(-(w_bias+x_bias)).
 * The float result is then requantized into y_out via lut_from_float.
 *
 * Rows are parallelized with OpenMP; each iteration writes only y_float[i],
 * so there is no shared mutable state inside the loop.
 */
void lum_matvec(
    const LogUnaryMatrix *M,
    const LogUnaryTensor *x,
    LogUnaryTensor *y_out /* output: log-unary encoded result */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wp = M->n_planes;
    int xp = x->n_planes;
    int w_bias = M->bias;
    int x_bias = x->bias;

    /* Accumulate to float temporarily, then requantize to log-unary.
     * The accumulator is integer shifts (2^(p+q-2bias)), which
     * we can do as int64 left-shifts for small exponents.
     *
     * For the exponent range we're in (p+q in [0,14] with bias ~4),
     * net shift is [-8, 6], so we use a fixed-point int64 accumulator
     * with a base shift to keep everything positive.
     */
    int base_shift = w_bias + x_bias; /* shift to add to make all exponents >= 0 */

    /* We'll accumulate as int64 with implicit 2^(-base_shift) factor */
    /* Then convert: float_val = acc * row_scale * x_scale * 2^(-base_shift) */

    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            /* Per-bit sign agreement masks, hoisted out of the plane loops. */
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_plane = x->planes[(size_t)q * chunks + c];
                    uint64_t active = w_plane & x_plane;
                    uint64_t pos = active & same;  /* same-sign products: +1 each */
                    uint64_t neg = active & diff;  /* opposite-sign products: -1 each */

                    int count = __builtin_popcountll(pos) - __builtin_popcountll(neg);

                    /* Weighted by 2^(p + q) relative to base */
                    int shift = p + q; /* relative to 2^(-base_shift) */
                    if (count != 0)
                        acc += (long long)count << shift;
                }
            }
        }

        /* Convert: val = acc * row_scale * x_scale * 2^(-base_shift) */
        y_float[i] = (float)acc * M->row_scales[i] * x->base_scale
                     * ldexpf(1.0f, -base_shift);
    }

    /* Requantize float result to log-unary */
    lut_from_float(y_out, y_float);
    free(y_float);
}
|
| 310 |
+
|
| 311 |
+
/* ============================================================
|
| 312 |
+
* LOG-UNARY ELEMENT-WISE ADD: z = a + b
|
| 313 |
+
*
|
| 314 |
+
* Dequant both, add as float, requant.
|
| 315 |
+
* This is O(dim) so not the bottleneck.
|
| 316 |
+
* Future: direct bitwise add with carry chains.
|
| 317 |
+
* ============================================================ */
|
| 318 |
+
/* Element-wise sum of two log-unary tensors: out = a + b.
 * Round-trips through float (dequant, add, requant): O(dim), well away
 * from the matmul bottleneck.  Dims of a, b, out must match. */
void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) {
    const int n = a->dim;
    float *buf_a = (float *)aligned_alloc(64, n * sizeof(float));
    float *buf_b = (float *)aligned_alloc(64, n * sizeof(float));

    lut_to_float(a, buf_a);
    lut_to_float(b, buf_b);

    for (int k = 0; k < n; k++) buf_a[k] += buf_b[k];

    lut_from_float(out, buf_a);
    free(buf_b);
    free(buf_a);
}
|
| 331 |
+
|
| 332 |
+
/* In-place add: a += b (dequant a, add float b, requant) */
|
| 333 |
+
/* In-place add of a float vector: a += b (dequant a, accumulate b, requant). */
void lut_add_float(LogUnaryTensor *a, const float *b) {
    const int n = a->dim;
    float *scratch = (float *)aligned_alloc(64, n * sizeof(float));

    lut_to_float(a, scratch);
    for (int k = 0; k < n; k++) scratch[k] += b[k];
    lut_from_float(a, scratch);

    free(scratch);
}
|
| 341 |
+
|
| 342 |
+
/* ============================================================
|
| 343 |
+
* LOG-UNARY RMSNORM
|
| 344 |
+
*
|
| 345 |
+
* Needs float for the sqrt/reciprocal, but O(dim).
|
| 346 |
+
* Input: log-unary, Output: log-unary
|
| 347 |
+
* ============================================================ */
|
| 348 |
+
void lut_rmsnorm(
|
| 349 |
+
const LogUnaryTensor *x,
|
| 350 |
+
const float *weight, /* norm weights stay float (tiny) */
|
| 351 |
+
LogUnaryTensor *out,
|
| 352 |
+
float eps
|
| 353 |
+
) {
|
| 354 |
+
int dim = x->dim;
|
| 355 |
+
float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
|
| 356 |
+
lut_to_float(x, xf);
|
| 357 |
+
|
| 358 |
+
float ss = 0.0f;
|
| 359 |
+
for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
|
| 360 |
+
float rms = 1.0f / sqrtf(ss / dim + eps);
|
| 361 |
+
|
| 362 |
+
for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i];
|
| 363 |
+
|
| 364 |
+
lut_from_float(out, xf);
|
| 365 |
+
free(xf);
|
| 366 |
+
}
|
| 367 |
+
|
| 368 |
+
/* ============================================================
|
| 369 |
+
* LOG-UNARY SILU_MUL: out = SiLU(gate) * up
|
| 370 |
+
*
|
| 371 |
+
* O(dim), not bottleneck. Dequant, compute, requant.
|
| 372 |
+
* ============================================================ */
|
| 373 |
+
void lut_silu_mul(
|
| 374 |
+
const LogUnaryTensor *gate,
|
| 375 |
+
const LogUnaryTensor *up,
|
| 376 |
+
LogUnaryTensor *out
|
| 377 |
+
) {
|
| 378 |
+
int dim = gate->dim;
|
| 379 |
+
float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
|
| 380 |
+
float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
|
| 381 |
+
|
| 382 |
+
lut_to_float(gate, gf);
|
| 383 |
+
lut_to_float(up, uf);
|
| 384 |
+
|
| 385 |
+
for (int i = 0; i < dim; i++)
|
| 386 |
+
gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
|
| 387 |
+
|
| 388 |
+
lut_from_float(out, gf);
|
| 389 |
+
free(gf); free(uf);
|
| 390 |
+
}
|
| 391 |
+
|
| 392 |
+
/* ============================================================
|
| 393 |
+
* LOG-UNARY ROPE
|
| 394 |
+
*
|
| 395 |
+
* O(dim), dequant-compute-requant per head.
|
| 396 |
+
* ============================================================ */
|
| 397 |
+
void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) {
|
| 398 |
+
/* Dequant the relevant slice, apply RoPE, requant */
|
| 399 |
+
float *f = (float *)aligned_alloc(64, head_dim * sizeof(float));
|
| 400 |
+
|
| 401 |
+
/* Extract slice */
|
| 402 |
+
float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
|
| 403 |
+
lut_to_float(t, full);
|
| 404 |
+
memcpy(f, full + start, head_dim * sizeof(float));
|
| 405 |
+
|
| 406 |
+
for (int i = 0; i < head_dim; i += 2) {
|
| 407 |
+
float freq = 1.0f / powf(theta, (float)i / head_dim);
|
| 408 |
+
float angle = offset * freq;
|
| 409 |
+
float c = cosf(angle), s = sinf(angle);
|
| 410 |
+
float v0 = f[i], v1 = f[i + 1];
|
| 411 |
+
f[i] = v0 * c - v1 * s;
|
| 412 |
+
f[i + 1] = v0 * s + v1 * c;
|
| 413 |
+
}
|
| 414 |
+
|
| 415 |
+
memcpy(full + start, f, head_dim * sizeof(float));
|
| 416 |
+
lut_from_float(t, full);
|
| 417 |
+
free(f); free(full);
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
/* ============================================================
|
| 421 |
+
* UTILITY: Get float slice from log-unary tensor
|
| 422 |
+
* (for attention scores which need float softmax)
|
| 423 |
+
* ============================================================ */
|
| 424 |
+
void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) {
|
| 425 |
+
float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
|
| 426 |
+
lut_to_float(t, full);
|
| 427 |
+
memcpy(out, full + start, len * sizeof(float));
|
| 428 |
+
free(full);
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
/* ============================================================
 * BENCHMARK: measure matvec throughput
 * ============================================================ */
typedef struct {
    double total_and_ops;      /* AND op count per matvec call (see lum_bench_matvec) */
    double total_popcount_ops; /* popcount op count per call (same count as ANDs) */
    double wall_time_s;        /* mean wall-clock time per call, seconds */
    double elements_per_sec;   /* rows*cols matrix elements processed per second */
    double gops; /* giga-operations per second */
} BenchResult;
|
| 441 |
+
|
| 442 |
+
/* Benchmark lum_matvec on random data.
 * Builds a rows x cols matrix with w_planes bitplanes and a cols-length
 * vector with x_planes bitplanes, runs one warmup call, then times `iters`
 * calls with CLOCK_MONOTONIC.
 * NOTE(review): ops_per_call assumes the kernel performs one AND + one
 * popcount per (row, chunk, w_plane, x_plane) pair — confirm against the
 * lum_matvec implementation. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* Fill with random bits. Two rand() calls per 64-bit word; with a
     * 31-bit RAND_MAX the top bit of each half is never set, which is
     * acceptable for throughput measurement. */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warmup (touch memory, warm caches) before timing. */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; /* AND + popcount pairs */

    BenchResult r;
    r.wall_time_s = dt / iters;
    r.total_and_ops = ops_per_call;
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}
|
| 482 |
+
|
| 483 |
+
/* ============================================================
 * ACCURACY TEST: convert float->logunary->float roundtrip
 * ============================================================ */
typedef struct {
    float max_error;   /* max |original - recovered| over the vector */
    float mean_error;  /* mean absolute error */
    float cosine_sim;  /* cosine similarity between original and recovered */
    float snr_db;      /* 10*log10(signal power / noise power) */
} AccuracyResult;
|
| 492 |
+
|
| 493 |
+
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) {
|
| 494 |
+
float *original = (float *)aligned_alloc(64, dim * sizeof(float));
|
| 495 |
+
float *recovered = (float *)aligned_alloc(64, dim * sizeof(float));
|
| 496 |
+
|
| 497 |
+
/* Random normal-ish distribution */
|
| 498 |
+
for (int i = 0; i < dim; i++) {
|
| 499 |
+
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 500 |
+
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 501 |
+
original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
LogUnaryTensor *t = lut_alloc(dim, n_planes, bias);
|
| 505 |
+
lut_from_float(t, original);
|
| 506 |
+
lut_to_float(t, recovered);
|
| 507 |
+
|
| 508 |
+
float max_err = 0, sum_err = 0;
|
| 509 |
+
float dot = 0, na = 0, nb = 0;
|
| 510 |
+
for (int i = 0; i < dim; i++) {
|
| 511 |
+
float err = fabsf(original[i] - recovered[i]);
|
| 512 |
+
if (err > max_err) max_err = err;
|
| 513 |
+
sum_err += err;
|
| 514 |
+
dot += original[i] * recovered[i];
|
| 515 |
+
na += original[i] * original[i];
|
| 516 |
+
nb += recovered[i] * recovered[i];
|
| 517 |
+
}
|
| 518 |
+
|
| 519 |
+
float noise_power = 0;
|
| 520 |
+
for (int i = 0; i < dim; i++) {
|
| 521 |
+
float e = original[i] - recovered[i];
|
| 522 |
+
noise_power += e * e;
|
| 523 |
+
}
|
| 524 |
+
|
| 525 |
+
AccuracyResult r;
|
| 526 |
+
r.max_error = max_err;
|
| 527 |
+
r.mean_error = sum_err / dim;
|
| 528 |
+
r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
|
| 529 |
+
r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f));
|
| 530 |
+
|
| 531 |
+
lut_free(t);
|
| 532 |
+
free(original); free(recovered);
|
| 533 |
+
return r;
|
| 534 |
+
}
|
packed_convert.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Packed unary converter: uint8 magnitudes + bitpacked signs + per-row scales."""
|
| 3 |
+
import os, json, sys, time
|
| 4 |
+
import numpy as np
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
def load_safetensors(model_dir):
    """Load every *.safetensors shard under model_dir into a dict of float32 numpy arrays."""
    from safetensors.torch import load_file

    out = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f" Loading {shard.name}...")
        shard_tensors = load_file(str(shard))
        for name, tensor in shard_tensors.items():
            out[name] = tensor.float().numpy()
    return out
|
| 15 |
+
|
| 16 |
+
def quantize_packed(w, n_levels=7):
|
| 17 |
+
out_dim, in_dim = w.shape
|
| 18 |
+
chunks = (in_dim + 63) // 64
|
| 19 |
+
padded = chunks * 64
|
| 20 |
+
row_max = np.max(np.abs(w), axis=1, keepdims=True)
|
| 21 |
+
row_max = np.where(row_max == 0, 1.0, row_max)
|
| 22 |
+
scales = (row_max.flatten() / n_levels).astype(np.float32)
|
| 23 |
+
mags = np.clip(np.round(np.abs(w / scales[:, None])), 0, n_levels).astype(np.uint8)
|
| 24 |
+
signs = (w < 0)
|
| 25 |
+
rmm = np.max(mags, axis=1).astype(np.uint8)
|
| 26 |
+
if in_dim < padded:
|
| 27 |
+
sp = np.zeros((out_dim, padded), dtype=bool)
|
| 28 |
+
sp[:, :in_dim] = signs
|
| 29 |
+
else:
|
| 30 |
+
sp = signs
|
| 31 |
+
bit_pos = np.uint64(1) << np.arange(64, dtype=np.uint64)
|
| 32 |
+
sign_bits = np.bitwise_or.reduce(sp.reshape(out_dim, chunks, 64).astype(np.uint64) * bit_pos, axis=2)
|
| 33 |
+
return mags, sign_bits, scales, rmm, np.mean(mags), np.mean(mags == 0)
|
| 34 |
+
|
| 35 |
+
def convert(tensors, output_dir, n_levels=7):
    """Quantize all linear-projection weights to packed-unary files and dump
    everything else as fp16, writing config.json and manifest.json alongside.

    NOTE(review): the config dict below hard-codes the dimensions of
    DeepSeek-R1-Distill-Qwen-1.5B — this converter silently produces a wrong
    config for any other checkpoint; verify before reuse.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = {"hidden_size":1536,"intermediate_size":8960,"num_attention_heads":12,
              "num_key_value_heads":2,"num_hidden_layers":28,"vocab_size":151936,
              "head_dim":128,"rope_theta":1000000.0,"rms_norm_eps":1e-6,
              "n_levels":n_levels,"quant_type":"packed_unary"}
    # Only attention/MLP projection matrices get quantized; embeddings,
    # norms, biases, and lm_head stay fp16.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight','k_proj.weight','v_proj.weight','o_proj.weight',
         'gate_proj.weight','up_proj.weight','down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)
    total_packed = total_orig = 0
    all_avg = []
    for key in linear_keys:
        w = tensors[key]; total_orig += w.nbytes
        t0 = time.time()
        mags, sb, sc, rmm, am, sp = quantize_packed(w, n_levels)
        dt = time.time() - t0
        # One file per array, named after the tensor with dots replaced.
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        mags.tofile(pfx+".mags"); sb.tofile(pfx+".signs")
        sc.tofile(pfx+".scales"); rmm.tofile(pfx+".rmm")
        ub = mags.nbytes + sb.nbytes + sc.nbytes + rmm.nbytes
        total_packed += ub; all_avg.append(am)
        print(f" {key}: {w.shape} -> {ub/1024:.0f}KB (avg_mag={am:.2f}, {dt:.1f}s)")
    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(pfx+".fp16"); total_fp16 += w.nbytes
    # Manifest records original shapes so the loader can reconstruct arrays.
    manifest = {"packed":{k:list(tensors[k].shape) for k in linear_keys},
                "fp16":{k:list(tensors[k].shape) for k in other_keys}}
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)
    print(f"\n=== PACKED UNARY ===")
    print(f"Packed linear: {total_packed/1e6:.1f} MB | FP16 other: {total_fp16/1e6:.1f} MB")
    print(f"Total: {(total_packed+total_fp16)/1e6:.1f} MB | Avg mag: {np.mean(all_avg):.3f}")
    # Speedup estimate: kernel passes scale with mean magnitude vs a fixed 7.
    print(f"Expected speedup vs 7-plane: {7/np.mean(all_avg):.1f}x")
|
| 73 |
+
|
| 74 |
+
if __name__ == "__main__":
    # CLI: packed_convert.py [input_hf_dir] [output_dir]
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "deepseek-r1-1.5b-packed"
    tensors = load_safetensors(model_dir)
    convert(tensors, output_dir)
    print("Done!")
|
packed_engine.c
ADDED
|
@@ -0,0 +1,408 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PACKED UNARY TRANSFORMER ENGINE - AVX-512 + OpenMP
|
| 3 |
+
*
|
| 4 |
+
* Instead of 7 fixed bitplanes (scanning 80% zeros),
|
| 5 |
+
* store magnitude per weight directly. Kernel processes
|
| 6 |
+
* groups of 16 weights, only loops to local max magnitude.
|
| 7 |
+
*
|
| 8 |
+
* Weight j with magnitude 3: adds x[j] THREE times (pure unary).
|
| 9 |
+
* But only 3 passes for that group, not 7.
|
| 10 |
+
*
|
| 11 |
+
* Average magnitude = 1.374, so average ~1.4 passes per group
|
| 12 |
+
* instead of always 7. That's the 5x speedup.
|
| 13 |
+
*
|
| 14 |
+
* Format per output row:
|
| 15 |
+
* mags[in_dim] uint8 - magnitude 0-7 per weight
|
| 16 |
+
* signs[chunks] uint64 - bitpacked sign (1=negative)
|
| 17 |
+
* scale float - per-row scale
|
| 18 |
+
*
|
| 19 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 20 |
+
*/
|
| 21 |
+
|
| 22 |
+
#include <immintrin.h>
|
| 23 |
+
#include <stdint.h>
|
| 24 |
+
#include <stdlib.h>
|
| 25 |
+
#include <string.h>
|
| 26 |
+
#include <math.h>
|
| 27 |
+
#include <stdio.h>
|
| 28 |
+
#include <time.h>
|
| 29 |
+
#include <omp.h>
|
| 30 |
+
|
| 31 |
+
/* Architecture constants for DeepSeek-R1-Distill-Qwen-1.5B; these must
 * match the converter's config.json. */
#define HIDDEN 1536            /* hidden (model) dimension */
#define INTER 8960             /* MLP intermediate dimension */
#define N_HEADS 12             /* query attention heads */
#define N_KV_HEADS 2           /* key/value heads (grouped-query attention) */
#define HEAD_DIM 128           /* per-head dimension */
#define N_LAYERS 28
#define VOCAB 151936
#define RMS_EPS 1e-6f          /* RMSNorm epsilon */
#define ROPE_THETA 1000000.0f  /* RoPE base frequency */
#define MAX_SEQ 4096           /* KV-cache capacity in tokens */
#define GQA_RATIO (N_HEADS / N_KV_HEADS)  /* query heads sharing one KV head */
|
| 42 |
+
|
| 43 |
+
/* Packed-unary linear layer. All pointers are borrowed from the loader
 * (see set_pl); the engine never frees them. */
typedef struct {
    uint8_t *mags; /* [out_dim * in_dim] magnitude per weight (0..n_levels) */
    uint64_t *sign_bits; /* [out_dim * chunks] bitpacked signs; bit j of chunk c covers weight c*64+j, 1 = negative */
    float *scales; /* [out_dim] per-row dequant scale */
    float *bias; /* [out_dim] or NULL */
    int out_dim, in_dim;
    uint8_t *row_maxmag; /* [out_dim] max magnitude per row for early exit */
} PL; /* Packed Linear */
|
| 51 |
+
|
| 52 |
+
/* FP16 linear layer: row-major half-precision weights, od rows x id cols
 * (used for the lm_head; see fmv). */
typedef struct { uint16_t *w; int od, id; } FL;
|
| 53 |
+
|
| 54 |
+
/* One transformer layer's weights. */
typedef struct {
    PL qp, kp, vp, op, gp, up, dp;  /* q/k/v/o attention + gate/up/down MLP projections */
    float *in_norm, *pn_norm;       /* input-norm and post-attention-norm weight vectors */
    float *qb, *kb, *vb;            /* q/k/v projection biases, or NULL when absent */
} Lay;
|
| 59 |
+
|
| 60 |
+
/* Whole model: weights, KV cache, and pre-allocated scratch buffers
 * (single-token inference, so one set of scratch suffices). */
typedef struct {
    uint16_t *emb;            /* fp16 token embeddings, [VOCAB * HIDDEN] (borrowed) */
    Lay lay[N_LAYERS];
    float *fnorm;             /* final RMSNorm weights, [HIDDEN] (owned copy) */
    FL lmh;                   /* lm_head fp16 linear */
    float *kc, *vc;           /* KV cache, [N_LAYERS][MAX_SEQ][N_KV_HEADS][HEAD_DIM] */
    float *h, *h2;            /* residual stream / normed activation, [HIDDEN] */
    float *sq, *sk, *sv, *ao; /* attention scratch: q, k, v, per-head output */
    float *sg, *su, *sd;      /* MLP scratch: gate, up, gated product, [INTER] */
    float *lg, *as;           /* logits [VOCAB]; attention scores [MAX_SEQ] */
} M;
|
| 71 |
+
|
| 72 |
+
/* ============================================================
|
| 73 |
+
* PACKED UNARY MATVEC
|
| 74 |
+
*
|
| 75 |
+
* Process 16 weights at a time. For each group:
|
| 76 |
+
* 1. Load 16 magnitudes (uint8)
|
| 77 |
+
* 2. Find local max magnitude
|
| 78 |
+
* 3. For m = 1 to local_max:
|
| 79 |
+
* mask = (mag >= m)
|
| 80 |
+
* pos_mask = mask & ~sign
|
| 81 |
+
* neg_mask = mask & sign
|
| 82 |
+
* acc += masked x (pos)
|
| 83 |
+
* acc -= masked x (neg)
|
| 84 |
+
*
|
| 85 |
+
* Each pass = one unary "mark". Pure base-1.
|
| 86 |
+
* Groups where all mags <= 1: ONE pass.
|
| 87 |
+
* Groups where all mags == 0: ZERO passes. Skip entirely.
|
| 88 |
+
* ============================================================ */
|
| 89 |
+
static void pmv(const PL *L, const float *x, float *y) {
|
| 90 |
+
const int od = L->out_dim, id = L->in_dim;
|
| 91 |
+
const int chunks = (id + 63) / 64;
|
| 92 |
+
const int id16 = (id + 15) & ~15;
|
| 93 |
+
|
| 94 |
+
float *xp = (float*)aligned_alloc(64, id16 * sizeof(float));
|
| 95 |
+
memcpy(xp, x, id * sizeof(float));
|
| 96 |
+
if (id16 > id) memset(xp + id, 0, (id16 - id) * sizeof(float));
|
| 97 |
+
|
| 98 |
+
#pragma omp parallel for schedule(dynamic, 64)
|
| 99 |
+
for (int i = 0; i < od; i++) {
|
| 100 |
+
const uint8_t *row_mag = L->mags + (size_t)i * id;
|
| 101 |
+
const uint64_t *row_sign = L->sign_bits + (size_t)i * chunks;
|
| 102 |
+
const int rmax = L->row_maxmag[i];
|
| 103 |
+
|
| 104 |
+
__m512 acc = _mm512_setzero_ps();
|
| 105 |
+
|
| 106 |
+
for (int j = 0; j < id; j += 16) {
|
| 107 |
+
if (j >= id16) break;
|
| 108 |
+
|
| 109 |
+
/* Load 16 magnitudes */
|
| 110 |
+
__m128i mv = _mm_loadu_si128((__m128i*)(row_mag + j));
|
| 111 |
+
|
| 112 |
+
/* Quick check: if all 16 mags are zero, skip entirely */
|
| 113 |
+
if (_mm_testz_si128(mv, mv)) continue;
|
| 114 |
+
|
| 115 |
+
__m512 xv = _mm512_load_ps(xp + j);
|
| 116 |
+
|
| 117 |
+
/* Extract 16 sign bits from bitpacked array */
|
| 118 |
+
int chunk_idx = j / 64;
|
| 119 |
+
int bit_off = j % 64;
|
| 120 |
+
uint64_t sbits = row_sign[chunk_idx];
|
| 121 |
+
uint16_t signs = (uint16_t)((sbits >> bit_off) & 0xFFFF);
|
| 122 |
+
|
| 123 |
+
/* Find max magnitude in this group of 16 */
|
| 124 |
+
/* Use SSE horizontal max */
|
| 125 |
+
__m128i mx = mv;
|
| 126 |
+
mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 8));
|
| 127 |
+
mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 4));
|
| 128 |
+
mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 2));
|
| 129 |
+
mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 1));
|
| 130 |
+
int local_max = _mm_extract_epi8(mx, 0);
|
| 131 |
+
|
| 132 |
+
/* Threshold vector for comparisons */
|
| 133 |
+
for (int m = 1; m <= local_max; m++) {
|
| 134 |
+
/* mask = (mag >= m) */
|
| 135 |
+
__m128i thresh = _mm_set1_epi8((char)m);
|
| 136 |
+
/* Compare: result is 0xFF where mag >= m, 0 otherwise */
|
| 137 |
+
/* SSE doesn't have >= for uint8, use: NOT(max(thresh, mag) == thresh XOR mag == thresh) */
|
| 138 |
+
/* Simpler: mag >= m iff mag - m doesn't underflow, i.e. saturating sub == 0 is false */
|
| 139 |
+
/* Or: max(mag, thresh) == mag means mag >= thresh */
|
| 140 |
+
__m128i cmp = _mm_cmpeq_epi8(_mm_max_epu8(mv, thresh), mv);
|
| 141 |
+
uint16_t active = (uint16_t)_mm_movemask_epi8(cmp);
|
| 142 |
+
|
| 143 |
+
__mmask16 pos = (__mmask16)(active & ~signs);
|
| 144 |
+
__mmask16 neg = (__mmask16)(active & signs);
|
| 145 |
+
|
| 146 |
+
acc = _mm512_mask_add_ps(acc, pos, acc, xv);
|
| 147 |
+
acc = _mm512_mask_sub_ps(acc, neg, acc, xv);
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
y[i] = _mm512_reduce_add_ps(acc) * L->scales[i];
|
| 152 |
+
if (L->bias) y[i] += L->bias[i];
|
| 153 |
+
}
|
| 154 |
+
free(xp);
|
| 155 |
+
}
|
| 156 |
+
|
| 157 |
+
/* FP16 matvec for lm_head: y = W x, converting half-precision weight rows
 * to fp32 16 lanes at a time, with a scalar tail loop. */
static void fmv(const FL *L, const float *x, float *y) {
#pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < L->od; i++) {
        __m512 acc = _mm512_setzero_ps();
        const uint16_t *row = L->w + (size_t)i * L->id;
        int j;
        for (j = 0; j + 16 <= L->id; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
            acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
        }
        float s = _mm512_reduce_add_ps(acc);
        /* Tail: convert one half value at a time. */
        for (; j < L->id; j++) {
            float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
            s += wf * x[j];
        }
        y[i] = s;
    }
}
|
| 176 |
+
|
| 177 |
+
/* RMSNorm: y = x * rsqrt(mean(x^2) + RMS_EPS) * w, vectorized 16-wide
 * with scalar tails for both the sum-of-squares and the scaling pass. */
static void rn(const float *x, const float *w, float *y, int d) {
    __m512 sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i+16 <= d; i += 16) {
        __m512 v = _mm512_loadu_ps(x+i);
        sq = _mm512_fmadd_ps(v, v, sq);
    }
    float ss = _mm512_reduce_add_ps(sq);
    for (; i < d; i++) ss += x[i]*x[i];
    float r = 1.0f / sqrtf(ss/d + RMS_EPS);
    __m512 rv = _mm512_set1_ps(r);
    for (i = 0; i+16 <= d; i += 16)
        _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
            _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
    for (; i < d; i++) y[i] = x[i]*r*w[i];
}
|
| 194 |
+
|
| 195 |
+
static void silu(float *x, int n) {
|
| 196 |
+
for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
|
| 197 |
+
}
|
| 198 |
+
/* Elementwise multiply: c[i] = a[i] * b[i], 16-wide with scalar tail. */
static void emul(const float *a, const float *b, float *c, int n) {
    int i;
    for (i = 0; i+16 <= n; i += 16)
        _mm512_storeu_ps(c+i, _mm512_mul_ps(_mm512_loadu_ps(a+i), _mm512_loadu_ps(b+i)));
    for (; i < n; i++) c[i] = a[i]*b[i];
}
|
| 204 |
+
/* Vector accumulate: y[i] += x[i], 16-wide with scalar tail. */
static void va(float *y, const float *x, int n) {
    int i;
    for (i = 0; i+16 <= n; i += 16)
        _mm512_storeu_ps(y+i, _mm512_add_ps(_mm512_loadu_ps(y+i), _mm512_loadu_ps(x+i)));
    for (; i < n; i++) y[i] += x[i];
}
|
| 210 |
+
static void rope(float *v, int pos, int d) {
|
| 211 |
+
for (int i = 0; i < d; i += 2) {
|
| 212 |
+
float f = 1.0f / powf(ROPE_THETA, (float)i/d);
|
| 213 |
+
float a = pos*f, co = cosf(a), si = sinf(a);
|
| 214 |
+
float v0 = v[i], v1 = v[i+1];
|
| 215 |
+
v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
|
| 216 |
+
}
|
| 217 |
+
}
|
| 218 |
+
static void sm(float *x, int n) {
|
| 219 |
+
float mx = x[0];
|
| 220 |
+
for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
|
| 221 |
+
float s = 0;
|
| 222 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
|
| 223 |
+
float iv = 1.0f/s;
|
| 224 |
+
for (int i = 0; i < n; i++) x[i] *= iv;
|
| 225 |
+
}
|
| 226 |
+
/* Dequantize the fp16 embedding row of token t into o[HIDDEN]. */
static void etok(const M *m, int t, float *o) {
    const uint16_t *r = m->emb + (size_t)t * HIDDEN;
    int i;
    for (i = 0; i+16 <= HIDDEN; i += 16)
        _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
    for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
}
|
| 233 |
+
/* Pointer into a KV cache buffer c for (layer l, position p, kv-head h). */
static float* kvp(float *c, int l, int p, int h) {
    return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
}
|
| 236 |
+
|
| 237 |
+
/* Single-token attention for layer l at position pos.
 * Reads the normed hidden state from m->h2 and leaves the attention block's
 * output (after o_proj) back in m->h2. Appends this position's K/V to the
 * cache, then attends over cached positions 0..pos with grouped-query
 * attention (GQA_RATIO query heads share each KV head). */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* Q/K/V projections (packed-unary matvecs) plus optional biases. */
    pmv(&ly->qp, m->h2, m->sq);
    pmv(&ly->kp, m->h2, m->sk);
    pmv(&ly->vp, m->h2, m->sv);
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* Rotary embeddings on Q and K, then append K/V to the cache. */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM);  /* 1/sqrt(d) attention scaling */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        int kvh = h / GQA_RATIO;  /* KV head shared by this query head */
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* Scaled dot-product scores against all cached keys (m->as reused
         * per head — heads run sequentially). */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* Weighted sum of cached values; negligible weights are skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* Output projection back into the [HIDDEN] scratch. */
    pmv(&ly->op, m->ao, m->h2);
}
|
| 280 |
+
|
| 281 |
+
/* SwiGLU MLP for layer l: m->h2 <- down( SiLU(gate(m->h2)) * up(m->h2) ).
 * Reads and overwrites m->h2; uses the [INTER] scratch buffers. */
static void do_mlp(M *m, int l) {
    Lay *ly = &m->lay[l];
    pmv(&ly->gp, m->h2, m->sg);
    pmv(&ly->up, m->h2, m->su);
    silu(m->sg, INTER);
    emul(m->sg, m->su, m->sd, INTER);
    pmv(&ly->dp, m->sd, m->h2);
}
|
| 289 |
+
|
| 290 |
+
/* Full forward pass for one token id at sequence position pos.
 * Returns m->lg, the [VOCAB] logits (valid until the next call). */
float* forward_token(M *m, int tid, int pos) {
    etok(m, tid, m->h);  /* embed the token into the residual stream */
    for (int l = 0; l < N_LAYERS; l++) {
        rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
        do_attn(m, l, pos);
        va(m->h, m->h2, HIDDEN);  /* residual add after attention */
        rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
        do_mlp(m, l);
        va(m->h, m->h2, HIDDEN);  /* residual add after MLP */
    }
    rn(m->h, m->fnorm, m->h2, HIDDEN);
    fmv(&m->lmh, m->h2, m->lg);
    return m->lg;
}
|
| 304 |
+
|
| 305 |
+
/* Temperature + top-p sampling over logits lg[0..V-1].
 * NOTE: mutates lg in place (temperature scaling, then softmax).
 * Candidate selection is a partial selection sort that stops once the
 * cumulative probability reaches tp, hard-capped at 50 candidates. */
static int samp(float *lg, int V, float T, float tp) {
    if (T > 0) { float it = 1.0f/T; for (int i = 0; i < V; i++) lg[i] *= it; }
    sm(lg, V);
    float *pr = (float*)malloc(V*4); int *ix = (int*)malloc(V*4);
    memcpy(pr, lg, V*4);
    for (int i = 0; i < V; i++) ix[i] = i;
    /* Move the largest probabilities to the front of pr/ix until their
     * cumulative mass reaches tp (or the cap is hit). */
    float cum = 0; int nk = 0;
    while (cum < tp && nk < V && nk < 50) {
        int b = nk;
        for (int i = nk+1; i < V; i++) if (pr[i] > pr[b]) b = i;
        float t = pr[nk]; pr[nk] = pr[b]; pr[b] = t;
        int ti = ix[nk]; ix[nk] = ix[b]; ix[b] = ti;
        cum += pr[nk]; nk++;
    }
    /* Renormalize over the kept set and draw from it. */
    float s = 0; for (int i = 0; i < nk; i++) s += pr[i];
    float r = (float)rand()/RAND_MAX * s, ac = 0;
    int ch = ix[0];
    for (int i = 0; i < nk; i++) { ac += pr[i]; if (ac >= r) { ch = ix[i]; break; } }
    free(pr); free(ix);
    return ch;
}
|
| 326 |
+
|
| 327 |
+
/* Autoregressive generation: prefill the prompt pr[0..pl-1], then emit up
 * to mx tokens into out[]. Returns the number of tokens written (the EOS
 * token, when hit, is included). T <= 0 selects greedy argmax; otherwise
 * temperature-T / top-p sampling via samp(). */
int generate(M *m, const int *pr, int pl, int *out, int mx,
             float T, float tp, int eos) {
    /* FIX: seed the RNG once per process. The original called
     * srand(time(NULL)) on every invocation, so two generate() calls
     * within the same second replayed the identical sampling sequence. */
    static int seeded = 0;
    if (!seeded) { srand((unsigned)time(NULL)); seeded = 1; }

    /* Prefill: run every prompt token to populate the KV cache; m->lg then
     * holds the logits following the last prompt token. */
    for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);

    int pos = pl, gen = 0;
    for (int t = 0; t < mx; t++) {
        int nx;
        if (T <= 0) {
            /* Greedy decoding: argmax over the vocabulary. */
            nx = 0;
            for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
        } else {
            nx = samp(m->lg, VOCAB, T, tp);
        }
        out[t] = nx; gen++;
        if (nx == eos) break;
        forward_token(m, nx, pos); pos++;
    }
    return gen;
}
|
| 346 |
+
|
| 347 |
+
/* Allocate the model shell: zeroed KV cache plus all scratch buffers.
 * Weight pointers are wired afterwards via model_set_* / layer_set_*.
 * NOTE(review): allocation results are unchecked — a failed calloc or
 * aligned_alloc would crash on first use. */
M* model_alloc(void) {
    M *m = (M*)calloc(1, sizeof(M));
    size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;  /* floats per cache */
    m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
    m->h = (float*)aligned_alloc(64,HIDDEN*4);
    m->h2 = (float*)aligned_alloc(64,HIDDEN*4);
    m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sg = (float*)aligned_alloc(64,INTER*4);
    m->su = (float*)aligned_alloc(64,INTER*4);
    m->sd = (float*)aligned_alloc(64,INTER*4);
    m->lg = (float*)aligned_alloc(64,VOCAB*4);
    m->as = (float*)aligned_alloc(64,MAX_SEQ*4);
    m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);
    printf("Alloc: KV=%zuMB\n", kv*2*4/1024/1024);  /* K + V caches, 4 bytes/float */
    return m;
}
|
| 366 |
+
|
| 367 |
+
/* Weight wiring: pointers passed in here are borrowed from the caller
 * (the Python loader keeps the backing arrays alive); the engine never
 * frees them. Exception: the final-norm weights are copied. */
void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
    m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
}
void layer_set_norms(M *m, int l, float *i, float *p) {
    m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
}
void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
    m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
}
/* Populate one packed-linear descriptor from its raw arrays; bias starts
 * NULL (set separately via layer_set_bias for q/k/v). */
void set_pl(PL *p, uint8_t *mags, uint64_t *signs, float *scales,
            uint8_t *rmm, int od, int id) {
    p->mags = mags; p->sign_bits = signs; p->scales = scales;
    p->row_maxmag = rmm; p->out_dim = od; p->in_dim = id; p->bias = NULL;
}
|
| 383 |
+
void layer_set_linears(M *m, int l,
|
| 384 |
+
uint8_t*qm,uint64_t*qs,float*qc,uint8_t*qx,int qo,int qi,
|
| 385 |
+
uint8_t*km,uint64_t*ks,float*kc,uint8_t*kx,int ko,int ki,
|
| 386 |
+
uint8_t*vm,uint64_t*vs,float*vc,uint8_t*vx,int vo,int vi,
|
| 387 |
+
uint8_t*om,uint64_t*os_,float*oc,uint8_t*ox,int oo,int oi,
|
| 388 |
+
uint8_t*gm,uint64_t*gs,float*gc,uint8_t*gx,int go,int gi,
|
| 389 |
+
uint8_t*um,uint64_t*us,float*uc,uint8_t*ux,int uo,int ui,
|
| 390 |
+
uint8_t*dm,uint64_t*ds,float*dc,uint8_t*dx,int doo,int di) {
|
| 391 |
+
set_pl(&m->lay[l].qp,qm,qs,qc,qx,qo,qi);
|
| 392 |
+
set_pl(&m->lay[l].kp,km,ks,kc,kx,ko,ki);
|
| 393 |
+
set_pl(&m->lay[l].vp,vm,vs,vc,vx,vo,vi);
|
| 394 |
+
set_pl(&m->lay[l].op,om,os_,oc,ox,oo,oi);
|
| 395 |
+
set_pl(&m->lay[l].gp,gm,gs,gc,gx,go,gi);
|
| 396 |
+
set_pl(&m->lay[l].up,um,us,uc,ux,uo,ui);
|
| 397 |
+
set_pl(&m->lay[l].dp,dm,ds,dc,dx,doo,di);
|
| 398 |
+
}
|
| 399 |
+
/* Zero both KV caches so a fresh sequence can be generated. */
void model_reset_cache(M *m) {
    size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    memset(m->kc,0,kv*4); memset(m->vc,0,kv*4);
}
|
| 403 |
+
/* Free everything model_alloc allocated. Weight arrays wired in from the
 * loader (emb, lmh.w, per-layer pointers) are borrowed and NOT freed here. */
void model_free(M *m) {
    free(m->kc);free(m->vc);free(m->h);free(m->h2);
    free(m->sq);free(m->sk);free(m->sv);free(m->ao);
    free(m->sg);free(m->su);free(m->sd);
    free(m->lg);free(m->as);free(m->fnorm);free(m);
}
|
packed_loader.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Packed unary loader. Loads weights, passes pointers to C engine."""
|
| 3 |
+
import ctypes, os, sys, time, json
|
| 4 |
+
import numpy as np
|
| 5 |
+
from ctypes import c_int, c_float, c_void_p, POINTER, c_uint8, c_uint64
|
| 6 |
+
|
| 7 |
+
class PackedEngine:
    """ctypes front-end for the packed-unary C inference engine.

    Loads the quantized weight files described by ``manifest.json`` into
    numpy arrays and passes their raw pointers to the shared library,
    which owns the forward pass, KV cache, and sampling loop.
    """

    def __init__(self, model_dir, engine_path="./packed_engine.so"):
        """Open the shared library and load all weights from *model_dir*."""
        self.lib = ctypes.CDLL(engine_path)
        self.lib.model_alloc.restype = c_void_p
        self.lib.forward_token.restype = POINTER(c_float)
        self.model_dir = model_dir

        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)
        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)

        # The C side keeps raw pointers into these arrays, so they must
        # stay referenced for the engine's whole lifetime.
        self.arrays = []
        self.model = self.lib.model_alloc()
        self._load_weights()

    def _keep(self, arr):
        """Pin *arr* against garbage collection; return its data pointer."""
        self.arrays.append(arr)
        return arr.ctypes.data

    def _load_file(self, key, ext, dtype):
        """Read the flat binary file for tensor *key* with extension *ext*."""
        path = os.path.join(self.model_dir, key.replace(".", "_") + ext)
        return np.fromfile(path, dtype=dtype)

    def _load_f32(self, key):
        """Load an fp16-stored tensor and widen to float32 (C expects f32)."""
        return self._load_file(key, ".fp16", np.float16).astype(np.float32)

    def _load_weights(self):
        """Load embeddings, lm_head, norms, biases and packed linears into C."""
        t0 = time.time()
        fp16_keys = self.manifest["fp16"]
        packed_keys = self.manifest["packed"]

        # Embeddings stay fp16 — the engine consumes the raw half bits.
        emb = self._load_file("model.embed_tokens.weight", ".fp16", np.uint16)
        self.lib.model_set_embed(self.model, self._keep(emb))
        print(f" Embeddings: {emb.nbytes/1e6:.1f} MB")

        # LM head, also raw fp16.
        lm = self._load_file("lm_head.weight", ".fp16", np.uint16)
        od, id_ = fp16_keys["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, self._keep(lm), od, id_)
        print(f" LM head: {lm.nbytes/1e6:.1f} MB")

        # Final norm. BUGFIX: this tensor used to be loaded twice — first
        # misread as uint16 (wrong) and then correctly as float16; the
        # first load was dead code. Load once, correctly.
        fn = self._load_f32("model.norm.weight")
        self.lib.model_set_final_norm(self.model, self._keep(fn))

        n_layers = self.config["num_hidden_layers"]
        for l in range(n_layers):
            pfx = f"model.layers.{l}"

            # Per-layer norms.
            in_f = self._load_f32(f"{pfx}.input_layernorm.weight")
            pn_f = self._load_f32(f"{pfx}.post_attention_layernorm.weight")
            self.lib.layer_set_norms(self.model, l, self._keep(in_f), self._keep(pn_f))

            # Q/K/V biases are optional (present in Qwen-style checkpoints).
            qb_key = f"{pfx}.self_attn.q_proj.bias"
            if qb_key in fp16_keys:
                qb = self._load_f32(qb_key)
                kb = self._load_f32(f"{pfx}.self_attn.k_proj.bias")
                vb = self._load_f32(f"{pfx}.self_attn.v_proj.bias")
                self.lib.layer_set_bias(self.model, l,
                                        self._keep(qb), self._keep(kb), self._keep(vb))
            else:
                self.lib.layer_set_bias(self.model, l, None, None, None)

            # 7 packed linears, order must match the C layer_set_linears.
            args = []
            for name in ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj',
                         'self_attn.o_proj', 'mlp.gate_proj', 'mlp.up_proj', 'mlp.down_proj']:
                key = f"{pfx}.{name}.weight"
                od, id_ = packed_keys[key]
                mags = self._load_file(key, ".mags", np.uint8)
                signs = self._load_file(key, ".signs", np.uint64)
                scales = self._load_file(key, ".scales", np.float32)
                rmm = self._load_file(key, ".rmm", np.uint8)
                args.extend([self._keep(mags), self._keep(signs),
                             self._keep(scales), self._keep(rmm), od, id_])

            self.lib.layer_set_linears(self.model, l, *args)

            if (l + 1) % 7 == 0 or l == n_layers - 1:
                print(f" Loaded {l+1}/{n_layers} layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self.arrays)
        print(f"\nModel loaded in {dt:.1f}s, {total/1e6:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=100, temperature=0.6, top_p=0.9, eos_id=151643):
        """Sample up to *max_new_tokens* tokens after *token_ids*.

        Resets the KV cache, then runs the C sampling loop.
        Returns (tokens, count, elapsed_seconds).
        """
        prompt = (c_int * len(token_ids))(*token_ids)
        output = (c_int * max_new_tokens)()
        self.lib.model_reset_cache(self.model)
        t0 = time.time()
        n = self.lib.generate(self.model, prompt, len(token_ids),
                              output, max_new_tokens, c_float(temperature),
                              c_float(top_p), eos_id)
        dt = time.time() - t0
        tokens = [output[i] for i in range(n)]
        return tokens, n, dt
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # CLI: packed_loader.py [model_dir] [tokenizer_dir]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "deepseek-r1-1.5b-packed"
    tok_dir = argv[2] if len(argv) > 2 else "deepseek-r1-1.5b-hf"

    print("Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(tok_dir, trust_remote_code=True)
    print("Loading packed unary engine...")
    engine = PackedEngine(model_dir, "./packed_engine.so")

    # Quick smoke prompts with throughput reporting.
    test_prompts = [
        "What is 2+2?",
        "Explain gravity in one sentence.",
        "Write a haiku about snow.",
    ]
    for prompt in test_prompts:
        chat = [{"role": "user", "content": prompt}]
        ids = tok.apply_chat_template(chat, add_generation_prompt=True)
        tokens, n, dt = engine.generate(ids, max_new_tokens=100, temperature=0.6)
        text = tok.decode(tokens, skip_special_tokens=False)
        print(f"\n[{prompt}] ({n} tok, {dt:.1f}s, {n/dt:.1f} tok/s)")
        print(text[:300])
        print("---")
|
proper_unary
ADDED
|
Binary file (26 kB). View file
|
|
|
proper_unary.c
ADDED
|
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PROPER UNARY — ONE QUANTUM, NO SCALES
|
| 3 |
+
*
|
| 4 |
+
* Every single bit in the entire system has weight = 1 quantum.
|
| 5 |
+
* The quantum is set ONCE for the whole model.
|
| 6 |
+
* There are NO per-vector scales. NO per-row scales.
|
| 7 |
+
*
|
| 8 |
+
* The number 5.0 with quantum=0.1 is stored as 50 ones.
|
| 9 |
+
* The number 5.0 with quantum=0.01 is stored as 500 ones.
|
| 10 |
+
* More precision = more bits. That's the tradeoff.
|
| 11 |
+
*
|
| 12 |
+
* ADDITION = CONCATENATION. Always. No exceptions.
|
| 13 |
+
* Because every bit everywhere means the same thing.
|
| 14 |
+
*
|
| 15 |
+
* MATMUL: y[i] = sum_j W[i][j] * x[j]
|
| 16 |
+
* = sum over all (w_slot, x_slot) pairs:
|
| 17 |
+
* popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
|
| 18 |
+
* - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
|
| 19 |
+
* = quantum² * integer_count
|
| 20 |
+
*
|
| 21 |
+
* Output quantum = input_quantum² (magnitude grows)
|
| 22 |
+
* Or we pick output quantum = input_quantum and accept
|
| 23 |
+
* that the integer count includes the scaling.
|
| 24 |
+
*
|
| 25 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 26 |
+
*/
|
| 27 |
+
|
| 28 |
+
#define _POSIX_C_SOURCE 199309L
|
| 29 |
+
#include <immintrin.h>
|
| 30 |
+
#include <omp.h>
|
| 31 |
+
#include <stdint.h>
|
| 32 |
+
#include <stdlib.h>
|
| 33 |
+
#include <string.h>
|
| 34 |
+
#include <math.h>
|
| 35 |
+
#include <stdio.h>
|
| 36 |
+
#include <time.h>
|
| 37 |
+
|
| 38 |
+
/* ============================================================
|
| 39 |
+
* PROPER UNARY VECTOR
|
| 40 |
+
* Every bit = 1 quantum. No local scale.
|
| 41 |
+
* ============================================================ */
|
| 42 |
+
/* Unary vector. Element i's magnitude is the number of slot bitplanes
 * in which bit i is set; its sign comes from the sign bitplane. */
typedef struct {
uint64_t *sign;  /* [chunks] sign bitplane: bit set => element is negative */
uint64_t *slots; /* [n_slots * chunks] one bitplane per unary "count" slot */
int dim;         /* number of logical elements */
int chunks;      /* 64-bit words per bitplane = ceil(dim / 64) */
int n_slots;     /* slots currently in use */
int cap;         /* max slots allocated */
} UVec;
|
| 50 |
+
|
| 51 |
+
/* Proper unary matrix — same quantum as vectors */
|
| 52 |
+
typedef struct {
uint64_t *sign;  /* [rows * chunks] per-row sign bitplanes */
uint64_t *slots; /* [K * rows * chunks] K unary bitplanes for every row */
int rows, cols, chunks, K; /* chunks = ceil(cols/64); K = max magnitude in quanta */
} UMat;
|
| 57 |
+
|
| 58 |
+
/* Global system quantum */
|
| 59 |
+
/* Global quantization system: a single quantum shared by every tensor. */
typedef struct {
float quantum; /* every bit = this much */
/* quantum² is the matmul output unit */
} USystem;
|
| 63 |
+
|
| 64 |
+
/* ============================================================
|
| 65 |
+
* ALLOC
|
| 66 |
+
* ============================================================ */
|
| 67 |
+
UVec* uv_new(int dim, int cap) {
|
| 68 |
+
UVec *v = (UVec *)calloc(1, sizeof(UVec));
|
| 69 |
+
v->dim = dim;
|
| 70 |
+
v->chunks = (dim + 63) / 64;
|
| 71 |
+
v->n_slots = 0;
|
| 72 |
+
v->cap = cap;
|
| 73 |
+
v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
|
| 74 |
+
v->slots = (uint64_t *)aligned_alloc(64, (size_t)cap * v->chunks * sizeof(uint64_t));
|
| 75 |
+
memset(v->sign, 0, v->chunks * sizeof(uint64_t));
|
| 76 |
+
memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t));
|
| 77 |
+
return v;
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
UMat* um_new(int rows, int cols, int K) {
|
| 81 |
+
UMat *m = (UMat *)calloc(1, sizeof(UMat));
|
| 82 |
+
m->rows = rows; m->cols = cols; m->K = K;
|
| 83 |
+
m->chunks = (cols + 63) / 64;
|
| 84 |
+
m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 85 |
+
m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 86 |
+
memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 87 |
+
memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 88 |
+
return m;
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
void uv_free(UVec *v) { if(v){free(v->sign);free(v->slots);free(v);} }
|
| 92 |
+
void um_free(UMat *m) { if(m){free(m->sign);free(m->slots);free(m);} }
|
| 93 |
+
|
| 94 |
+
/* ============================================================
|
| 95 |
+
* QUANTIZE: float → proper unary
|
| 96 |
+
*
|
| 97 |
+
* Given global quantum q:
|
| 98 |
+
* magnitude = round(|value| / q)
|
| 99 |
+
* That many slots get bit set.
|
| 100 |
+
*
|
| 101 |
+
* NO per-vector absmax. NO local scale.
|
| 102 |
+
* Values that exceed K are clipped.
|
| 103 |
+
* ============================================================ */
|
| 104 |
+
/*
 * Quantize floats into unary form with the global quantum:
 * element i gets round(|x[i]|/quantum) bits, one per slot bitplane,
 * clipped to K; its sign lands in the sign bitplane. n_slots is set
 * to K regardless of how many planes are actually populated.
 */
void uv_from_float(UVec *v, const float *x, int K, float quantum) {
    const int dim = v->dim;
    const int chunks = v->chunks;
    const float inv_q = 1.0f / quantum;

    v->n_slots = K;
    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    for (int i = 0; i < dim; i++) {
        const int c = i >> 6;
        const uint64_t bit = 1ULL << (i & 63);

        if (x[i] < 0.0f)
            v->sign[c] |= bit;

        int mag = (int)(fabsf(x[i]) * inv_q + 0.5f);
        if (mag > K)
            mag = K; /* clip: value exceeds representable range */
        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}
|
| 124 |
+
|
| 125 |
+
void uv_to_float(const UVec *v, float *out, float quantum) {
|
| 126 |
+
int dim = v->dim, chunks = v->chunks;
|
| 127 |
+
|
| 128 |
+
for (int i = 0; i < dim; i++) {
|
| 129 |
+
int c = i / 64;
|
| 130 |
+
uint64_t bit = 1ULL << (i % 64);
|
| 131 |
+
|
| 132 |
+
int mag = 0;
|
| 133 |
+
for (int s = 0; s < v->n_slots; s++)
|
| 134 |
+
if (v->slots[(size_t)s * chunks + c] & bit)
|
| 135 |
+
mag++;
|
| 136 |
+
|
| 137 |
+
out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum;
|
| 138 |
+
}
|
| 139 |
+
}
|
| 140 |
+
|
| 141 |
+
/*
 * Quantize a dense float matrix into unary bitplanes using the global
 * quantum. Row r, column j gets round(|w|/quantum) bits (clipped to K)
 * spread across the K slot planes; signs go in the per-row sign plane.
 */
void um_from_float(UMat *m, const float *data, float quantum) {
    const int rows = m->rows;
    const int cols = m->cols;
    const int K = m->K;
    const int chunks = m->chunks;
    const float inv_q = 1.0f / quantum;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        uint64_t *rs = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            const int c = j >> 6;
            const uint64_t bit = 1ULL << (j & 63);

            if (row[j] < 0.0f)
                rs[c] |= bit;

            int mag = (int)(fabsf(row[j]) * inv_q + 0.5f);
            if (mag > K)
                mag = K; /* clip to available bitplanes */
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
|
| 164 |
+
|
| 165 |
+
/* ============================================================
|
| 166 |
+
* CONCATENATION = ADDITION
|
| 167 |
+
*
|
| 168 |
+
* Since every bit everywhere = same quantum,
|
| 169 |
+
* appending slots IS adding magnitudes. Period.
|
| 170 |
+
*
|
| 171 |
+
* Sign handling: for elements where signs differ,
|
| 172 |
+
* cancel bits from existing slots.
|
| 173 |
+
* ============================================================ */
|
| 174 |
+
/*
 * Append src's slot bitplanes onto dst, which — because every bit in the
 * system has the same quantum — IS element-wise addition.
 *
 * Sign handling per 64-bit chunk:
 *   - where dst and src agree in sign, src bits are appended as a new slot;
 *   - where they disagree, each src bit cancels one dst bit, scanning dst's
 *     slots from newest to oldest;
 *   - bits still uncancelled after the scan mean |src| > |dst| for that
 *     element: the element's sign flips and the leftover bits are appended.
 *
 * A new slot is only committed (n_slots++) if it ended up non-empty.
 * On overflow (cap reached) the remaining src slots are dropped with a
 * diagnostic — the result is then clipped, not exact.
 */
void uv_concat(UVec *dst, const UVec *src) {
int chunks = dst->chunks;

for (int s = 0; s < src->n_slots; s++) {
if (dst->n_slots >= dst->cap) {
printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
return;
}

const uint64_t *src_slot = src->slots + (size_t)s * chunks;
uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks;

for (int c = 0; c < chunks; c++) {
uint64_t sb = src_slot[c];
uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
uint64_t disagree = dst->sign[c] ^ src->sign[c];

/* Same sign: straight append */
uint64_t add = sb & agree;

/* Different sign: cancel from existing */
uint64_t cancel = sb & disagree;
for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
uint64_t *ds = dst->slots + (size_t)d * chunks + c;
uint64_t overlap = *ds & cancel;
*ds &= ~overlap;
cancel &= ~overlap;
}
/* Leftover cancel = src magnitude exceeds dst, flip sign */
if (cancel) {
dst->sign[c] ^= cancel;
add |= cancel;
}

new_slot[c] = add;
}

/* Check if slot has any bits; only then does it count */
int any = 0;
for (int c = 0; c < chunks && !any; c++)
if (new_slot[c]) any = 1;
if (any) dst->n_slots++;
}
}
|
| 218 |
+
|
| 219 |
+
/* ============================================================
|
| 220 |
+
* MATMUL: y = M @ x
|
| 221 |
+
*
|
| 222 |
+
* Output unit = quantum² (one quantum from weight × one from activation)
|
| 223 |
+
* The integer accumulator directly gives the value in units of quantum².
|
| 224 |
+
*
|
| 225 |
+
* To keep everything in the same quantum system:
|
| 226 |
+
* y_float[i] = acc * quantum²
|
| 227 |
+
* Then requantize to unary with the SAME global quantum.
|
| 228 |
+
* y_mag[i] = acc * quantum² / quantum = acc * quantum
|
| 229 |
+
*
|
| 230 |
+
* ============================================================ */
|
| 231 |
+
void uv_matmul(
|
| 232 |
+
const UMat *M, const UVec *x,
|
| 233 |
+
UVec *y, int K_out, float quantum
|
| 234 |
+
) {
|
| 235 |
+
int out_dim = M->rows;
|
| 236 |
+
int chunks = M->chunks;
|
| 237 |
+
int wK = M->K;
|
| 238 |
+
int xK = x->n_slots;
|
| 239 |
+
|
| 240 |
+
float q2 = quantum * quantum;
|
| 241 |
+
|
| 242 |
+
y->n_slots = K_out;
|
| 243 |
+
memset(y->sign, 0, y->chunks * sizeof(uint64_t));
|
| 244 |
+
memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));
|
| 245 |
+
|
| 246 |
+
/* Compute integer dot products */
|
| 247 |
+
int *acc = (int *)aligned_alloc(64, out_dim * sizeof(int));
|
| 248 |
+
uint8_t *neg = (uint8_t *)calloc(out_dim, 1);
|
| 249 |
+
|
| 250 |
+
#pragma omp parallel for schedule(dynamic, 32)
|
| 251 |
+
for (int i = 0; i < out_dim; i++) {
|
| 252 |
+
const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
|
| 253 |
+
long long a = 0;
|
| 254 |
+
|
| 255 |
+
for (int c = 0; c < chunks; c++) {
|
| 256 |
+
uint64_t same = ~(w_sign_row[c] ^ x->sign[c]);
|
| 257 |
+
uint64_t diff = w_sign_row[c] ^ x->sign[c];
|
| 258 |
+
|
| 259 |
+
for (int p = 0; p < wK; p++) {
|
| 260 |
+
uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
|
| 261 |
+
for (int q = 0; q < xK; q++) {
|
| 262 |
+
uint64_t xq = x->slots[(size_t)q * chunks + c];
|
| 263 |
+
uint64_t active = wp & xq;
|
| 264 |
+
a += __builtin_popcountll(active & same)
|
| 265 |
+
- __builtin_popcountll(active & diff);
|
| 266 |
+
}
|
| 267 |
+
}
|
| 268 |
+
}
|
| 269 |
+
|
| 270 |
+
/* a is in units of quantum² per quantum = a * quantum gives magnitude in quantums */
|
| 271 |
+
float val = (float)a * quantum;
|
| 272 |
+
int mag = (int)(fabsf(val) + 0.5f);
|
| 273 |
+
if (mag > K_out) mag = K_out;
|
| 274 |
+
acc[i] = mag;
|
| 275 |
+
neg[i] = (val < 0.0f) ? 1 : 0;
|
| 276 |
+
}
|
| 277 |
+
|
| 278 |
+
/* Encode directly to unary — no float intermediate */
|
| 279 |
+
for (int i = 0; i < out_dim; i++) {
|
| 280 |
+
int c = i / 64;
|
| 281 |
+
uint64_t bit = 1ULL << (i % 64);
|
| 282 |
+
if (neg[i]) y->sign[c] |= bit;
|
| 283 |
+
for (int s = 0; s < acc[i]; s++)
|
| 284 |
+
y->slots[(size_t)s * y->chunks + c] |= bit;
|
| 285 |
+
}
|
| 286 |
+
|
| 287 |
+
free(acc); free(neg);
|
| 288 |
+
}
|
| 289 |
+
|
| 290 |
+
/* ============================================================
|
| 291 |
+
* RMSNORM — resets slot count, keeps same quantum
|
| 292 |
+
* ============================================================ */
|
| 293 |
+
/*
 * RMSNorm over a unary vector: decode to float, normalize by
 * 1/sqrt(mean(x²)+eps), apply the per-element weight, then requantize
 * into `out` with the SAME global quantum (fresh K_out slot budget).
 *
 * BUGFIX: the scratch buffer size passed to aligned_alloc must be a
 * multiple of the 64-byte alignment per C11; dim*sizeof(float) is not
 * in general — round it up.
 */
void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
    int dim = x->dim;
    size_t buf_bytes = ((size_t)dim * sizeof(float) + 63) & ~(size_t)63;
    float *xf = (float *)aligned_alloc(64, buf_bytes);
    uv_to_float(x, xf, quantum);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);
    for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];

    uv_from_float(out, xf, K_out, quantum);
    free(xf);
}
|
| 306 |
+
|
| 307 |
+
/* ============================================================
|
| 308 |
+
* TESTS
|
| 309 |
+
* ============================================================ */
|
| 310 |
+
|
| 311 |
+
/* Round-trip test: quantize two vectors at quantum 0.25, add them via
 * uv_concat, and compare against the exact float sum (all inputs are
 * multiples of the quantum, so the error column should be all zeros). */
void test_concat_correct() {
printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

float quantum = 0.25f; /* every bit = 0.25 */
int dim = 8;

/* A = [3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0]
* In quantum=0.25: magnitudes = [12, 8, 20, 4, 0, 16, 8, 28]
* Need K >= 28 slots to hold 7.0
*/
float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};

int K = 32;
UVec *a = uv_new(dim, 128);
UVec *b = uv_new(dim, 128);

uv_from_float(a, a_vals, K, quantum);
uv_from_float(b, b_vals, K, quantum);

float a_rec[8], b_rec[8];
uv_to_float(a, a_rec, quantum);
uv_to_float(b, b_rec, quantum);

printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum);
printf("A original: "); for(int i=0;i<8;i++) printf("%6.2f ",a_vals[i]); printf("\n");
printf("A unary: "); for(int i=0;i<8;i++) printf("%6.2f ",a_rec[i]); printf("\n");
printf("B original: "); for(int i=0;i<8;i++) printf("%6.2f ",b_vals[i]); printf("\n");
printf("B unary: "); for(int i=0;i<8;i++) printf("%6.2f ",b_rec[i]); printf("\n\n");

printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots);
uv_concat(a, b);
printf("After concat: %d slots\n\n", a->n_slots);

float result[8];
uv_to_float(a, result, quantum);

printf("Expected A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]); printf("\n");
printf("Concat A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",result[i]); printf("\n");
printf("Error: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]-result[i]); printf("\n");

uv_free(a); uv_free(b);
}
|
| 355 |
+
|
| 356 |
+
/* Repeatedly concat the same delta onto an accumulator five times,
 * tracking a float reference alongside; reports max abs error per step
 * (deltas are multiples of the quantum, so it should stay at 0). */
void test_chain_concat() {
printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

float quantum = 0.1f;
int dim = 4;
int K = 64;

float vals[] = {1.0, -2.0, 3.0, -0.5};
UVec *acc = uv_new(dim, 512);
uv_from_float(acc, vals, K, quantum);

printf("Start: ");
float tmp[4];
uv_to_float(acc, tmp, quantum);
for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
printf(" (%d slots)\n", acc->n_slots);

float expected[] = {1.0, -2.0, 3.0, -0.5};

for (int step = 0; step < 5; step++) {
float add_vals[] = {0.5, 0.3, -1.0, 0.7};
UVec *delta = uv_new(dim, K);
uv_from_float(delta, add_vals, K, quantum);

uv_concat(acc, delta);

for (int i = 0; i < 4; i++) expected[i] += add_vals[i];

uv_to_float(acc, tmp, quantum);
printf(" +[0.5,0.3,-1.0,0.7] = ");
for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
printf(" (%d slots) expect:", acc->n_slots);
for(int i=0;i<4;i++) printf("%6.2f ",expected[i]);

/* Check error */
float max_err = 0;
for(int i=0;i<4;i++) {
float e = fabsf(expected[i] - tmp[i]);
if (e > max_err) max_err = e;
}
printf(" err=%.2f\n", max_err);

uv_free(delta);
}

uv_free(acc);
}
|
| 403 |
+
|
| 404 |
+
/* Benchmark + accuracy check for uv_matmul on a random 512x256 system:
 * builds a float reference product, derives a quantum from the data
 * range (data_max/wK), runs the unary matmul, and reports cosine
 * similarity, SNR in dB, and wall time against the reference. */
void test_matmul() {
printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

int rows = 512, cols = 256;
int wK = 32, xK = 32;

srand(42);
float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
float *xf = (float *)malloc(cols * sizeof(float));
float *y_ref = (float *)calloc(rows, sizeof(float));

/* Small values so magnitudes fit in K slots */
for (size_t i = 0; i < (size_t)rows * cols; i++)
Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
for (int i = 0; i < cols; i++)
xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

/* Find quantum that fits the data range */
float data_max = 0;
for (size_t i = 0; i < (size_t)rows * cols; i++) {
float a = fabsf(Mf[i]);
if (a > data_max) data_max = a;
}
for (int i = 0; i < cols; i++) {
float a = fabsf(xf[i]);
if (a > data_max) data_max = a;
}
float quantum = data_max / wK;

printf("Data range: [-%.2f, %.2f]\n", data_max, data_max);
printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK);
printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK);

UMat *M = um_new(rows, cols, wK);
UVec *x = uv_new(cols, xK);

um_from_float(M, Mf, quantum);
uv_from_float(x, xf, xK, quantum);

/* Output needs enough K for the matmul result range */
float ymax = 0;
for (int i = 0; i < rows; i++) {
float a = fabsf(y_ref[i]);
if (a > ymax) ymax = a;
}
int K_out = (int)(ymax / quantum + 1);
if (K_out > 4096) K_out = 4096;
printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out);

UVec *y = uv_new(rows, K_out);

struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
uv_matmul(M, x, y, K_out, quantum);
clock_gettime(CLOCK_MONOTONIC, &t1);
double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

float *yf = (float *)malloc(rows * sizeof(float));
uv_to_float(y, yf, quantum);

/* cosine similarity + SNR of unary result vs float reference */
float dot = 0, na = 0, nb = 0, noise = 0;
for (int i = 0; i < rows; i++) {
dot += y_ref[i] * yf[i];
na += y_ref[i] * y_ref[i];
nb += yf[i] * yf[i];
float e = y_ref[i] - yf[i]; noise += e * e;
}
float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
float snr = 10.0f * log10f(na / (noise + 1e-10f));

printf("\nCosine: %.6f\n", cosine);
printf("SNR: %.1f dB\n", snr);
printf("Time: %.1f ms\n", ms);

printf("\nFirst 10 values:\n");
printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
for (int i = 0; i < 10; i++)
printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]);

um_free(M); uv_free(x); uv_free(y);
free(Mf); free(xf); free(y_ref); free(yf);
}
|
| 489 |
+
|
| 490 |
+
/* Simulate a transformer residual stream: start from a random embedding,
 * add 10 random sublayer deltas via uv_concat, and report cosine / max
 * error against a float reference after every step. Capacity is sized
 * for all 10 concats up front so no overflow clipping occurs. */
void test_residual_chain() {
printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

float quantum = 0.05f;
int dim = 1024;
int K = 128; /* fits values up to 6.4 */

srand(123);
float *embed = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < dim; i++)
embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

/* Float reference: accumulate residuals */
float *ref = (float *)malloc(dim * sizeof(float));
memcpy(ref, embed, dim * sizeof(float));

/* Unary: grow via concat */
int total_cap = K + 10 * K; /* room for 10 concat operations */
UVec *residual = uv_new(dim, total_cap);
uv_from_float(residual, embed, K, quantum);

printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim);
printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

for (int step = 0; step < 10; step++) {
float *delta = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < dim; i++)
delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

/* Float reference */
for (int i = 0; i < dim; i++) ref[i] += delta[i];

/* Unary: concat */
UVec *d = uv_new(dim, K);
uv_from_float(d, delta, K, quantum);
uv_concat(residual, d);

/* Compare */
float *rec = (float *)malloc(dim * sizeof(float));
uv_to_float(residual, rec, quantum);

float dot = 0, na = 0, nb = 0, max_err = 0;
for (int i = 0; i < dim; i++) {
dot += ref[i] * rec[i];
na += ref[i] * ref[i];
nb += rec[i] * rec[i];
float e = fabsf(ref[i] - rec[i]);
if (e > max_err) max_err = e;
}
float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err);

uv_free(d); free(delta); free(rec);
}

uv_free(residual);
free(embed); free(ref);
}
|
| 549 |
+
|
| 550 |
+
/* Entry point: runs all proper-unary correctness demos in sequence. */
int main() {
    printf("================================================\n");
    printf(" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n");
    printf(" Every bit = 1 quantum. Concat = Add.\n");
    printf("================================================\n\n");

    test_concat_correct();
    test_chain_concat();
    test_matmul();
    test_residual_chain();

    printf("\n=== DONE ===\n");
    return 0;
}
|
pure_unary_engine.c
ADDED
|
@@ -0,0 +1,658 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PURE UNARY TRANSFORMER ENGINE
|
| 3 |
+
*
|
| 4 |
+
* ALL matrix multiplications use base-1 arithmetic:
|
| 5 |
+
* - Weights: unary encoded (sign + N magnitude planes)
|
| 6 |
+
* - Activations: unary encoded (sign + M magnitude planes)
|
| 7 |
+
* - Matmul = bitwise AND + popcount across plane pairs
|
| 8 |
+
* - Float only used for: RMSNorm, SiLU, Softmax, rescale, residual add
|
| 9 |
+
* - These are all O(dim) not O(dim²), so don't dominate
|
| 10 |
+
*
|
| 11 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 12 |
+
*/
|
| 13 |
+
|
| 14 |
+
#include <immintrin.h>
|
| 15 |
+
#include <omp.h>
|
| 16 |
+
#include <stdint.h>
|
| 17 |
+
#include <stdlib.h>
|
| 18 |
+
#include <string.h>
|
| 19 |
+
#include <math.h>
|
| 20 |
+
#include <stdio.h>
|
| 21 |
+
#include <time.h>
|
| 22 |
+
|
| 23 |
+
#define MAX_SEQ 4096
|
| 24 |
+
#define RMS_EPS 1e-6f
|
| 25 |
+
|
| 26 |
+
/* ============================================================
|
| 27 |
+
* Unary vector: a quantized 1D activation or intermediate
|
| 28 |
+
* ============================================================ */
|
| 29 |
+
typedef struct {
    uint64_t *sign;   /* [chunks] — bit i set means element i is negative */
    uint64_t *planes; /* [n_planes][chunks] — thermometer-coded magnitudes */
    float scale;      /* float value of one quantum (absmax / n_planes) */
    int dim;          /* logical element count */
    int chunks;       /* (dim + 63) / 64 words per plane */
    int n_planes;     /* magnitude resolution */
} UnaryVec;

/* ============================================================
 * Config
 * ============================================================ */
typedef struct {
    int hidden;          /* residual-stream width */
    int inter;           /* MLP intermediate width */
    int n_heads;
    int n_kv_heads;      /* grouped-query attention: KV heads <= Q heads */
    int head_dim;
    int n_layers;
    int vocab;
    float rope_theta;    /* RoPE base frequency */
    int tie_embeddings;  /* non-zero: embedding table doubles as LM head */
    int w_planes; /* weight quantization planes */
    int a_planes; /* activation quantization planes */
} Config;

/* Unary weight matrix */
typedef struct {
    uint64_t *sign_bits;  /* [out_dim][chunks] per-row sign bitplanes */
    uint64_t *mag_planes; /* magnitude planes; the matvec indexes them as
                             [n_planes][out_dim][chunks] */
    float *scales;        /* [out_dim] per-row float rescale factors */
    int out_dim;
    int in_dim;
    int n_planes;
    int chunks; /* = (in_dim + 63) / 64 */
} UnaryWeight;

/* Transformer layer */
typedef struct {
    UnaryWeight q_proj, k_proj, v_proj, o_proj;
    UnaryWeight gate_proj, up_proj, down_proj;
    float *input_norm;      /* pre-attention RMSNorm weights */
    float *post_norm;       /* pre-MLP RMSNorm weights */
    float *q_norm, *k_norm; /* optional per-head QK-norm weights (may be NULL) */
} Layer;

/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed;   /* FP16 embedding table; also the LM head when tied */
    Layer *layers;
    float *final_norm;

    /* KV cache (float - only O(seq × heads × dim) not O(dim²)) */
    float *k_cache;
    float *v_cache;

    /* Scratch - float buffers for non-matmul ops */
    float *hidden; /* residual stream */
    float *normed; /* after RMSNorm, before quantization */
    float *q_float;
    float *k_float;
    float *v_float;
    float *attn_out;
    float *gate_float;
    float *up_float;
    float *mlp_act; /* gate*up result before quantization */
    float *logits;
    float *attn_scores;

    /* Scratch - unary vectors for matmul inputs */
    UnaryVec uv_normed;
    UnaryVec uv_mlp_in;
    UnaryVec uv_mlp_act; /* for down_proj input */

    /* Output integer accumulators (avoid malloc per call) */
    int *acc_buf;
} Model;
|
| 107 |
+
|
| 108 |
+
/* ============================================================
|
| 109 |
+
* ACTIVATION QUANTIZATION: float -> unary
|
| 110 |
+
* Runs per-vector: one scale for entire vector
|
| 111 |
+
* O(dim) operation, not in the hot path
|
| 112 |
+
* ============================================================ */
|
| 113 |
+
/* Quantize a float vector into sign + thermometer magnitude bitplanes.
 * A single scale covers the whole vector: scale = absmax / n_planes.
 * An element whose magnitude rounds to m quanta sets its bit in planes
 * 0..m-1. O(dim) — runs once per matvec input, outside the hot loop. */
static void quantize_to_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    int n_chunks = (dim + 63) / 64;

    /* The per-vector absolute maximum determines the quantum size. */
    float peak = 0.0f;
    for (int j = 0; j < dim; j++) {
        float a = fabsf(x[j]);
        if (a > peak) peak = a;
    }
    if (peak == 0.0f) peak = 1.0f; /* all-zero input: any scale works */
    *scale_out = peak / n_planes;

    memset(sign_out, 0, n_chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * n_chunks * sizeof(uint64_t));

    float to_quanta = n_planes / peak;
    for (int j = 0; j < dim; j++) {
        int word = j / 64;
        uint64_t bit = 1ULL << (j % 64);

        /* Sign plane */
        if (x[j] < 0.0f)
            sign_out[word] |= bit;

        /* Round magnitude to the nearest quantum count, saturating. */
        int quanta = (int)(fabsf(x[j]) * to_quanta + 0.5f);
        if (quanta > n_planes) quanta = n_planes;
        for (int p = 0; p < quanta; p++)
            planes_out[(size_t)p * n_chunks + word] |= bit;
    }
}
|
| 150 |
+
|
| 151 |
+
/* ============================================================
|
| 152 |
+
* PURE UNARY MATVEC: y = W @ x
|
| 153 |
+
*
|
| 154 |
+
* Both W and x are unary encoded.
|
| 155 |
+
* Inner loop is purely: AND + popcount
|
| 156 |
+
* Float multiply happens ONCE per output element (rescale)
|
| 157 |
+
* ============================================================ */
|
| 158 |
+
/* ============================================================
 * PURE UNARY MATVEC: y = W @ x
 *
 * Both W and x are unary encoded (sign plane + thermometer
 * magnitude planes). The hot inner loop is only AND + popcount
 * over every (weight plane, activation plane) pair; one float
 * multiply per output element applies the combined scales.
 * ============================================================ */
static void pure_unary_matvec(
    const UnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out, /* float output for non-matmul ops */
    int *acc_buf  /* scratch for integer accumulators;
                     currently unused — kept for interface parity */
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

#pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;

        /* Sign-agreement masks are computed per-chunk inside the loop
         * to avoid any per-row allocation. */

        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            uint64_t same = ~(ws ^ xs); /* bits where signs agree -> +1 */
            uint64_t diff = ws ^ xs;    /* bits where signs differ -> -1 */

            for (int p = 0; p < wp; p++) {
                /* mag_planes layout (from this index): [n_planes][out_dim][chunks] */
                uint64_t w_mag = W->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;

                    /* Each active bit is one signed quantum-pair product */
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    acc += __builtin_popcountll(pos) - __builtin_popcountll(neg);
                }
            }
        }

        /* Single float rescale per output element */
        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
|
| 205 |
+
|
| 206 |
+
/* ============================================================
|
| 207 |
+
* FP16 embedding lookup (only used for embed/lm_head)
|
| 208 |
+
* ============================================================ */
|
| 209 |
+
/* ============================================================
 * FP16 embedding lookup (only used for embed/lm_head)
 * Converts one FP16 table row to float32 in `out`.
 * Requires AVX-512 (vcvtph2ps); the scalar tail handles
 * hidden % 16 leftover elements.
 * ============================================================ */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    /* 16 half-floats -> 16 floats per iteration */
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    /* Scalar tail: convert one half-float at a time */
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
|
| 223 |
+
|
| 224 |
+
/* FP16 matvec for lm_head (vocab is huge, keep as FP16) */
|
| 225 |
+
/* FP16 matvec for lm_head (vocab is huge, keep as FP16).
 * y[i] = dot(w_row_i (fp16), x (fp32)); rows are parallelized with OpenMP,
 * each row uses AVX-512 FMA with a scalar tail for in_dim % 16. */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
#pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            /* Load 16 fp16 weights, widen to fp32, fused multiply-add */
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Scalar tail */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
|
| 247 |
+
|
| 248 |
+
/* ============================================================
|
| 249 |
+
* O(dim) operations - float is fine here, not the bottleneck
|
| 250 |
+
* ============================================================ */
|
| 251 |
+
static void rmsnorm(const float *x, const float *w, float *y, int dim) {
|
| 252 |
+
float ss = 0.0f;
|
| 253 |
+
for (int i = 0; i < dim; i++) ss += x[i] * x[i];
|
| 254 |
+
float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
|
| 255 |
+
for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
/* QK-norm helper: applies RMSNorm over one head's slice (dim = head_dim). */
static void rmsnorm_head(const float *x, const float *w, float *y, int dim) {
    /* RMSNorm for a single attention head */
    rmsnorm(x, w, y, dim);
}
|
| 262 |
+
|
| 263 |
+
static void silu_mul(const float *gate, const float *up, float *out, int n) {
|
| 264 |
+
for (int i = 0; i < n; i++)
|
| 265 |
+
out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
/* In-place elementwise accumulate: y += x (used for residual adds). */
static void vec_add(float *y, const float *x, int n) {
    int k = n;
    while (k--) y[k] += x[k];
}
|
| 271 |
+
|
| 272 |
+
static void apply_rope(float *vec, int pos, int dim, float theta) {
|
| 273 |
+
for (int i = 0; i < dim; i += 2) {
|
| 274 |
+
float freq = 1.0f / powf(theta, (float)i / dim);
|
| 275 |
+
float angle = pos * freq;
|
| 276 |
+
float c = cosf(angle), s = sinf(angle);
|
| 277 |
+
float v0 = vec[i], v1 = vec[i + 1];
|
| 278 |
+
vec[i] = v0 * c - v1 * s;
|
| 279 |
+
vec[i + 1] = v0 * s + v1 * c;
|
| 280 |
+
}
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
static void softmax(float *x, int n) {
|
| 284 |
+
float mx = x[0];
|
| 285 |
+
for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
|
| 286 |
+
float sum = 0.0f;
|
| 287 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
|
| 288 |
+
float inv = 1.0f / sum;
|
| 289 |
+
for (int i = 0; i < n; i++) x[i] *= inv;
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
/* KV cache access */
|
| 293 |
+
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
|
| 294 |
+
return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
|
| 295 |
+
(size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
|
| 296 |
+
}
|
| 297 |
+
|
| 298 |
+
/* ============================================================
|
| 299 |
+
* ALLOC unary vector scratch
|
| 300 |
+
* ============================================================ */
|
| 301 |
+
/* Round a byte count up to a multiple of 64 so it is a legal
 * aligned_alloc(64, ...) size: C11 7.22.3.1 requires the size to be a
 * multiple of the alignment (glibc tolerates violations, others may not). */
static size_t uv_round_up_64(size_t bytes) {
    return (bytes + 63) & ~(size_t)63;
}

/* ============================================================
 * ALLOC unary vector scratch
 * Sizes the sign plane and n_planes magnitude planes for `dim`
 * elements; scale is initialized to 0 until the first quantize.
 * ============================================================ */
static void uv_alloc(UnaryVec *uv, int dim, int n_planes) {
    int chunks = (dim + 63) / 64;
    uv->dim = dim;
    uv->chunks = chunks;
    uv->n_planes = n_planes;
    /* chunks * 8 bytes is not necessarily a multiple of the 64-byte
     * alignment, so round up to keep aligned_alloc well-defined. */
    uv->sign = (uint64_t *)aligned_alloc(64, uv_round_up_64(chunks * sizeof(uint64_t)));
    uv->planes = (uint64_t *)aligned_alloc(64, uv_round_up_64((size_t)n_planes * chunks * sizeof(uint64_t)));
    uv->scale = 0.0f;
}
|
| 310 |
+
|
| 311 |
+
/* ============================================================
|
| 312 |
+
* ATTENTION (using pure unary for projections)
|
| 313 |
+
* ============================================================ */
|
| 314 |
+
/* ============================================================
 * ATTENTION (using pure unary for projections)
 * Reads the pre-normed hidden state from m->normed and leaves the
 * attention block output in m->attn_out for the caller's residual
 * add. Clobbers m->normed as scratch for the O-projection output.
 * ============================================================ */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads; /* GQA group size */

    /* Quantize normed hidden to unary */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
        m->uv_normed.sign, m->uv_normed.planes, &m->uv_normed.scale);

    /* Q, K, V projections - PURE UNARY */
    pure_unary_matvec(&layer->q_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->q_float, m->acc_buf);
    pure_unary_matvec(&layer->k_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->k_float, m->acc_buf);
    pure_unary_matvec(&layer->v_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->v_float, m->acc_buf);

    /* QK-Norm (per head) — only when the checkpoint provides weights */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm_head(m->q_float + h * c->head_dim, layer->q_norm,
                m->q_float + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm_head(m->k_float + h * c->head_dim, layer->k_norm,
                m->k_float + h * c->head_dim, c->head_dim);
    }

    /* RoPE */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Store K, V to cache */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
            m->k_float + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
            m->v_float + h * c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention scores + weighted sum (O(seq × head_dim), not O(dim²)) */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv;
        float *q_head = m->q_float + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        /* Causal attention: only positions 0..pos are visible */
        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++)
                dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue; /* skip negligible weights */
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++)
                out_head[d] += w * v_cached[d];
        }
    }

    /* O projection - quantize attn_out, then pure unary.
     * NOTE(review): uv_attn is heap-allocated and freed on every token;
     * it could live in Model scratch like the other UnaryVecs. */
    int o_in = c->n_heads * c->head_dim;
    UnaryVec uv_attn;
    uv_alloc(&uv_attn, o_in, c->a_planes);
    quantize_to_unary(m->attn_out, o_in, c->a_planes,
        uv_attn.sign, uv_attn.planes, &uv_attn.scale);

    /* Temp buffer for O projection output */
    float *o_out = m->normed; /* reuse normed buffer */
    pure_unary_matvec(&layer->o_proj,
        uv_attn.sign, uv_attn.planes, uv_attn.scale, c->a_planes,
        o_out, m->acc_buf);

    /* Copy o_out to where caller expects it (normed acts as temp).
     * NOTE(review): copies c->hidden floats into attn_out, which was sized
     * n_heads*head_dim — assumes hidden == n_heads*head_dim; confirm. */
    memcpy(m->attn_out, o_out, c->hidden * sizeof(float));

    free(uv_attn.sign);
    free(uv_attn.planes);
}
|
| 407 |
+
|
| 408 |
+
/* ============================================================
|
| 409 |
+
* MLP (using pure unary for all projections)
|
| 410 |
+
* ============================================================ */
|
| 411 |
+
/* ============================================================
 * MLP (using pure unary for all projections)
 * SwiGLU: down( SiLU(gate(x)) * up(x) ). Reads m->normed and
 * leaves the result back in m->normed for the caller's residual add.
 * ============================================================ */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];

    /* Quantize normed input */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, &m->uv_mlp_in.scale);

    /* Gate and Up projections - PURE UNARY */
    pure_unary_matvec(&layer->gate_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->gate_float, m->acc_buf);
    pure_unary_matvec(&layer->up_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->up_float, m->acc_buf);

    /* SiLU(gate) * up - O(inter) float op */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Quantize for down projection */
    quantize_to_unary(m->mlp_act, c->inter, c->a_planes,
        m->uv_mlp_act.sign, m->uv_mlp_act.planes, &m->uv_mlp_act.scale);

    /* Down projection - PURE UNARY */
    pure_unary_matvec(&layer->down_proj,
        m->uv_mlp_act.sign, m->uv_mlp_act.planes, m->uv_mlp_act.scale, c->a_planes,
        m->normed, m->acc_buf); /* reuse normed as output */
}
|
| 439 |
+
|
| 440 |
+
/* ============================================================
|
| 441 |
+
* FORWARD ONE TOKEN
|
| 442 |
+
* ============================================================ */
|
| 443 |
+
/* ============================================================
 * FORWARD ONE TOKEN
 * Runs the full transformer stack for token_id at position pos
 * and returns the model-owned logits buffer.
 * ============================================================ */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Pre-attention norm */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);

        /* Attention (quantizes normed internally, outputs to attn_out) */
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);

        /* Post-attention norm */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);

        /* MLP (quantizes normed internally, outputs to normed) */
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    /* Final norm */
    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    /* LM head - FP16 for now (vocab projection is O(vocab × hidden), not repeated per-layer) */
    if (c->tie_embeddings) {
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);
    }
    /* NOTE(review): when tie_embeddings is 0 nothing writes m->logits, so
     * callers see stale values — Model has no untied lm_head weight. Confirm
     * every converted checkpoint sets tie_embeddings. */

    return m->logits;
}
|
| 474 |
+
|
| 475 |
+
/* ============================================================
|
| 476 |
+
* SAMPLING
|
| 477 |
+
* ============================================================ */
|
| 478 |
+
/* ============================================================
 * SAMPLING
 * ============================================================ */
/* Top-p (nucleus) sampling. Applies temperature, softmaxes `logits`
 * IN PLACE (they become probabilities), greedily collects the
 * highest-probability tokens until their cumulative mass reaches
 * top_p — capped at TOP_K_CAP candidates — then draws one of the
 * kept tokens proportionally to probability.
 * Returns the chosen token id. Uses the global rand() state. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    /* Hard cap on nucleus size; previously a bare magic number. */
    enum { TOP_K_CAP = 40 };

    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab); /* logits now hold probabilities */

    int n_keep = 0;
    float cum = 0.0f;
    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: each pass moves the next-largest
     * probability (with its token index) into slot n_keep. */
    while (cum < top_p && n_keep < vocab) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp;
        int ti = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = ti;
        cum += probs[n_keep];
        n_keep++;
        if (n_keep >= TOP_K_CAP) break;
    }

    /* Renormalize over the kept set and draw proportionally. */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0]; /* fallback: most probable token */
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i];
        if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
|
| 515 |
+
|
| 516 |
+
/* Autoregressive generation: consume the prompt, then sample up to
 * max_new_tokens tokens into out_tokens. Returns the number of tokens
 * written (an EOS token, if hit, is included in the count).
 * temperature <= 0 selects greedy argmax decoding. */
int generate(
    Model *m,
    const int *prompt_ids, int prompt_len,
    int *out_tokens, int max_new_tokens,
    float temperature, float top_p, int eos_token
) {
    /* NOTE(review): reseeds the global PRNG on every call — repeated
     * calls within the same second reuse one seed. */
    srand(time(NULL));

    /* Prefill: after this loop m->logits belong to the last prompt token */
    for (int i = 0; i < prompt_len; i++)
        forward_token(m, prompt_ids[i], i);

    int pos = prompt_len;
    int generated = 0;

    for (int t = 0; t < max_new_tokens; t++) {
        int next;
        if (temperature <= 0) {
            /* Greedy: argmax over current logits */
            next = 0;
            for (int i = 1; i < m->cfg.vocab; i++)
                if (m->logits[i] > m->logits[next]) next = i;
        } else {
            /* sample_top_p softmaxes m->logits in place; safe because the
             * next forward_token overwrites them */
            next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
        }

        out_tokens[t] = next;
        generated++;
        if (next == eos_token) break;

        forward_token(m, next, pos);
        pos++;
    }
    return generated;
}
|
| 549 |
+
|
| 550 |
+
/* ============================================================
|
| 551 |
+
* ALLOCATION
|
| 552 |
+
* ============================================================ */
|
| 553 |
+
/* ============================================================
 * ALLOCATION
 * Builds a Model shell: config, KV cache, and all scratch buffers.
 * Weights are attached afterwards via the setter functions below.
 * NOTE(review): allocations are unchecked (NULL on OOM would crash
 * later), and C11 requires aligned_alloc sizes to be multiples of
 * the 64-byte alignment — true only for typical dims; glibc is
 * tolerant but other libcs may not be.
 * ============================================================ */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* KV cache is pre-sized for the full MAX_SEQ context */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    /* normed doubles as MLP output scratch, so size for max(inter, hidden) */
    m->normed = (float *)aligned_alloc(64, (inter > hidden ? inter : hidden) * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->acc_buf = (int *)aligned_alloc(64, (inter > vocab ? inter : vocab) * sizeof(int));

    /* Unary vector scratch */
    uv_alloc(&m->uv_normed, hidden, a_planes);
    uv_alloc(&m->uv_mlp_in, hidden, a_planes);
    uv_alloc(&m->uv_mlp_act, inter, a_planes);

    size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);
    printf("PURE UNARY ENGINE\n");
    printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
        hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf(" Weight planes: %d, Activation planes: %d\n", w_planes, a_planes);
    printf(" Plane pairs per matvec element: %d\n", w_planes * a_planes);
    printf(" KV cache: %zu MB\n", kv_mb);
    printf(" Float ops: RMSNorm, SiLU, Softmax, RoPE, residual (all O(dim))\n");
    printf(" Integer ops: ALL matmuls (O(dim²) — the actual bottleneck)\n");

    return m;
}
|
| 605 |
+
|
| 606 |
+
/* Weight setters (same interface as v2) */
|
| 607 |
+
/* Point the model at an externally-owned FP16 embedding table (not copied). */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
|
| 608 |
+
/* Copy the final RMSNorm weights into the model's own buffer. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
|
| 609 |
+
|
| 610 |
+
/* Attach the input-layernorm and post-attention-layernorm weight vectors
 * for layer l. Pointers are stored directly (no copy); the caller must
 * keep the buffers alive for the model's lifetime. */
void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
    m->layers[l].input_norm = in_norm;
    m->layers[l].post_norm = post_norm;
}
|
| 614 |
+
|
| 615 |
+
/* Attach the per-head Q/K norm weight vectors for layer l (used by
 * checkpoints with QK-Norm, e.g. Qwen3). Pointers are stored, not copied. */
void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
    m->layers[l].q_norm = q_norm;
    m->layers[l].k_norm = k_norm;
}
|
| 619 |
+
|
| 620 |
+
/*
 * Populate a UnaryWeight descriptor from raw quantized-weight buffers.
 * sign / planes / scales are packed buffers produced by the converter
 * (layout defined there); they are referenced, not copied.
 * chunks caches the number of 64-bit words covering one in_dim row.
 */
static void init_unary_weight(
    UnaryWeight *uw,
    uint64_t *sign, uint64_t *planes, float *scales,
    int out_dim, int in_dim, int n_planes
) {
    uw->sign_bits = sign;
    uw->mag_planes = planes;
    uw->scales = scales;
    uw->out_dim = out_dim;
    uw->in_dim = in_dim;
    uw->n_planes = n_planes;
    /* 64-bit words per input row, rounded up */
    uw->chunks = (in_dim + 63) / 64;
}
|
| 633 |
+
|
| 634 |
+
/*
 * Install all seven quantized projection matrices for layer l, in the
 * fixed order q, k, v, o, gate, up, down. Each projection supplies
 * (sign bits, magnitude planes, scales, out_dim, in_dim); all share the
 * same plane count n_planes. Buffers are referenced, not copied.
 */
void layer_set_linears(
    Model *m, int l,
    uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
    uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
    uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
    uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
    uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
    uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
    uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
    int n_planes
) {
    init_unary_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
    init_unary_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
    init_unary_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
    init_unary_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
    init_unary_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
    init_unary_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
    init_unary_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
}
|
| 653 |
+
|
| 654 |
+
/* Zero both K and V cache buffers (n_layers * MAX_SEQ * n_kv_heads *
 * head_dim floats each) so the attention cache holds no stale state. */
void model_reset_cache(Model *m) {
    size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
    memset(m->k_cache, 0, kv_size * sizeof(float));
    memset(m->v_cache, 0, kv_size * sizeof(float));
}
|
run_convert.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os, json, numpy as np, time, sys
from pathlib import Path
from safetensors import safe_open
import torch
sys.path.insert(0, "/root/ternary_engine")
from convert import quantize_weight_matrix

# Source HF checkpoint, destination directory, and the ternarization
# threshold factor forwarded to quantize_weight_matrix.
model_dir = "/root/ternary_engine/deepseek-r1-1.5b-hf"
output_dir = "/root/ternary_engine/deepseek-r1-1.5b-ternary"
alpha = 0.7

os.makedirs(output_dir, exist_ok=True)

# Read every tensor from every safetensors shard into memory as FP32 numpy.
tensors = {}
for shard in sorted(Path(model_dir).glob("*.safetensors")):
    print("Loading " + shard.name)
    with safe_open(str(shard), framework="pt") as handle:
        for key in handle.keys():
            tensors[key] = handle.get_tensor(key).float().numpy()

print("Loaded " + str(len(tensors)) + " tensors")

# Architecture constants written next to the converted weights so the
# inference engine can configure itself without re-reading the HF config.
config = {
    "hidden_size": 1536, "intermediate_size": 8960,
    "num_attention_heads": 12, "num_key_value_heads": 2,
    "num_hidden_layers": 28, "vocab_size": 151936,
    "head_dim": 128, "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6, "alpha": alpha,
}

ternary_manifest = {}
fp16_manifest = {}

# Only the seven 2-D projection matrices per layer are ternarized;
# everything else (embeddings, norms, biases) is stored as FP16.
linear_suffixes = ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight']

ternary_bytes = 0
orig_bytes = 0

for key, weight in tensors.items():
    prefix = os.path.join(output_dir, key.replace(".", "_"))

    if len(weight.shape) == 2 and any(key.endswith(sfx) for sfx in linear_suffixes):
        # Ternary path: split into positive/negative bit masks plus scales.
        out_dim, in_dim = weight.shape
        orig_bytes += weight.nbytes

        started = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(weight, alpha)
        elapsed = time.time() - started

        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        nbytes = pos.nbytes + neg.nbytes + scales.nbytes
        ternary_bytes += nbytes
        ternary_manifest[key] = list(weight.shape)
        print(" T %s: %s -> %dKB (%.1fx, %.0f%% sparse, %.1fs)" % (
            key, str(weight.shape), nbytes // 1024, weight.nbytes / nbytes, sparsity * 100, elapsed))
    else:
        # FP16 path for everything that is not a ternarized linear layer.
        half = weight.astype(np.float16)
        half.tofile(prefix + ".fp16")
        fp16_manifest[key] = list(weight.shape)
        print(" F %s: %s -> %dKB" % (key, str(weight.shape), half.nbytes // 1024))

with open(os.path.join(output_dir, "config.json"), "w") as fh:
    json.dump(config, fh, indent=2)
with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
    json.dump({"ternary": ternary_manifest, "fp16": fp16_manifest}, fh, indent=2)

print("")
print("Ternary: %.1fMB (from %.1fMB FP32)" % (ternary_bytes / 1024 / 1024, orig_bytes / 1024 / 1024))
print("DONE")
|
run_log_unary.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Log-unary model loader. (c) 2026 OpenTransformers Ltd"""
|
| 3 |
+
import ctypes, numpy as np, os, sys, json, time
|
| 4 |
+
|
| 5 |
+
def load_and_run(model_dir, prompt, max_tokens=32, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a log-unary converted checkpoint and run generation.

    model_dir holds config.json / manifest.json plus the converted weight
    files; prompt is tokenized with the checkpoint's own tokenizer and fed
    to the C engine (log_unary_engine.so). a_planes selects how many
    activation log-planes the engine uses at runtime. Returns None; all
    output goes to stdout.
    """
    # NOTE(review): json.load(open(...)) leaves the handles to be closed by
    # the GC — consider a with-statement.
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    w_planes = manifest["n_planes"]  # weight log-planes fixed at conversion time
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie = 1 if config.get("tie_word_embeddings", False) else 0

    # With k log-planes the magnitude spans 2^k - 1 levels per sign, i.e.
    # 2*(2^k - 1) + 1 distinct values including zero — matches the prints.
    w_max = (1 << w_planes) - 1
    a_max = (1 << a_planes) - 1
    print(f"Config: {n_layers}L hidden={hidden} inter={inter} heads={n_heads}/{n_kv_heads}")
    print(f"Weight: {w_planes} log-planes ({2*w_max+1} levels)")
    print(f"Activation: {a_planes} log-planes ({2*a_max+1} levels)")
    print(f"Plane pairs: {w_planes * a_planes}")

    # The engine shared object is expected next to the model directory.
    engine = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "log_unary_engine.so")
    lib = ctypes.CDLL(engine)

    # ctypes prototypes — must match the declarations in log_unary_engine.c.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [ctypes.c_int]*2 + [ctypes.c_int]*7 + [ctypes.c_float, ctypes.c_int]
    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.c_float, ctypes.c_float, ctypes.c_int]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections x (sign, planes, scales, out_dim, in_dim) + n_planes
    lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + \
        [u64p, u64p, f32p, ctypes.c_int, ctypes.c_int] * 7 + [ctypes.c_int]

    print("Allocating...")
    model = lib.model_alloc(w_planes, a_planes, hidden, inter, n_heads, n_kv_heads,
                            head_dim, n_layers, vocab, rope_theta, tie)
    # Keep every numpy buffer alive while the C side holds raw pointers.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words (no conversion).
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        _refs.append(d); return d.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 on disk, widened to FP32 for the engine's float-side ops.
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        f = d.view(np.float16).astype(np.float32); _refs.append(f); return f.ctypes.data_as(f32p)

    def load_unary(name):
        # Sign bits, magnitude planes and scales of one quantized matrix.
        fn = name.replace(".","_")
        s = np.fromfile(os.path.join(model_dir, f"{fn}.sign"), dtype=np.uint64)
        p = np.fromfile(os.path.join(model_dir, f"{fn}.planes"), dtype=np.uint64)
        sc = np.fromfile(os.path.join(model_dir, f"{fn}.scales"), dtype=np.float32)
        _refs.extend([s,p,sc])
        return s.ctypes.data_as(u64p), p.ctypes.data_as(u64p), sc.ctypes.data_as(f32p)

    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    um = manifest["unary"]  # tensor name -> [out_dim, in_dim]
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l, load_f32(f"{p}.input_layernorm.weight"),
                            load_f32(f"{p}.post_attention_layernorm.weight"))
        # QK-Norm weights exist only for some checkpoints; probe the file.
        qn = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn):
            lib.layer_set_qk_norm(model, l, load_f32(f"{p}.self_attn.q_norm.weight"),
                                  load_f32(f"{p}.self_attn.k_norm.weight"))

        # Order matters: layer_set_linears expects q, k, v, o, gate, up, down.
        projs = ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"]
        args = [model, l]
        for pj in projs:
            key = f"{p}.{pj}.weight"
            s,pl,sc = load_unary(key)
            args.extend([s, pl, sc, um[key][0], um[key][1]])
        args.append(w_planes)
        lib.layer_set_linears(*args)

        if (l+1) % 12 == 0 or l == n_layers-1:
            print(f" Layer {l+1}/{n_layers}")

    print("Tokenizing...")
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tok.encode(prompt)
    print(f"Prompt: {len(ids)} tokens")

    # NOTE(review): assumes eos_token_id is a single int — some HF configs
    # store a list; verify against the target checkpoint.
    eos = config.get("eos_token_id", 151645)
    pa = (ctypes.c_int * len(ids))(*ids)
    oa = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (w={w_planes}log a={a_planes}log pairs={w_planes*a_planes})...")
    t0 = time.time()
    n = lib.generate(model, pa, len(ids), oa, max_tokens,
                     ctypes.c_float(temperature), ctypes.c_float(top_p), eos)
    dt = time.time() - t0

    text = tok.decode([oa[i] for i in range(n)], skip_special_tokens=True)
    print(f"\n=== LOG-UNARY ({n} tok in {dt:.1f}s = {n/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode: {n/dt:.2f} tok/s")
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
    # CLI: run_log_unary.py [model_dir] [prompt] [max_tokens] [a_planes]
    cli = sys.argv
    model_dir = cli[1] if len(cli) > 1 else "qwen3-4b-log-unary"
    prompt = cli[2] if len(cli) > 2 else "What is 2+2? Think step by step."
    max_tokens = int(cli[3]) if len(cli) > 3 else 32
    act_planes = int(cli[4]) if len(cli) > 4 else 4
    load_and_run(model_dir, prompt, max_tokens, a_planes=act_planes)
|
run_pure_unary.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Pure unary model loader - ALL matmuls are AND+popcount
|
| 4 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 5 |
+
"""
|
| 6 |
+
import ctypes, numpy as np, os, sys, json, time
|
| 7 |
+
|
| 8 |
+
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a pure-unary converted checkpoint and generate text from `prompt`.

    Args:
        model_dir: directory holding config.json, manifest.json and the
            converted weight files (.fp16 / .sign / .planes / .scales).
        prompt: text tokenized with the checkpoint's own tokenizer.
        max_tokens: maximum number of tokens to decode.
        temperature, top_p: sampling parameters forwarded to the C engine.
        a_planes: activation bit-planes used at runtime (weight planes are
            fixed in the manifest at conversion time).

    Returns:
        The decoded generation as a string.
    """
    # Fix: close the JSON files deterministically (previously leaked via
    # json.load(open(...))).
    with open(os.path.join(model_dir, "config.json")) as fh:
        config = json.load(fh)
    with open(os.path.join(model_dir, "manifest.json")) as fh:
        manifest = json.load(fh)
    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}")
    print(f"Weight planes: {w_planes}, Activation planes: {a_planes}")
    print(f"Plane pairs per element: {w_planes * a_planes}")
    print(f"Tied embeddings: {'yes' if tie_embeddings else 'no'}")

    # The engine shared object is expected next to the model directory.
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "pure_unary_engine.so")
    lib = ctypes.CDLL(engine_path)

    # --- ctypes prototypes (must match pure_unary_engine.c) ---
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int, ctypes.c_int,  # w_planes, a_planes
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,  # hidden, inter, n_heads, n_kv_heads
        ctypes.c_int, ctypes.c_int, ctypes.c_int,  # head_dim, n_layers, vocab
        ctypes.c_float, ctypes.c_int,  # rope_theta, tie_embeddings
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,  # prompt ids, n_prompt
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,  # output ids, max_tokens
        ctypes.c_float, ctypes.c_float, ctypes.c_int  # temperature, top_p, eos
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections x (sign, planes, scales, out_dim, in_dim) + n_planes
    lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + \
        [u64p, u64p, f32p, ctypes.c_int, ctypes.c_int] * 7 + [ctypes.c_int]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    print("Allocating model...")
    model = lib.model_alloc(
        w_planes, a_planes,
        hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta, tie_embeddings
    )

    # Keep numpy buffers alive for as long as the C engine holds pointers.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words (no conversion).
        data = np.fromfile(os.path.join(model_dir, name.replace(".", "_") + ".fp16"), dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 on disk, widened to FP32 for the engine's float-side ops.
        data = np.fromfile(os.path.join(model_dir, name.replace(".", "_") + ".fp16"), dtype=np.uint16)
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # Sign bits, magnitude planes and scales of one quantized matrix.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    print("Loading embeddings...")
    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))

    print("Loading final norm...")
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    # Fix: manifest["unary"] lookup hoisted out of the layer loop (it was
    # re-evaluated every iteration although loop-invariant).
    um = manifest["unary"]  # tensor name -> [out_dim, in_dim]
    # Order matters: layer_set_linears expects q, k, v, o, gate, up, down.
    projs = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj",
             "self_attn.o_proj", "mlp.gate_proj", "mlp.up_proj", "mlp.down_proj"]
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l,
            load_f32(f"{p}.input_layernorm.weight"),
            load_f32(f"{p}.post_attention_layernorm.weight"))

        # QK-Norm (Qwen3) — optional; probe for the converted file.
        qn_path = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn_path):
            lib.layer_set_qk_norm(model, l,
                load_f32(f"{p}.self_attn.q_norm.weight"),
                load_f32(f"{p}.self_attn.k_norm.weight"))

        # Build the long layer_set_linears argument list in one pass
        # instead of seven near-identical copy-pasted stanzas.
        args = [model, l]
        for pj in projs:
            key = f"{p}.{pj}.weight"
            sign, planes, scales = load_unary(key)
            args.extend([sign, planes, scales, um[key][0], um[key][1]])
        args.append(w_planes)
        lib.layer_set_linears(*args)

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens -> {repr(prompt[:60])}")

    # NOTE(review): assumes eos_token_id is a single int — some HF configs
    # store a list; verify against the target checkpoint.
    eos_token = config.get("eos_token_id", 151645)
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p}, a_planes={a_planes})...")
    t0 = time.time()
    n_gen = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p), eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_gen)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    print(f"\n=== PURE UNARY Output ({n_gen} tokens in {dt:.1f}s = {n_gen/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode speed: {n_gen/dt:.2f} tok/s")
    return text
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
    # CLI: run_pure_unary.py [model_dir] [prompt] [max_tokens] [a_planes]
    argv = sys.argv[1:]
    model_dir = argv[0] if len(argv) > 0 else "qwen3-4b-thinking-unary"
    prompt = argv[1] if len(argv) > 1 else "What is 2+2? Think step by step."
    max_tokens = int(argv[2]) if len(argv) > 2 else 32
    a_planes = int(argv[3]) if len(argv) > 3 else 4
    load_and_run(model_dir, prompt, max_tokens=max_tokens, a_planes=a_planes)
|
run_qwen3_4b.py
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unary model loader for Qwen3-4B-Thinking.
|
| 4 |
+
Loads converted weights and runs inference via unary_engine_v2.so
|
| 5 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 6 |
+
"""
|
| 7 |
+
import ctypes, numpy as np, os, sys, json, time
|
| 8 |
+
|
| 9 |
+
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9):
    """Load a unary-converted Qwen3-4B checkpoint and generate from `prompt`.

    model_dir holds config.json / manifest.json plus the converted weight
    files; the prompt is tokenized with the checkpoint's own tokenizer and
    the forward pass runs entirely in the C engine (unary_engine_v2.so).
    Returns the decoded generation as a string.
    """
    # Load config
    # NOTE(review): json.load(open(...)) leaves the handles to be closed by
    # the GC — consider a with-statement.
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    n_planes = manifest["n_planes"]  # weight planes fixed at conversion time
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    has_attn_bias = 1 if config.get("attention_bias", False) else 0
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}, vocab={vocab}")
    print(f"QK-Norm: yes, Tied embeddings: {'yes' if tie_embeddings else 'no'}, n_planes={n_planes}")

    # Load C engine (expected next to the model directory)
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "unary_engine_v2.so")
    lib = ctypes.CDLL(engine_path)

    # Configure function signatures — must match unary_engine_v2.c.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int,    # n_planes
        ctypes.c_int,    # hidden
        ctypes.c_int,    # inter
        ctypes.c_int,    # n_heads
        ctypes.c_int,    # n_kv_heads
        ctypes.c_int,    # head_dim
        ctypes.c_int,    # n_layers
        ctypes.c_int,    # vocab
        ctypes.c_float,  # rope_theta
        ctypes.c_int,    # has_attn_bias
        ctypes.c_int,    # tie_embeddings
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,  # prompt ids, n_prompt
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,  # output ids, max_tokens
        ctypes.c_float, ctypes.c_float, ctypes.c_int  # temperature, top_p, eos
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    # model_set_lm_head and layer_set_bias are prototyped here but not
    # called in this loader (this checkpoint path does not use them).
    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.model_set_lm_head.argtypes = [ctypes.c_void_p, u16p, ctypes.c_int, ctypes.c_int]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_linears.argtypes = [
        ctypes.c_void_p, ctypes.c_int,
        # q: sign, planes, scales, out, in
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # k
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # v
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # o
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # gate
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # up
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # down
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        ctypes.c_int,  # n_planes
    ]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    # Allocate model
    print("Allocating model...")
    model = lib.model_alloc(
        n_planes, hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta,
        has_attn_bias, tie_embeddings
    )

    # Keep references to prevent GC while the C side holds raw pointers.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words (no conversion).
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32_from_fp16(name):
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        # Convert FP16 -> FP32
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # Sign bits, magnitude planes and scales of one quantized matrix.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    # Load embeddings
    print("Loading embeddings...")
    embed_ptr = load_fp16("model.embed_tokens.weight")
    lib.model_set_embed(model, embed_ptr)

    # Load final norm
    print("Loading final norm...")
    fnorm_ptr = load_f32_from_fp16("model.norm.weight")
    lib.model_set_final_norm(model, fnorm_ptr)

    # Load layers
    print(f"Loading {n_layers} layers...")
    for l in range(n_layers):
        prefix = f"model.layers.{l}"

        # Norms
        in_norm = load_f32_from_fp16(f"{prefix}.input_layernorm.weight")
        post_norm = load_f32_from_fp16(f"{prefix}.post_attention_layernorm.weight")
        lib.layer_set_norms(model, l, in_norm, post_norm)

        # QK-Norm (loaded unconditionally — Qwen3 checkpoints always have it)
        q_norm = load_f32_from_fp16(f"{prefix}.self_attn.q_norm.weight")
        k_norm = load_f32_from_fp16(f"{prefix}.self_attn.k_norm.weight")
        lib.layer_set_qk_norm(model, l, q_norm, k_norm)

        # Linear layers
        q_s, q_p, q_sc = load_unary(f"{prefix}.self_attn.q_proj.weight")
        k_s, k_p, k_sc = load_unary(f"{prefix}.self_attn.k_proj.weight")
        v_s, v_p, v_sc = load_unary(f"{prefix}.self_attn.v_proj.weight")
        o_s, o_p, o_sc = load_unary(f"{prefix}.self_attn.o_proj.weight")
        g_s, g_p, g_sc = load_unary(f"{prefix}.mlp.gate_proj.weight")
        u_s, u_p, u_sc = load_unary(f"{prefix}.mlp.up_proj.weight")
        d_s, d_p, d_sc = load_unary(f"{prefix}.mlp.down_proj.weight")

        # Dims from manifest (tensor name -> [out_dim, in_dim])
        q_shape = manifest["unary"][f"{prefix}.self_attn.q_proj.weight"]
        k_shape = manifest["unary"][f"{prefix}.self_attn.k_proj.weight"]
        v_shape = manifest["unary"][f"{prefix}.self_attn.v_proj.weight"]
        o_shape = manifest["unary"][f"{prefix}.self_attn.o_proj.weight"]
        g_shape = manifest["unary"][f"{prefix}.mlp.gate_proj.weight"]
        u_shape = manifest["unary"][f"{prefix}.mlp.up_proj.weight"]
        d_shape = manifest["unary"][f"{prefix}.mlp.down_proj.weight"]

        lib.layer_set_linears(
            model, l,
            q_s, q_p, q_sc, q_shape[0], q_shape[1],
            k_s, k_p, k_sc, k_shape[0], k_shape[1],
            v_s, v_p, v_sc, v_shape[0], v_shape[1],
            o_s, o_p, o_sc, o_shape[0], o_shape[1],
            g_s, g_p, g_sc, g_shape[0], g_shape[1],
            u_s, u_p, u_sc, u_shape[0], u_shape[1],
            d_s, d_p, d_sc, d_shape[0], d_shape[1],
            n_planes
        )

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing prompt...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens")

    # NOTE(review): assumes eos_token_id is a single int — some HF configs
    # store a list; verify against the target checkpoint.
    eos_token = config.get("eos_token_id", 151645)

    # Generate
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p})...")
    t0 = time.time()
    n_generated = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p),
        eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_generated)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    # dt covers prefill + decode, so the "total" rate includes prompt tokens.
    total_tokens = len(input_ids) + n_generated
    print(f"\n=== Output ({n_generated} tokens in {dt:.1f}s = {n_generated/dt:.1f} tok/s) ===")
    print(text)
    print(f"\nPrefill: {len(input_ids)} tokens, Decode: {n_generated} tokens")
    print(f"Total time: {dt:.1f}s, Speed: {total_tokens/dt:.1f} tok/s total, {n_generated/dt:.1f} tok/s decode")

    return text
|
| 216 |
+
|
| 217 |
+
if __name__ == "__main__":
    # CLI: script.py [model_dir] [prompt] [max_tokens]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-unary"
    prompt = argv[2] if len(argv) > 2 else "What is 2+2? Think step by step."
    max_tokens = int(argv[3]) if len(argv) > 3 else 64
    load_and_run(model_dir, prompt, max_tokens=max_tokens)
|
server.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
OpenAI-compatible API server for Ternary Transformer Engine.
|
| 4 |
+
Drop-in replacement for llama-server.
|
| 5 |
+
|
| 6 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
import time
|
| 11 |
+
import threading
|
| 12 |
+
from http.server import HTTPServer, BaseHTTPRequestHandler
|
| 13 |
+
from inference import TernaryQwen, Tokenizer, load_kernel
|
| 14 |
+
import os
|
| 15 |
+
|
| 16 |
+
# Configuration via environment variables; defaults match the repo layout.
MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))

# Load the native kernel, model weights and tokenizer once at import time,
# so the HTTP handlers only run inference.
print("Loading ternary kernel...")
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)

print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)

# Serializes generation across requests — presumably the engine is not
# re-entrant (handlers take this lock around model.generate); confirm in inference.py.
lock = threading.Lock()
print("Ready!")
|
| 32 |
+
|
| 33 |
+
class Handler(BaseHTTPRequestHandler):
    """OpenAI-style HTTP handler backed by the ternary engine.

    Endpoints:
      POST /v1/chat/completions -- blocking (non-streaming) chat completion
      GET  /health              -- liveness probe
    """

    def do_POST(self):
        if self.path == "/v1/chat/completions":
            length = int(self.headers.get("Content-Length", 0))
            # Fix: a malformed body used to raise out of json.loads and kill
            # the request with a bare traceback; answer 400 instead.
            try:
                body = json.loads(self.rfile.read(length))
            except (json.JSONDecodeError, UnicodeDecodeError):
                self.send_response(400)
                self.send_header("Content-Type", "application/json")
                self.end_headers()
                self.wfile.write(b'{"error":"invalid JSON body"}')
                return

            messages = body.get("messages", [])
            max_tokens = body.get("max_tokens", 256)
            temperature = body.get("temperature", 0.6)
            top_p = body.get("top_p", 0.95)

            # Build prompt from the chat messages.
            prompt = tokenizer.apply_chat_template(messages)
            input_ids = tokenizer.encode(prompt)

            # Generate; the lock serializes access to the shared model.
            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p
                )

            text = tokenizer.decode(gen_ids)

            # OpenAI chat-completion response shape, plus llama-server-style
            # "timings" so existing dashboards keep working.
            response = {
                "id": f"chatcmpl-ternary-{int(time.time())}",
                "object": "chat.completion",
                "created": int(time.time()),
                "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
                "choices": [{
                    "index": 0,
                    "message": {"role": "assistant", "content": text},
                    "finish_reason": "stop"
                }],
                "usage": {
                    "prompt_tokens": len(input_ids),
                    "completion_tokens": stats["tokens_generated"],
                    "total_tokens": len(input_ids) + stats["tokens_generated"]
                },
                "timings": {
                    "prompt_n": stats["prefill_tokens"],
                    "prompt_ms": stats["prefill_ms"],
                    "predicted_n": stats["tokens_generated"],
                    "predicted_ms": stats["decode_ms"],
                    "predicted_per_second": stats["tok_per_sec"],
                }
            }

            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(json.dumps(response).encode())
        else:
            self.send_response(404)
            self.end_headers()

    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status":"ok","engine":"ternary-avx512"}')
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Silence per-request stderr logging from BaseHTTPRequestHandler.
        pass
|
| 103 |
+
|
| 104 |
+
if __name__ == "__main__":
    # Blocking single-threaded HTTP server; requests are handled in turn.
    httpd = HTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    httpd.serve_forever()
|
ternary_kernel.c
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Ternary Neural Network Kernel - AVX-512 optimized
|
| 3 |
+
*
|
| 4 |
+
* Weights are stored as two bitplanes per row:
|
| 5 |
+
* pos_mask: bit=1 where weight = +1
|
| 6 |
+
* neg_mask: bit=1 where weight = -1
|
| 7 |
+
* (both 0 = weight is 0)
|
| 8 |
+
*
|
| 9 |
+
* Matmul becomes: y[i] = sum(x[j] where pos) - sum(x[j] where neg)
|
| 10 |
+
* No multiplication at all — just masked add/subtract.
|
| 11 |
+
*
|
| 12 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
#include <immintrin.h>
|
| 16 |
+
#include <stdint.h>
|
| 17 |
+
#include <stdlib.h>
|
| 18 |
+
#include <string.h>
|
| 19 |
+
#include <math.h>
|
| 20 |
+
#include <stdio.h>
|
| 21 |
+
|
| 22 |
+
/* ============================================================
|
| 23 |
+
* Core ternary matmul: y = W_ternary @ x
|
| 24 |
+
*
|
| 25 |
+
* W stored as bitplanes: pos_bits[out_dim][ceil(in_dim/64)] uint64
|
| 26 |
+
* neg_bits[out_dim][ceil(in_dim/64)] uint64
|
| 27 |
+
* x: float32[in_dim]
|
| 28 |
+
* y: float32[out_dim]
|
| 29 |
+
* bias: float32[out_dim] or NULL
|
| 30 |
+
* scale: float32 per-row scale factor (to recover magnitude)
|
| 31 |
+
* ============================================================ */
|
| 32 |
+
/*
 * y = W @ x for a ternary W stored as per-row sign bitplanes.
 * pos_bits/neg_bits: [out_dim * chunks] uint64 words, 64 weights per word.
 * scales: [out_dim] per-row magnitude; x: [in_dim]; y: [out_dim].
 */
void ternary_matvec_avx512(
    const uint64_t *pos_bits, /* [out_dim * chunks] */
    const uint64_t *neg_bits, /* [out_dim * chunks] */
    const float *scales, /* [out_dim] per-row scale */
    const float *x, /* [in_dim] input activations */
    float *y, /* [out_dim] output */
    int out_dim,
    int in_dim
) {
    int chunks = (in_dim + 63) / 64; /* 64 weights per uint64 */

    /* Pad input to a multiple of 16 floats for aligned AVX-512 loads.
     * in_padded * sizeof(float) is a multiple of 64, as aligned_alloc requires. */
    int in_padded = (in_dim + 15) & ~15;
    float *x_pad = (float *)aligned_alloc(64, in_padded * sizeof(float));
    if (!x_pad) {
        /* Fix: the original dereferenced a NULL x_pad on allocation failure.
         * Fall back to a scalar path that needs no padded copy. */
        for (int i = 0; i < out_dim; i++) {
            const uint64_t *row_pos = pos_bits + (size_t)i * chunks;
            const uint64_t *row_neg = neg_bits + (size_t)i * chunks;
            float sum = 0.0f;
            for (int j = 0; j < in_dim; j++) {
                uint64_t bit = 1ULL << (j & 63);
                if (row_pos[j >> 6] & bit)      sum += x[j];
                else if (row_neg[j >> 6] & bit) sum -= x[j];
            }
            y[i] = sum * scales[i];
        }
        return;
    }
    memcpy(x_pad, x, in_dim * sizeof(float));
    memset(x_pad + in_dim, 0, (in_padded - in_dim) * sizeof(float));

    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();

        const uint64_t *row_pos = pos_bits + (size_t)i * chunks;
        const uint64_t *row_neg = neg_bits + (size_t)i * chunks;

        /* Process 64 weights at a time (4 AVX-512 ops of 16 floats each).
         * Bits past in_dim are zero by construction (see quantize_to_ternary),
         * so skipping groups beyond in_padded is safe. */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = row_pos[c];
            uint64_t nb = row_neg[c];
            int base = c * 64;

            for (int g = 0; g < 4 && (base + g * 16) < in_padded; g++) {
                int offset = base + g * 16;
                __m512 xv = _mm512_load_ps(x_pad + offset);

                /* Extract the 16 mask bits for this group. */
                __mmask16 pmask = (__mmask16)((pb >> (g * 16)) & 0xFFFF);
                __mmask16 nmask = (__mmask16)((nb >> (g * 16)) & 0xFFFF);

                /* Masked add where weight = +1, masked subtract where weight = -1. */
                acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
            }
        }

        /* Horizontal sum, then apply per-row scale to recover magnitude. */
        float sum = _mm512_reduce_add_ps(acc);
        y[i] = sum * scales[i];
    }

    free(x_pad);
}
|
| 85 |
+
|
| 86 |
+
/* ============================================================
|
| 87 |
+
* Batched version: Y = W_ternary @ X (multiple input vectors)
|
| 88 |
+
* X: [batch, in_dim], Y: [batch, out_dim]
|
| 89 |
+
* ============================================================ */
|
| 90 |
+
/* Batched ternary matmul: applies the matvec kernel to each row of X.
 * X: [batch, in_dim] row-major, Y: [batch, out_dim] row-major. */
void ternary_matmul_avx512(
    const uint64_t *pos_bits,
    const uint64_t *neg_bits,
    const float *scales,
    const float *X,
    float *Y,
    int batch,
    int out_dim,
    int in_dim
) {
    const float *x_row = X;
    float *y_row = Y;
    for (int n = 0; n < batch; n++) {
        ternary_matvec_avx512(pos_bits, neg_bits, scales,
                              x_row, y_row, out_dim, in_dim);
        x_row += in_dim;
        y_row += out_dim;
    }
}
|
| 109 |
+
|
| 110 |
+
/* ============================================================
|
| 111 |
+
* RMSNorm: y = x * (1/rms(x)) * weight
|
| 112 |
+
* ============================================================ */
|
| 113 |
+
/* RMSNorm: y[i] = x[i] * (1 / sqrt(mean(x^2) + eps)) * weight[i].
 * Vectorized main loops with scalar tails for dim % 16 leftovers. */
void rmsnorm_avx512(
    const float *x,
    const float *weight,
    float *y,
    int dim,
    float eps
) {
    /* Sum of squares. */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    /* Handle remainder */
    for (; i < dim; i++) ss += x[i] * x[i];

    float rms = 1.0f / sqrtf(ss / dim + eps);

    /* Fix: broadcast of rms is loop-invariant — hoisted out of the loop
     * (the original rebuilt it every iteration). */
    __m512 rv = _mm512_set1_ps(rms);

    /* Apply norm and weight. */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 out = _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv);
        _mm512_storeu_ps(y + i, out);
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
|
| 143 |
+
|
| 144 |
+
/* ============================================================
|
| 145 |
+
* SiLU activation: x * sigmoid(x)
|
| 146 |
+
* ============================================================ */
|
| 147 |
+
static inline float silu_scalar(float x) {
|
| 148 |
+
return x / (1.0f + expf(-x));
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
void silu_avx512(float *x, int n) {
|
| 152 |
+
/* Scalar fallback — vectorized exp is complex */
|
| 153 |
+
for (int i = 0; i < n; i++) {
|
| 154 |
+
x[i] = silu_scalar(x[i]);
|
| 155 |
+
}
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
/* ============================================================
|
| 159 |
+
* Element-wise multiply: y = a * b
|
| 160 |
+
* ============================================================ */
|
| 161 |
+
void elemwise_mul_avx512(const float *a, const float *b, float *y, int n) {
|
| 162 |
+
int i;
|
| 163 |
+
for (i = 0; i + 16 <= n; i += 16) {
|
| 164 |
+
__m512 av = _mm512_loadu_ps(a + i);
|
| 165 |
+
__m512 bv = _mm512_loadu_ps(b + i);
|
| 166 |
+
_mm512_storeu_ps(y + i, _mm512_mul_ps(av, bv));
|
| 167 |
+
}
|
| 168 |
+
for (; i < n; i++) y[i] = a[i] * b[i];
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
/* ============================================================
|
| 172 |
+
* Softmax
|
| 173 |
+
* ============================================================ */
|
| 174 |
+
void softmax(float *x, int n) {
|
| 175 |
+
float max_val = x[0];
|
| 176 |
+
for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
|
| 177 |
+
float sum = 0;
|
| 178 |
+
for (int i = 0; i < n; i++) {
|
| 179 |
+
x[i] = expf(x[i] - max_val);
|
| 180 |
+
sum += x[i];
|
| 181 |
+
}
|
| 182 |
+
float inv_sum = 1.0f / sum;
|
| 183 |
+
for (int i = 0; i < n; i++) x[i] *= inv_sum;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
/* ============================================================
|
| 187 |
+
* RoPE (Rotary Position Embedding)
|
| 188 |
+
* ============================================================ */
|
| 189 |
+
void apply_rope(
|
| 190 |
+
float *q, /* [n_heads, head_dim] */
|
| 191 |
+
float *k, /* [n_kv_heads, head_dim] */
|
| 192 |
+
int n_heads,
|
| 193 |
+
int n_kv_heads,
|
| 194 |
+
int head_dim,
|
| 195 |
+
int pos,
|
| 196 |
+
float rope_theta
|
| 197 |
+
) {
|
| 198 |
+
for (int h = 0; h < n_heads + n_kv_heads; h++) {
|
| 199 |
+
float *vec = (h < n_heads) ? q + h * head_dim : k + (h - n_heads) * head_dim;
|
| 200 |
+
for (int i = 0; i < head_dim; i += 2) {
|
| 201 |
+
float freq = 1.0f / powf(rope_theta, (float)i / head_dim);
|
| 202 |
+
float angle = pos * freq;
|
| 203 |
+
float cos_a = cosf(angle);
|
| 204 |
+
float sin_a = sinf(angle);
|
| 205 |
+
float v0 = vec[i];
|
| 206 |
+
float v1 = vec[i + 1];
|
| 207 |
+
vec[i] = v0 * cos_a - v1 * sin_a;
|
| 208 |
+
vec[i + 1] = v0 * sin_a + v1 * cos_a;
|
| 209 |
+
}
|
| 210 |
+
}
|
| 211 |
+
}
|
| 212 |
+
|
| 213 |
+
/* ============================================================
|
| 214 |
+
* Quantization: convert float weights to ternary
|
| 215 |
+
* Uses per-row threshold: threshold = alpha * mean(|w|)
|
| 216 |
+
* Returns: pos_bits, neg_bits, scales
|
| 217 |
+
* ============================================================ */
|
| 218 |
+
/* Quantize float weights to ternary {-1, 0, +1} bitplanes.
 * Per-row threshold = alpha * mean(|w|); weights at or above it become +/-1,
 * the rest 0. scales[r] is the mean |w| of the surviving weights so the
 * matvec can recover magnitudes. */
void quantize_to_ternary(
    const float *weights, /* [out_dim, in_dim] */
    uint64_t *pos_bits, /* [out_dim * chunks] output */
    uint64_t *neg_bits, /* [out_dim * chunks] output */
    float *scales, /* [out_dim] output */
    int out_dim,
    int in_dim,
    float alpha /* threshold multiplier, typically 0.7-1.0 */
) {
    int chunks = (in_dim + 63) / 64;

    for (int r = 0; r < out_dim; r++) {
        const float *w = weights + (size_t)r * in_dim;

        /* Threshold from the row's mean absolute value. */
        float abs_total = 0;
        for (int j = 0; j < in_dim; j++) abs_total += fabsf(w[j]);
        float thr = alpha * (abs_total / in_dim);

        /* Scale = mean |w| over the weights that survive the threshold. */
        float kept_sum = 0;
        int kept = 0;
        for (int j = 0; j < in_dim; j++) {
            float mag = fabsf(w[j]);
            if (mag >= thr) {
                kept_sum += mag;
                kept++;
            }
        }
        scales[r] = kept ? (kept_sum / kept) : 1.0f;

        /* Pack signs into bitplanes, 64 weights per word; bits past
         * in_dim stay zero. */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = 0, nb = 0;
            int start = c * 64;
            int stop = start + 64;
            if (stop > in_dim) stop = in_dim;
            for (int j = start; j < stop; j++) {
                if (w[j] >= thr)       pb |= 1ULL << (j - start);
                else if (w[j] <= -thr) nb |= 1ULL << (j - start);
            }
            pos_bits[(size_t)r * chunks + c] = pb;
            neg_bits[(size_t)r * chunks + c] = nb;
        }
    }
}
|
test_logunary
ADDED
|
Binary file (26.3 kB). View file
|
|
|
test_logunary.c
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* Log-Unary Tensor Tests
|
| 3 |
+
* Benchmarks accuracy and speed of native base-1 log-encoded tensors
|
| 4 |
+
*/
|
| 5 |
+
#include <stdio.h>
|
| 6 |
+
#include <stdlib.h>
|
| 7 |
+
#include <math.h>
|
| 8 |
+
#include <time.h>
|
| 9 |
+
|
| 10 |
+
/* Forward declarations from library */
|
| 11 |
+
typedef struct LogUnaryTensor LogUnaryTensor;
|
| 12 |
+
typedef struct LogUnaryMatrix LogUnaryMatrix;
|
| 13 |
+
typedef struct { double total_and_ops, total_popcount_ops, wall_time_s, elements_per_sec, gops; } BenchResult;
|
| 14 |
+
typedef struct { float max_error, mean_error, cosine_sim, snr_db; } AccuracyResult;
|
| 15 |
+
|
| 16 |
+
extern LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias);
|
| 17 |
+
extern LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias);
|
| 18 |
+
extern void lut_free(LogUnaryTensor *t);
|
| 19 |
+
extern void lum_free(LogUnaryMatrix *m);
|
| 20 |
+
extern void lut_from_float(LogUnaryTensor *t, const float *x);
|
| 21 |
+
extern void lut_to_float(const LogUnaryTensor *t, float *out);
|
| 22 |
+
extern void lum_from_float(LogUnaryMatrix *m, const float *data);
|
| 23 |
+
extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
|
| 24 |
+
extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
|
| 25 |
+
extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);
|
| 26 |
+
extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
|
| 27 |
+
extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);
|
| 28 |
+
|
| 29 |
+
/* Test matvec correctness against float reference */
|
| 30 |
+
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
|
| 31 |
+
printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);
|
| 32 |
+
|
| 33 |
+
/* Random float matrix and vector */
|
| 34 |
+
float *M_float = (float *)malloc((size_t)rows * cols * sizeof(float));
|
| 35 |
+
float *x_float = (float *)malloc(cols * sizeof(float));
|
| 36 |
+
float *y_ref = (float *)calloc(rows, sizeof(float));
|
| 37 |
+
float *y_lut = (float *)malloc(rows * sizeof(float));
|
| 38 |
+
|
| 39 |
+
srand(42);
|
| 40 |
+
for (size_t i = 0; i < (size_t)rows * cols; i++) {
|
| 41 |
+
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 42 |
+
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 43 |
+
M_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
|
| 44 |
+
}
|
| 45 |
+
for (int i = 0; i < cols; i++) {
|
| 46 |
+
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 47 |
+
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 48 |
+
x_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
/* Float reference matmul */
|
| 52 |
+
for (int i = 0; i < rows; i++)
|
| 53 |
+
for (int j = 0; j < cols; j++)
|
| 54 |
+
y_ref[i] += M_float[(size_t)i * cols + j] * x_float[j];
|
| 55 |
+
|
| 56 |
+
/* Log-unary matmul */
|
| 57 |
+
LogUnaryMatrix *M = lum_alloc(rows, cols, planes, bias);
|
| 58 |
+
LogUnaryTensor *x = lut_alloc(cols, planes, bias);
|
| 59 |
+
LogUnaryTensor *y = lut_alloc(rows, planes, bias);
|
| 60 |
+
|
| 61 |
+
lum_from_float(M, M_float);
|
| 62 |
+
lut_from_float(x, x_float);
|
| 63 |
+
lum_matvec(M, x, y);
|
| 64 |
+
lut_to_float(y, y_lut);
|
| 65 |
+
|
| 66 |
+
/* Compare */
|
| 67 |
+
float dot = 0, na = 0, nb = 0, max_err = 0;
|
| 68 |
+
for (int i = 0; i < rows; i++) {
|
| 69 |
+
dot += y_ref[i] * y_lut[i];
|
| 70 |
+
na += y_ref[i] * y_ref[i];
|
| 71 |
+
nb += y_lut[i] * y_lut[i];
|
| 72 |
+
float err = fabsf(y_ref[i] - y_lut[i]);
|
| 73 |
+
if (err > max_err) max_err = err;
|
| 74 |
+
}
|
| 75 |
+
float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
|
| 76 |
+
|
| 77 |
+
float noise = 0;
|
| 78 |
+
for (int i = 0; i < rows; i++) {
|
| 79 |
+
float e = y_ref[i] - y_lut[i];
|
| 80 |
+
noise += e * e;
|
| 81 |
+
}
|
| 82 |
+
float snr = 10.0f * log10f(na / (noise + 1e-10f));
|
| 83 |
+
|
| 84 |
+
printf(" Cosine similarity: %.6f\n", cosine);
|
| 85 |
+
printf(" SNR: %.1f dB\n", snr);
|
| 86 |
+
printf(" Max abs error: %.4f (ref max: %.4f)\n", max_err, sqrtf(na / rows));
|
| 87 |
+
|
| 88 |
+
/* Show first few values */
|
| 89 |
+
printf(" First 5 values:\n");
|
| 90 |
+
for (int i = 0; i < 5 && i < rows; i++)
|
| 91 |
+
printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
|
| 92 |
+
|
| 93 |
+
lum_free(M); lut_free(x); lut_free(y);
|
| 94 |
+
free(M_float); free(x_float); free(y_ref); free(y_lut);
|
| 95 |
+
}
|
| 96 |
+
|
| 97 |
+
/* Test driver: roundtrip accuracy, matvec correctness vs. float reference,
 * then throughput benchmarks across layer shapes and plane counts. */
int main() {
    /* Seed once; test_matvec_correctness re-seeds with a fixed value
     * for reproducible test matrices. */
    srand(time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;  /* bias = planes/2, matching the table header */
        AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
            np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
        "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    /* rows/cols: layer shape; wp/xp: weight/activation plane counts;
     * bias: exponent bias passed to the kernel. */
    struct { int rows; int cols; int wp; int xp; int bias; const char *label; } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    int n_configs = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n_configs; c++) {
        int iters = 3;  /* few iterations: each call already does a full matvec */
        BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias,
            r.wall_time_s * 1000,
            r.elements_per_sec / 1e6,
            r.gops,
            configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
|
test_popcount.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Test the full-unary popcount engine."""
import ctypes, numpy as np, os, time, sys
os.environ["OMP_NUM_THREADS"] = "16"

# CLI: test_popcount.py [model_dir] [n_planes]
MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-unary4"
HF_DIR = "deepseek-r1-1.5b-hf"
N_PLANES = int(sys.argv[2]) if len(sys.argv) > 2 else 4

# Declare the C ABI of the shared library so ctypes marshals correctly.
lib = ctypes.CDLL("./unary_full.so")
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
# layer_set_linears takes (model, layer) + 7 projections of
# (sign_ptr, planes_ptr, scales_ptr, out_dim, in_dim) + n_planes.
args = [ctypes.c_void_p, ctypes.c_int]
for _ in range(7):
    args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
args.append(ctypes.c_int)
lib.layer_set_linears.argtypes = args
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
    ctypes.c_void_p, ctypes.c_int,
    ctypes.c_float, ctypes.c_float, ctypes.c_int
]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]
lib.model_free.argtypes = [ctypes.c_void_p]

# Keep numpy arrays alive for the lifetime of the C model: the library holds
# raw pointers into these buffers, so GC'ing them would be use-after-free.
_refs = []
def keep(a):
    """Pin array `a` against GC and return its raw data pointer."""
    _refs.append(a)
    return a.ctypes.data

print(f"Loading model from {MODEL_DIR} (w_planes={N_PLANES})...")
m = lib.model_alloc(N_PLANES)

# Embed + final norm + lm_head
# NOTE(review): embeddings/lm_head are passed as raw uint16 (fp16 bit patterns),
# norms are widened to float32 — presumably matching the C-side expectations.
e = np.fromfile(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
fn = np.fromfile(os.path.join(MODEL_DIR, "model_norm_weight.fp16"), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(fn))
lm = np.fromfile(os.path.join(MODEL_DIR, "lm_head_weight.fp16"), dtype=np.uint16)
lib.model_set_lm_head(m, keep(lm), 151936, 1536)

PROJS = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
         "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]
# (out_dim, in_dim) per projection.
DIMS = {
    "self_attn_q_proj": (1536, 1536), "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536), "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536), "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(28):
    # Per-layer RMSNorm weights (fp16 on disk, fp32 for the kernel).
    in_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
    po_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(in_n), keep(po_n))

    # QKV biases.
    qb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))

    # Unary-encoded projection weights: sign word, bitplanes, per-row scales.
    pa = []
    for p in PROJS:
        base = os.path.join(MODEL_DIR, f"model_layers_{l}_{p}_weight")
        s = np.fromfile(base + ".sign", dtype=np.uint64)
        pl = np.fromfile(base + ".planes", dtype=np.uint64)
        sc = np.fromfile(base + ".scales", dtype=np.float32)
        od, id_ = DIMS[p]
        pa.extend([keep(s), keep(pl), keep(sc), od, id_])
    lib.layer_set_linears(m, l, *pa, N_PLANES)
    if (l + 1) % 7 == 0:
        print(f" Layer {l+1}/28")

print("Model loaded!")

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)

msg = [{"role": "user", "content": "What is 2+2?"}]
ids = tok.apply_chat_template(msg, add_generation_prompt=True)
arr = np.array(ids, dtype=np.int32)
out = np.zeros(30, dtype=np.int32)

lib.model_reset_cache(m)
print(f"Prompt: {len(ids)} tokens, generating 30...")
t0 = time.time()
n = lib.generate(m, arr.ctypes.data, len(ids), out.ctypes.data, 30,
                 ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
dt = time.time() - t0
text = tok.decode(out[:n].tolist(), skip_special_tokens=False)
print(f"\n=== {n} tokens, {dt:.1f}s, {n/dt:.1f} tok/s ===")
print(text)
print("===")
lib.model_free(m)
|
true_unary
ADDED
|
Binary file (29.9 kB). View file
|
|
|
true_unary.c
ADDED
|
@@ -0,0 +1,552 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
|
| 3 |
+
*
|
| 4 |
+
* Representation:
|
| 5 |
+
* A value of magnitude M is stored as M consecutive 1-bits.
|
| 6 |
+
* The number IS the count of ones.
|
| 7 |
+
* Every bit has weight exactly 1.
|
| 8 |
+
*
|
| 9 |
+
* For a vector element quantized to integer range [-K, K]:
|
| 10 |
+
* sign: 1 bit (0=positive, 1=negative)
|
| 11 |
+
* magnitude: K bit positions, first |value| are 1, rest are 0
|
| 12 |
+
*
|
| 13 |
+
* Storage layout for a vector of dim D with max magnitude K:
|
| 14 |
+
* sign: uint64[(D+63)/64] — one sign bit per element
|
| 15 |
+
* unary: uint64[K * (D+63)/64] — K bitplanes across D elements
|
| 16 |
+
* Plane p has bit j set iff |element_j| > p
|
| 17 |
+
* (thermometer = true unary in bitplane form)
|
| 18 |
+
*
|
| 19 |
+
* Multiplication: w * x = popcount of ones(w) matched with ones(x)
|
| 20 |
+
* Since every bit = 1, the dot product is JUST COUNTING.
|
| 21 |
+
* No weights, no shifts, no corrections.
|
| 22 |
+
* sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
|
| 23 |
+
* = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
|
| 24 |
+
*
|
| 25 |
+
* YES this uses more memory. A 2560-dim vector with K=32 uses:
|
| 26 |
+
* 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
|
| 27 |
+
* But the MATH IS EXACT (to quantization level).
|
| 28 |
+
*
|
| 29 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 30 |
+
*/
|
| 31 |
+
|
| 32 |
+
#define _POSIX_C_SOURCE 199309L
|
| 33 |
+
#include <immintrin.h>
|
| 34 |
+
#include <omp.h>
|
| 35 |
+
#include <stdint.h>
|
| 36 |
+
#include <stdlib.h>
|
| 37 |
+
#include <string.h>
|
| 38 |
+
#include <math.h>
|
| 39 |
+
#include <stdio.h>
|
| 40 |
+
#include <time.h>
|
| 41 |
+
|
| 42 |
+
/* ============================================================
|
| 43 |
+
* TRUE UNARY VECTOR
|
| 44 |
+
* ============================================================ */
|
| 45 |
+
/* A dense vector in true-unary (thermometer) encoding.
 * Element j's magnitude equals the number of bitplanes whose bit j is set;
 * plane p has bit j set iff |element_j| > p. Every set bit has weight 1. */
typedef struct {
    uint64_t *sign;   /* [chunks] — one sign bit per element (1 = negative) */
    uint64_t *unary;  /* [K * chunks] — K thermometer bitplanes, each bit = weight 1 */
    float scale;      /* dequant factor: real_value = (-1)^sign * plane_count * scale */
    int dim;          /* logical number of elements */
    int chunks;       /* 64-bit words per plane: (dim+63)/64 */
    int K;            /* max integer magnitude = number of unary bitplanes */
} TrueUnaryVec;
|
| 53 |
+
|
| 54 |
+
/* TRUE UNARY MATRIX — row-major */
|
| 55 |
+
/* TRUE UNARY MATRIX — row-major, same thermometer encoding as TrueUnaryVec
 * but with one quantization scale per row. */
typedef struct {
    uint64_t *sign;   /* [rows * chunks] — per-element sign bits, row-major */
    uint64_t *unary;  /* [K * rows * chunks] — plane p, row i starts at [p*rows*chunks + i*chunks] */
    float *scales;    /* [rows] — per-row dequantization scale factors */
    int rows;         /* output dimension */
    int cols;         /* input dimension */
    int chunks;       /* 64-bit words per row per plane: (cols+63)/64 */
    int K;            /* max integer magnitude per element = plane count */
} TrueUnaryMat;
|
| 64 |
+
|
| 65 |
+
/* ============================================================
|
| 66 |
+
* ALLOCATION
|
| 67 |
+
* ============================================================ */
|
| 68 |
+
TrueUnaryVec* tuv_alloc(int dim, int K) {
|
| 69 |
+
TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
|
| 70 |
+
v->dim = dim;
|
| 71 |
+
v->K = K;
|
| 72 |
+
v->chunks = (dim + 63) / 64;
|
| 73 |
+
v->scale = 1.0f;
|
| 74 |
+
v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
|
| 75 |
+
v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t));
|
| 76 |
+
memset(v->sign, 0, v->chunks * sizeof(uint64_t));
|
| 77 |
+
memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t));
|
| 78 |
+
return v;
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
|
| 82 |
+
TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
|
| 83 |
+
m->rows = rows;
|
| 84 |
+
m->cols = cols;
|
| 85 |
+
m->K = K;
|
| 86 |
+
m->chunks = (cols + 63) / 64;
|
| 87 |
+
m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 88 |
+
m->unary = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 89 |
+
m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
|
| 90 |
+
memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
|
| 91 |
+
memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
|
| 92 |
+
for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
|
| 93 |
+
return m;
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
/* Release a vector created by tuv_alloc(); a NULL argument is a no-op. */
void tuv_free(TrueUnaryVec *v) {
    if (v == NULL)
        return;
    free(v->sign);
    free(v->unary);
    free(v);
}
|
| 99 |
+
/* Release a matrix created by tum_alloc(); a NULL argument is a no-op. */
void tum_free(TrueUnaryMat *m) {
    if (m == NULL)
        return;
    free(m->sign);
    free(m->unary);
    free(m->scales);
    free(m);
}
|
| 102 |
+
|
| 103 |
+
/* ============================================================
|
| 104 |
+
* FLOAT → TRUE UNARY
|
| 105 |
+
*
|
| 106 |
+
* Quantize: integer_val = round(float_val / scale * K)
|
| 107 |
+
* Then store |integer_val| as that many 1-bits.
|
| 108 |
+
*
|
| 109 |
+
* For vector: single global scale = absmax / K
|
| 110 |
+
* For matrix: per-row scale = row_absmax / K
|
| 111 |
+
* ============================================================ */
|
| 112 |
+
/* Quantize x[0..dim) into v's thermometer planes.
 * One global scale amax/K; magnitudes round to nearest integer, and
 * magnitude m sets planes 0..m-1 (true unary / thermometer). */
void tuv_from_float(TrueUnaryVec *v, const float *x) {
    const int dim = v->dim, K = v->K, chunks = v->chunks;

    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));

    /* Global absmax determines the quantization step. */
    float amax = 0.0f;
    for (int j = 0; j < dim; j++) {
        const float a = fabsf(x[j]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) {
        v->scale = 1.0f;  /* all-zero input: planes stay cleared */
        return;
    }
    v->scale = amax / K;

    const float inv = K / amax;
    for (int j = 0; j < dim; j++) {
        const int word = j / 64;
        const uint64_t mask = 1ULL << (j % 64);

        if (x[j] < 0.0f)
            v->sign[word] |= mask;

        int mag = (int)(fabsf(x[j]) * inv + 0.5f);  /* round to nearest */
        if (mag > K) mag = K;

        /* thermometer: magnitude mag sets planes 0 .. mag-1 */
        for (int p = 0; p < mag; p++)
            v->unary[(size_t)p * chunks + word] |= mask;
    }
}
|
| 141 |
+
|
| 142 |
+
void tuv_to_float(const TrueUnaryVec *v, float *out) {
|
| 143 |
+
int dim = v->dim, K = v->K, chunks = v->chunks;
|
| 144 |
+
|
| 145 |
+
for (int i = 0; i < dim; i++) {
|
| 146 |
+
int c = i / 64;
|
| 147 |
+
uint64_t bit = 1ULL << (i % 64);
|
| 148 |
+
|
| 149 |
+
/* Count set planes = magnitude in base-1 */
|
| 150 |
+
int mag = 0;
|
| 151 |
+
for (int p = 0; p < K; p++) {
|
| 152 |
+
if (v->unary[(size_t)p * chunks + c] & bit)
|
| 153 |
+
mag++;
|
| 154 |
+
}
|
| 155 |
+
|
| 156 |
+
float val = (float)mag * v->scale;
|
| 157 |
+
out[i] = (v->sign[c] & bit) ? -val : val;
|
| 158 |
+
}
|
| 159 |
+
}
|
| 160 |
+
|
| 161 |
+
/* Quantize a row-major float matrix into per-row thermometer planes.
 * Each row gets its own scale = row_absmax / K; magnitudes round to
 * nearest and clamp to K. */
void tum_from_float(TrueUnaryMat *m, const float *data) {
    const int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *src = data + (size_t)r * cols;

        /* Per-row absmax sets this row's quantization step. */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            const float a = fabsf(src[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) {
            m->scales[r] = 1.0f;  /* zero row: nothing to encode */
            continue;
        }
        m->scales[r] = amax / K;
        const float inv = K / amax;

        uint64_t *sgn = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            const int word = j / 64;
            const uint64_t mask = 1ULL << (j % 64);

            if (src[j] < 0.0f)
                sgn[word] |= mask;

            int mag = (int)(fabsf(src[j]) * inv + 0.5f);
            if (mag > K) mag = K;

            /* plane p of row r lives at unary[(p*rows + r)*chunks] */
            for (int p = 0; p < mag; p++)
                m->unary[((size_t)p * rows + r) * chunks + word] |= mask;
        }
    }
}
|
| 195 |
+
|
| 196 |
+
/* ============================================================
|
| 197 |
+
* TRUE UNARY MATVEC: y = M @ x
|
| 198 |
+
*
|
| 199 |
+
* THE CORE OPERATION.
|
| 200 |
+
*
|
| 201 |
+
* For each output element y[i]:
|
| 202 |
+
* For each pair of planes (p from weight, q from activation):
|
| 203 |
+
* active = w_plane_p[i] AND x_plane_q
|
| 204 |
+
* same = active AND ~(w_sign[i] XOR x_sign)
|
| 205 |
+
* diff = active AND (w_sign[i] XOR x_sign)
|
| 206 |
+
* acc += popcount(same) - popcount(diff)
|
| 207 |
+
*
|
| 208 |
+
* EVERY PLANE PAIR HAS WEIGHT = 1.
|
| 209 |
+
* No shifts. No scaling between planes. No corrections.
|
| 210 |
+
* The count IS the answer.
|
| 211 |
+
*
|
| 212 |
+
* y[i] = acc * w_scale[i] * x_scale
|
| 213 |
+
* (single float multiply at the very end)
|
| 214 |
+
*
|
| 215 |
+
* ============================================================ */
|
| 216 |
+
void tum_matvec(
|
| 217 |
+
const TrueUnaryMat *M,
|
| 218 |
+
const TrueUnaryVec *x,
|
| 219 |
+
float *y_out /* float output, requantize externally if needed */
|
| 220 |
+
) {
|
| 221 |
+
int out_dim = M->rows;
|
| 222 |
+
int chunks = M->chunks;
|
| 223 |
+
int wK = M->K;
|
| 224 |
+
int xK = x->K;
|
| 225 |
+
|
| 226 |
+
#pragma omp parallel for schedule(dynamic, 32)
|
| 227 |
+
for (int i = 0; i < out_dim; i++) {
|
| 228 |
+
const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
|
| 229 |
+
long long acc = 0;
|
| 230 |
+
|
| 231 |
+
for (int c = 0; c < chunks; c++) {
|
| 232 |
+
uint64_t ws = w_sign_row[c];
|
| 233 |
+
uint64_t xs = x->sign[c];
|
| 234 |
+
uint64_t same = ~(ws ^ xs);
|
| 235 |
+
uint64_t diff = ws ^ xs;
|
| 236 |
+
|
| 237 |
+
/*
|
| 238 |
+
* PURE BASE-1: every plane pair contributes weight 1.
|
| 239 |
+
* acc += popcount(w_plane AND x_plane AND same_sign)
|
| 240 |
+
* - popcount(w_plane AND x_plane AND diff_sign)
|
| 241 |
+
*/
|
| 242 |
+
for (int p = 0; p < wK; p++) {
|
| 243 |
+
uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c];
|
| 244 |
+
|
| 245 |
+
for (int q = 0; q < xK; q++) {
|
| 246 |
+
uint64_t xq = x->unary[(size_t)q * chunks + c];
|
| 247 |
+
uint64_t active = wp & xq;
|
| 248 |
+
acc += __builtin_popcountll(active & same)
|
| 249 |
+
- __builtin_popcountll(active & diff);
|
| 250 |
+
}
|
| 251 |
+
}
|
| 252 |
+
}
|
| 253 |
+
|
| 254 |
+
/* Single float rescale per output element */
|
| 255 |
+
y_out[i] = (float)acc * M->scales[i] * x->scale;
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
/* ============================================================
|
| 260 |
+
* OPTIMIZED MATVEC: collapse x planes first
|
| 261 |
+
*
|
| 262 |
+
* Instead of iterating wK * xK plane pairs per chunk,
|
| 263 |
+
* precompute per-chunk activation sums:
|
| 264 |
+
* x_mag_same[c] = sum_q popcount(x_plane_q[c] AND same_sign[c])
|
| 265 |
+
* x_mag_diff[c] = sum_q popcount(x_plane_q[c] AND diff_sign[c])
|
| 266 |
+
*
|
| 267 |
+
* Then for each weight plane p:
|
| 268 |
+
* This doesn't directly simplify because we need AND with wp first.
|
| 269 |
+
*
|
| 270 |
+
* ALTERNATIVE: precompute per-element x magnitudes in unary,
|
| 271 |
+
* then the dot product is just: sum_j w_mag_j * x_mag_j * sign_j
|
| 272 |
+
*
|
| 273 |
+
* For now: provide both the naive and a vertically-accumulated variant.
|
| 274 |
+
*
|
| 275 |
+
* VERTICAL ACCUMULATE: sum all weight planes into a per-element
|
| 276 |
+
* count, then multiply by x count. Reduces from O(wK*xK*chunks)
|
| 277 |
+
* to O((wK+xK)*chunks + dim).
|
| 278 |
+
* ============================================================ */
|
| 279 |
+
void tum_matvec_fast(
|
| 280 |
+
const TrueUnaryMat *M,
|
| 281 |
+
const TrueUnaryVec *x,
|
| 282 |
+
float *y_out
|
| 283 |
+
) {
|
| 284 |
+
int out_dim = M->rows;
|
| 285 |
+
int cols = M->cols;
|
| 286 |
+
int chunks = M->chunks;
|
| 287 |
+
int xK = x->K;
|
| 288 |
+
|
| 289 |
+
/* Step 1: compute x magnitudes (per-element popcount across planes)
|
| 290 |
+
* x_mag[j] = number of x planes where bit j is set
|
| 291 |
+
* This is O(xK * chunks) = O(xK * dim / 64)
|
| 292 |
+
*/
|
| 293 |
+
int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
|
| 294 |
+
memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
|
| 295 |
+
|
| 296 |
+
for (int q = 0; q < xK; q++) {
|
| 297 |
+
const uint64_t *xplane = x->unary + (size_t)q * chunks;
|
| 298 |
+
for (int c = 0; c < chunks; c++) {
|
| 299 |
+
uint64_t bits = xplane[c];
|
| 300 |
+
while (bits) {
|
| 301 |
+
int bit = __builtin_ctzll(bits);
|
| 302 |
+
int j = c * 64 + bit;
|
| 303 |
+
if (j < cols) x_mag[j]++;
|
| 304 |
+
bits &= bits - 1;
|
| 305 |
+
}
|
| 306 |
+
}
|
| 307 |
+
}
|
| 308 |
+
|
| 309 |
+
/* Apply sign to x_mag: positive if same sign as...
|
| 310 |
+
* Actually we need signed x_mag relative to each weight row's sign.
|
| 311 |
+
* So we keep x_mag unsigned and handle sign per output element.
|
| 312 |
+
*/
|
| 313 |
+
|
| 314 |
+
/* Step 2: for each output row, compute:
|
| 315 |
+
* y[i] = sum_j (w_mag[i][j] * x_mag[j]) * sign_agreement
|
| 316 |
+
*
|
| 317 |
+
* w_mag[i][j] = number of weight planes where bit j is set
|
| 318 |
+
* sign_agreement = +1 if w_sign[j] == x_sign[j], else -1
|
| 319 |
+
*
|
| 320 |
+
* We compute w_mag by vertical popcount across weight planes.
|
| 321 |
+
* This is O(wK * chunks) per row.
|
| 322 |
+
*/
|
| 323 |
+
|
| 324 |
+
#pragma omp parallel
|
| 325 |
+
{
|
| 326 |
+
int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
|
| 327 |
+
|
| 328 |
+
#pragma omp for schedule(dynamic, 32)
|
| 329 |
+
for (int i = 0; i < out_dim; i++) {
|
| 330 |
+
memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
|
| 331 |
+
|
| 332 |
+
/* Vertical popcount: count set planes per element */
|
| 333 |
+
for (int p = 0; p < M->K; p++) {
|
| 334 |
+
const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks;
|
| 335 |
+
for (int c = 0; c < chunks; c++) {
|
| 336 |
+
uint64_t bits = wplane[c];
|
| 337 |
+
while (bits) {
|
| 338 |
+
int bit = __builtin_ctzll(bits);
|
| 339 |
+
int j = c * 64 + bit;
|
| 340 |
+
if (j < cols) w_mag[j]++;
|
| 341 |
+
bits &= bits - 1;
|
| 342 |
+
}
|
| 343 |
+
}
|
| 344 |
+
}
|
| 345 |
+
|
| 346 |
+
/* Dot product with sign */
|
| 347 |
+
const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
|
| 348 |
+
long long acc = 0;
|
| 349 |
+
|
| 350 |
+
for (int j = 0; j < cols; j++) {
|
| 351 |
+
int c = j / 64;
|
| 352 |
+
uint64_t bit = 1ULL << (j % 64);
|
| 353 |
+
int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit);
|
| 354 |
+
int product = (int)w_mag[j] * (int)x_mag[j];
|
| 355 |
+
acc += same_sign ? product : -product;
|
| 356 |
+
}
|
| 357 |
+
|
| 358 |
+
y_out[i] = (float)acc * M->scales[i] * x->scale;
|
| 359 |
+
}
|
| 360 |
+
|
| 361 |
+
free(w_mag);
|
| 362 |
+
}
|
| 363 |
+
|
| 364 |
+
free(x_mag);
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
/* ============================================================
|
| 368 |
+
* BENCHMARK + ACCURACY
|
| 369 |
+
* ============================================================ */
|
| 370 |
+
/* Accuracy and throughput figures produced by tum_test(). */
typedef struct {
    float cosine;       /* cosine similarity of unary result vs float reference */
    float snr_db;       /* signal-to-noise ratio of the unary result, in dB */
    float max_rel_err;  /* worst per-element relative error */
    double ms_naive;    /* average ms per tum_matvec() call */
    double ms_fast;     /* average ms per tum_matvec_fast() call */
    double gops_naive;  /* effective GOPS of the naive kernel */
    double gops_fast;   /* effective GOPS of the fast kernel */
} TestResult;
|
| 379 |
+
|
| 380 |
+
TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
|
| 381 |
+
TestResult r = {0};
|
| 382 |
+
srand(42);
|
| 383 |
+
|
| 384 |
+
/* Random float matrix and vector (normal distribution) */
|
| 385 |
+
float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
|
| 386 |
+
float *xf = (float *)malloc(cols * sizeof(float));
|
| 387 |
+
float *y_ref = (float *)calloc(rows, sizeof(float));
|
| 388 |
+
float *y_naive = (float *)malloc(rows * sizeof(float));
|
| 389 |
+
float *y_fast = (float *)malloc(rows * sizeof(float));
|
| 390 |
+
|
| 391 |
+
for (size_t i = 0; i < (size_t)rows * cols; i++) {
|
| 392 |
+
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 393 |
+
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 394 |
+
Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
|
| 395 |
+
}
|
| 396 |
+
for (int i = 0; i < cols; i++) {
|
| 397 |
+
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 398 |
+
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
|
| 399 |
+
xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
/* Float reference */
|
| 403 |
+
for (int i = 0; i < rows; i++)
|
| 404 |
+
for (int j = 0; j < cols; j++)
|
| 405 |
+
y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
|
| 406 |
+
|
| 407 |
+
/* Convert to true unary */
|
| 408 |
+
TrueUnaryMat *M = tum_alloc(rows, cols, wK);
|
| 409 |
+
TrueUnaryVec *x = tuv_alloc(cols, xK);
|
| 410 |
+
tum_from_float(M, Mf);
|
| 411 |
+
tuv_from_float(x, xf);
|
| 412 |
+
|
| 413 |
+
/* Naive matvec */
|
| 414 |
+
struct timespec t0, t1;
|
| 415 |
+
tum_matvec(M, x, y_naive); /* warmup */
|
| 416 |
+
clock_gettime(CLOCK_MONOTONIC, &t0);
|
| 417 |
+
for (int i = 0; i < iters; i++)
|
| 418 |
+
tum_matvec(M, x, y_naive);
|
| 419 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 420 |
+
r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
|
| 421 |
+
|
| 422 |
+
/* Fast matvec */
|
| 423 |
+
tum_matvec_fast(M, x, y_fast); /* warmup */
|
| 424 |
+
clock_gettime(CLOCK_MONOTONIC, &t0);
|
| 425 |
+
for (int i = 0; i < iters; i++)
|
| 426 |
+
tum_matvec_fast(M, x, y_fast);
|
| 427 |
+
clock_gettime(CLOCK_MONOTONIC, &t1);
|
| 428 |
+
r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
|
| 429 |
+
|
| 430 |
+
/* Accuracy vs float reference */
|
| 431 |
+
float dot = 0, na = 0, nb = 0, max_re = 0;
|
| 432 |
+
for (int i = 0; i < rows; i++) {
|
| 433 |
+
dot += y_ref[i] * y_naive[i];
|
| 434 |
+
na += y_ref[i] * y_ref[i];
|
| 435 |
+
nb += y_naive[i] * y_naive[i];
|
| 436 |
+
float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
|
| 437 |
+
if (re > max_re) max_re = re;
|
| 438 |
+
}
|
| 439 |
+
r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
|
| 440 |
+
float noise = 0;
|
| 441 |
+
for (int i = 0; i < rows; i++) {
|
| 442 |
+
float e = y_ref[i] - y_naive[i]; noise += e * e;
|
| 443 |
+
}
|
| 444 |
+
r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
|
| 445 |
+
r.max_rel_err = max_re;
|
| 446 |
+
|
| 447 |
+
/* Verify naive == fast */
|
| 448 |
+
float fast_err = 0;
|
| 449 |
+
for (int i = 0; i < rows; i++) {
|
| 450 |
+
float e = fabsf(y_naive[i] - y_fast[i]);
|
| 451 |
+
if (e > fast_err) fast_err = e;
|
| 452 |
+
}
|
| 453 |
+
if (fast_err > 0.01f)
|
| 454 |
+
printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err);
|
| 455 |
+
|
| 456 |
+
double ops = 2.0 * rows * cols;
|
| 457 |
+
r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6);
|
| 458 |
+
r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6);
|
| 459 |
+
|
| 460 |
+
tum_free(M); tuv_free(x);
|
| 461 |
+
free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
|
| 462 |
+
return r;
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
/* ============================================================
|
| 466 |
+
* MAIN: sweep K values, show accuracy + speed tradeoff
|
| 467 |
+
* ============================================================ */
|
| 468 |
+
/* Sweep (wK, xK) plane-count configurations on a Qwen3-4B-sized projection
 * matrix: print accuracy vs a float reference, timing for both kernels,
 * and the unary weight memory footprint; then dump sample output values.
 * NOTE(review): `rand() + 1` below is signed overflow (UB) if rand()
 * returns INT_MAX on a platform where RAND_MAX == INT_MAX — confirm
 * target platforms or compute the +1 in float as elsewhere suggested. */
int main() {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");

    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560) */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);

    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");

    /* (weight planes, activation planes) pairs to evaluate */
    struct { int wK; int xK; } configs[] = {
        {8, 4},
        {8, 8},
        {16, 8},
        {16, 16},
        {32, 8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        /* fewer timing repetitions for the expensive high-K configs */
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;

        TestResult r = tum_test(rows, cols, wK, xK, iters);

        /* Memory for this layer's unary weights (sign + planes + scales) */
        size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;

        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }

    /* Show the first 10 output values for the wK=32 / xK=16 case */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc(sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc(sr * sizeof(float));

        /* Box-Muller Gaussian fill, same recipe as tum_test */
        for (size_t i = 0; i < (size_t)sr * sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        /* float reference for the error column */
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];

        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);

        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);

        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
|
unary_convert.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert model weights to UNARY (base-1) thermometer encoding.
|
| 4 |
+
|
| 5 |
+
True unary: magnitude N = N consecutive 1-bits across N bitplanes.
|
| 6 |
+
Each bitplane contributes equally (value=1), NOT binary powers.
|
| 7 |
+
|
| 8 |
+
Weight 0.3 with scale -> magnitude 5 -> planes 0,1,2,3,4 have bit set
|
| 9 |
+
Weight -0.1 with scale -> magnitude 2, sign=neg -> planes 0,1 set + sign bit
|
| 10 |
+
|
| 11 |
+
More precision than ternary (N+1 levels vs 3), still no multiplication.
|
| 12 |
+
|
| 13 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import os
|
| 17 |
+
import json
|
| 18 |
+
import numpy as np
|
| 19 |
+
from pathlib import Path
|
| 20 |
+
import time
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def load_safetensors(model_dir):
    """Read every ``*.safetensors`` shard under *model_dir*.

    Returns a dict mapping tensor name -> float32 numpy array.
    """
    import torch  # noqa: F401 — safetensors.torch materializes torch tensors
    from safetensors.torch import load_file

    tensors = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard.name}...")
        for name, tensor in load_file(str(shard)).items():
            tensors[name] = tensor.float().numpy()
    return tensors
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def quantize_matrix_unary(weight, n_planes=7):
    """Thermometer-encode a weight matrix, row by row.

    Each row is scaled so its absmax maps to ``n_planes``; element
    magnitudes round to integers in ``[0, n_planes]`` and are stored as
    ``n_planes`` bitplanes where plane ``p`` has bit ``j`` set iff the
    quantized magnitude of element ``j`` is ``>= p + 1``.

    Returns ``(sign_bits, mag_planes, scales, sparsity)``:
      * sign_bits  — uint64 ``[out_dim, chunks]``, bit set = negative
      * mag_planes — uint64 ``[n_planes, out_dim, chunks]`` thermometer planes
      * scales     — float32 ``[out_dim]`` per-row dequant factors
      * sparsity   — fraction of elements with magnitude 0
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    chunks = (in_dim + 63) // 64
    padded = chunks * 64

    # Per-row scale so each row's absmax quantizes to exactly n_planes.
    row_max = np.max(np.abs(w), axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.flatten() / n_planes).astype(np.float32)

    # Integer magnitudes in [-n_planes, n_planes] plus separate sign bits.
    w_scaled = w / scales[:, None]
    magnitudes = np.clip(np.round(np.abs(w_scaled)).astype(np.int32), 0, n_planes)
    signs = w < 0  # True = negative

    # Fraction of exactly-zero magnitudes (before padding).
    sparsity = np.mean(magnitudes == 0)

    # Zero-pad the column dimension to a multiple of 64 for bit packing.
    if in_dim < padded:
        pad = padded - in_dim
        magnitudes = np.concatenate(
            [magnitudes, np.zeros((out_dim, pad), dtype=np.int32)], axis=1)
        signs = np.concatenate(
            [signs, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    bit_positions = np.uint64(1) << np.arange(64, dtype=np.uint64)

    def _pack(mask):
        # Collapse each group of 64 booleans into one uint64 word.
        grouped = mask.reshape(out_dim, chunks, 64).astype(np.uint64)
        return np.bitwise_or.reduce(grouped * bit_positions, axis=2)

    sign_bits = _pack(signs)  # [out_dim, chunks]

    # Thermometer planes: plane p marks elements with magnitude >= p + 1.
    mag_planes = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        mag_planes[p] = _pack(magnitudes >= (p + 1))

    return sign_bits, mag_planes, scales, sparsity
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def save_unary_model(tensors, output_dir, n_planes=7):
    """Convert and save full model to unary format.

    The seven linear projection weights are packed (via
    quantize_matrix_unary) into .sign/.planes/.scales files; every other
    tensor is dumped as raw FP16. Also writes config.json and
    manifest.json describing the layout.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "n_planes": n_planes,
        "quant_type": "unary",
    }

    # Only attention/MLP projections get unary packing; norms, embeddings,
    # biases etc. stay FP16.
    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    ternary_keys = [k for k in tensors if any(p in k for p in projections)]
    keep_keys = [k for k in tensors if not any(p in k for p in projections)]

    print(f"\nUnary layers: {len(ternary_keys)} (n_planes={n_planes}, levels={n_planes+1})")
    print(f"FP16 layers: {len(keep_keys)}")

    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    total_unary_bytes = 0
    total_original_bytes = 0

    for key in ternary_keys:
        w = tensors[key]
        out_dim, in_dim = w.shape
        total_original_bytes += w.nbytes

        t0 = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_matrix_unary(w, n_planes)
        dt = time.time() - t0

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")
        scales.tofile(stem + ".scales")

        unary_bytes = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        total_unary_bytes += unary_bytes
        ratio = w.nbytes / unary_bytes
        # Effective bits per weight (all three artifacts counted)
        bpw = (unary_bytes * 8) / (out_dim * in_dim)

        print(f" {key}: {w.shape} -> unary ({unary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compress, {bpw:.2f} bpw, {sparsity:.1%} sparse, {dt:.1f}s)")

    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(stem + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_unary_bytes + total_fp16_bytes
    avg_bpw = (total_unary_bytes * 8) / sum(np.prod(tensors[k].shape) for k in ternary_keys)

    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Unary linear weights: {total_unary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Average bits per weight (linear): {avg_bpw:.2f}")
    print(f"Compression vs FP32: {(total_original_bytes + total_fp16_bytes)/total_bytes:.1f}x")
    print(f"Precision levels: {n_planes + 1} (vs ternary=3, INT4=16)")
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
if __name__ == "__main__":
    import sys

    # CLI: [model_dir] [output_dir] [n_planes]
    args = sys.argv[1:]
    model_dir = args[0] if len(args) > 0 else "deepseek-r1-1.5b-hf"
    output_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-unary"
    n_planes = int(args[2]) if len(args) > 2 else 7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to unary (n_planes={n_planes})...")
    save_unary_model(tensors, output_dir, n_planes)
    print("Done!")
|
unary_convert_v2.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Pure Unary Converter - interleaved plane layout [out_dim][chunks][n_planes]
|
| 4 |
+
for cache-friendly access in the kernel.
|
| 5 |
+
|
| 6 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os, json, sys, time
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def load_safetensors(model_dir):
    """Load every *.safetensors shard under model_dir as FP32 numpy arrays."""
    import torch  # required by safetensors.torch
    from safetensors.torch import load_file

    out = {}
    for shard_path in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard_path.name}...")
        shard = load_file(str(shard_path))
        for name, tensor in shard.items():
            out[name] = tensor.float().numpy()
    return out
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def quantize_unary_interleaved(weight, n_planes):
    """Quantize ``weight`` into sign bits plus interleaved thermometer planes.

    Magnitude planes are laid out [out_dim][chunks][n_planes] so a kernel
    reading one 64-weight chunk touches all of its planes contiguously.
    Returns (sign_bits, mag_planes, scales, sparsity).
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    chunks = (in_dim + 63) // 64
    padded = 64 * chunks

    # Per-row scale so the largest |w| maps onto exactly n_planes levels.
    peak = np.abs(w).max(axis=1, keepdims=True)
    peak = np.where(peak == 0, 1.0, peak)
    scales = (peak.ravel() / n_planes).astype(np.float32)

    # Integer magnitudes in [0, n_planes]; sign kept separately.
    mags = np.clip(np.round(np.abs(w / scales[:, None])).astype(np.int32),
                   0, n_planes)
    neg = w < 0

    # Fraction of weights that quantize to exact zero (before padding).
    sparsity = np.mean(mags == 0)

    # Zero-pad each row up to a multiple of 64 for bit packing.
    if padded > in_dim:
        pad = padded - in_dim
        mags = np.concatenate(
            [mags, np.zeros((out_dim, pad), dtype=np.int32)], axis=1)
        neg = np.concatenate(
            [neg, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    # Vectorized packing: bit j of each 64-wide group -> 2**j.
    lane_bits = np.uint64(1) << np.arange(64, dtype=np.uint64)
    sign_bits = np.bitwise_or.reduce(
        neg.reshape(out_dim, chunks, 64).astype(np.uint64) * lane_bits, axis=2)

    # Thermometer encoding, interleaved: plane p set where magnitude >= p+1.
    mag_planes = np.zeros((out_dim, chunks, n_planes), dtype=np.uint64)
    for level in range(1, n_planes + 1):
        hot = (mags >= level).reshape(out_dim, chunks, 64).astype(np.uint64)
        mag_planes[:, :, level - 1] = np.bitwise_or.reduce(hot * lane_bits, axis=2)

    return sign_bits, mag_planes, scales, sparsity
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def convert(model_dir, output_dir, n_planes):
    """Convert a safetensors checkpoint into the interleaved unary format."""
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6, "n_planes": n_planes,
        "quant_type": "unary_interleaved",
    }

    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    linear_keys = [k for k in tensors if any(p in k for p in projections)]
    linear_set = set(linear_keys)
    other_keys = [k for k in tensors if k not in linear_set]

    print(f"\nUnary: {len(linear_keys)} layers, {n_planes} planes ({2*n_planes+1} levels)")
    print(f"FP16: {len(other_keys)} layers\n")

    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    unary_total = orig_total = fp16_total = 0  # byte counters

    for key in linear_keys:
        w = tensors[key]
        orig_total += w.nbytes  # tracked for parity with v1; not reported
        started = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_unary_interleaved(w, n_planes)
        dt = time.time() - started

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")  # [out_dim][chunks][n_planes] contiguous
        scales.tofile(stem + ".scales")

        ub = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        unary_total += ub
        bpw = (ub * 8) / (w.shape[0] * w.shape[1])
        print(f" {key}: {w.shape} -> {ub/1024:.0f}KB ({bpw:.1f}bpw, {sparsity:.0%} sparse, {dt:.1f}s)")

    for key in other_keys:
        w = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(stem + ".fp16")
        fp16_total += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    grand_total = unary_total + fp16_total
    avg_bpw = (unary_total * 8) / sum(np.prod(tensors[k].shape) for k in linear_keys)
    print(f"\n=== Summary ===")
    print(f"Unary weights: {unary_total/1024/1024:.1f} MB ({avg_bpw:.1f} avg bpw)")
    print(f"FP16 weights: {fp16_total/1024/1024:.1f} MB")
    print(f"Total: {grand_total/1024/1024:.1f} MB")
    print(f"Planes: {n_planes}, Levels: {2*n_planes+1}")
    print(f"Layout: interleaved [out_dim][chunks][n_planes]")
    print("Done!")
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
if __name__ == "__main__":
    # CLI: [model_dir] [output_dir] [n_planes]; default is 31 planes.
    args = sys.argv[1:]
    model_dir = args[0] if args else "deepseek-r1-1.5b-hf"
    output_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-unary31"
    n_planes = int(args[2]) if len(args) > 2 else 31
    convert(model_dir, output_dir, n_planes)
|
unary_engine.c
ADDED
|
@@ -0,0 +1,381 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* PURE UNARY (BASE-1) TRANSFORMER ENGINE
|
| 3 |
+
* AVX-512 + OpenMP. Full Qwen2 forward pass in C.
|
| 4 |
+
*
|
| 5 |
+
* Thermometer encoding: magnitude M = M planes set.
|
| 6 |
+
* Each plane contributes EXACTLY 1. No powers. No binary.
|
| 7 |
+
* 7 planes = 8 levels {0,1,2,3,4,5,6,7} * sign.
|
| 8 |
+
*
|
| 9 |
+
* Model format on disk (from unary_convert.py):
|
| 10 |
+
* .sign = [out_dim * chunks] uint64 (1=negative)
|
| 11 |
+
* .planes = [n_planes * out_dim * chunks] uint64 (thermometer)
|
| 12 |
+
* .scales = [out_dim] float32 (per-row)
|
| 13 |
+
*
|
| 14 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
#include <immintrin.h>
|
| 18 |
+
#include <stdint.h>
|
| 19 |
+
#include <stdlib.h>
|
| 20 |
+
#include <string.h>
|
| 21 |
+
#include <math.h>
|
| 22 |
+
#include <stdio.h>
|
| 23 |
+
#include <time.h>
|
| 24 |
+
#include <omp.h>
|
| 25 |
+
|
| 26 |
+
#define HIDDEN 1536
|
| 27 |
+
#define INTER 8960
|
| 28 |
+
#define N_HEADS 12
|
| 29 |
+
#define N_KV_HEADS 2
|
| 30 |
+
#define HEAD_DIM 128
|
| 31 |
+
#define N_LAYERS 28
|
| 32 |
+
#define VOCAB 151936
|
| 33 |
+
#define RMS_EPS 1e-6f
|
| 34 |
+
#define ROPE_THETA 1000000.0f
|
| 35 |
+
#define MAX_SEQ 4096
|
| 36 |
+
#define GQA_RATIO (N_HEADS / N_KV_HEADS)
|
| 37 |
+
|
| 38 |
+
/* One unary-quantized linear layer as loaded from disk. */
typedef struct {
    uint64_t *sign_bits;  /* [out_dim * chunks] 1 bit per weight, 1 = negative */
    uint64_t *mag_planes; /* [n_planes * out_dim * chunks] thermometer planes */
    float *scales;        /* [out_dim] per-row dequant scale */
    float *bias;          /* [out_dim] or NULL */
    int out_dim, in_dim, n_planes;
} UL; /* Unary Linear */

/* FP16 linear, row-major weights (used for the LM head). */
typedef struct { uint16_t *w; int od, id; } FL; /* FP16 Linear */

/* Per-layer weights: seven projections, two RMSNorms, optional QKV bias. */
typedef struct {
    UL qp, kp, vp, op, gp, up, dp; /* q/k/v/o + gate/up/down projections */
    float *in_norm, *pn_norm;      /* input / post-attention RMSNorm weights */
    float *qb, *kb, *vb;           /* QKV biases, may be NULL */
} Lay;

/* Whole model: weights, KV cache, and preallocated scratch buffers. */
typedef struct {
    uint16_t *emb;       /* FP16 token-embedding table */
    Lay lay[N_LAYERS];
    float *fnorm;        /* final RMSNorm weight */
    FL lmh;              /* LM head */
    float *kc, *vc;      /* KV cache */
    float *h, *h2;       /* hidden states */
    float *sq, *sk, *sv; /* QKV scratch */
    float *ao;           /* attn output */
    float *sg, *su, *sd; /* MLP scratch */
    float *lg;           /* logits */
    float *as;           /* attn scores */
    int np;              /* number of unary planes */
} M;
|
| 68 |
+
|
| 69 |
+
/* ============================================================
 * PURE UNARY MATVEC
 *
 * y[i] = scales[i] * SUM over planes p:
 *          SUM over j where plane_p bit j is set:
 *            sign[j]==0 ? +x[j] : -x[j]
 *
 * Each plane contributes 1. Seven planes, seven passes.
 * Embarrassingly parallel over output rows.
 * ============================================================ */
static void umv(const UL *L, const float *x, float *y) {
    const int od = L->out_dim, id = L->in_dim, np = L->n_planes;
    const int ch = (id + 63) / 64;   /* 64-bit mask words per row */
    const int idp = (id + 15) & ~15; /* input length rounded up to a full 16-float vector */

    /* Aligned, zero-padded copy of x so the aligned SIMD loads below never
     * read uninitialized floats in the last partial vector. */
    float *xp = (float*)aligned_alloc(64, idp * sizeof(float));
    memcpy(xp, x, id * sizeof(float));
    if (idp > id) memset(xp + id, 0, (idp - id) * sizeof(float));

    /* NOTE(review): xp is allocated and freed on every call — hoisting this
     * scratch buffer out of the hot path would avoid allocator traffic. */
#pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < od; i++) {
        const uint64_t *rs = L->sign_bits + (size_t)i * ch; /* row sign mask */
        float tot = 0.0f;

        for (int p = 0; p < np; p++) {
            /* plane p of row i (plane-major layout on disk) */
            const uint64_t *pr = L->mag_planes + ((size_t)p * od + i) * ch;
            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < ch; c++) {
                uint64_t mb = pr[c], sb = rs[c];
                uint64_t pos = mb & ~sb; /* active, positive weights */
                uint64_t neg = mb & sb;  /* active, negative weights */

                /* four 16-lane groups cover the 64 bits of this chunk */
                for (int g = 0; g < 4; g++) {
                    int off = c * 64 + g * 16;
                    if (off >= idp) break; /* converter zero-pads plane bits past id */
                    __m512 xv = _mm512_load_ps(xp + off);
                    __mmask16 pm = (__mmask16)((pos >> (g*16)) & 0xFFFF);
                    __mmask16 nm = (__mmask16)((neg >> (g*16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pm, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nm, acc, xv);
                }
            }
            /* PURE UNARY: each plane worth exactly 1 */
            tot += _mm512_reduce_add_ps(acc);
        }
        y[i] = tot * L->scales[i];
        if (L->bias) y[i] += L->bias[i];
    }
    free(xp);
}
|
| 120 |
+
|
| 121 |
+
/* FP16 matvec (lm_head only): y = W x, with W stored row-major as IEEE
 * half floats and widened to FP32 on the fly. */
static void fmv(const FL *L, const float *x, float *y) {
#pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < L->od; i++) {
        __m512 acc = _mm512_setzero_ps();
        const uint16_t *row = L->w + (size_t)i * L->id;
        int j;
        for (j = 0; j + 16 <= L->id; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
            acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
        }
        float s = _mm512_reduce_add_ps(acc);
        /* scalar tail: broadcast one half, convert, keep lane 0 */
        for (; j < L->id; j++) {
            float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
            s += wf * x[j];
        }
        y[i] = s;
    }
}
|
| 140 |
+
|
| 141 |
+
/* RMSNorm: y = x / sqrt(mean(x^2) + RMS_EPS) * w, over d floats. */
static void rn(const float *x, const float *w, float *y, int d) {
    /* sum of squares, vectorized with scalar tail */
    __m512 sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i+16 <= d; i += 16) {
        __m512 v = _mm512_loadu_ps(x+i);
        sq = _mm512_fmadd_ps(v, v, sq);
    }
    float ss = _mm512_reduce_add_ps(sq);
    for (; i < d; i++) ss += x[i]*x[i];
    float r = 1.0f / sqrtf(ss/d + RMS_EPS);
    /* scale-and-weight pass */
    __m512 rv = _mm512_set1_ps(r);
    for (i = 0; i+16 <= d; i += 16)
        _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
            _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
    for (; i < d; i++) y[i] = x[i]*r*w[i];
}
|
| 158 |
+
|
| 159 |
+
static void silu(float *x, int n) {
|
| 160 |
+
for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
/* Elementwise product c = a * b (16-wide AVX-512 body, scalar tail). */
static void emul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        __m512 av = _mm512_loadu_ps(a + k);
        __m512 bv = _mm512_loadu_ps(b + k);
        _mm512_storeu_ps(c + k, _mm512_mul_ps(av, bv));
    }
    for (; k < n; k++) c[k] = a[k] * b[k];
}
|
| 169 |
+
|
| 170 |
+
/* In-place vector accumulate: y += x over n floats. */
static void va(float *y, const float *x, int n) {
    int k = 0;
    for (; k + 16 <= n; k += 16) {
        __m512 sum = _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k));
        _mm512_storeu_ps(y + k, sum);
    }
    for (; k < n; k++) y[k] += x[k];
}
|
| 176 |
+
|
| 177 |
+
static void rope(float *v, int pos, int d) {
|
| 178 |
+
for (int i = 0; i < d; i += 2) {
|
| 179 |
+
float f = 1.0f / powf(ROPE_THETA, (float)i/d);
|
| 180 |
+
float a = pos*f, co = cosf(a), si = sinf(a);
|
| 181 |
+
float v0 = v[i], v1 = v[i+1];
|
| 182 |
+
v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
|
| 183 |
+
}
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
static void sm(float *x, int n) {
|
| 187 |
+
float mx = x[0];
|
| 188 |
+
for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
|
| 189 |
+
float s = 0;
|
| 190 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
|
| 191 |
+
float iv = 1.0f/s;
|
| 192 |
+
for (int i = 0; i < n; i++) x[i] *= iv;
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
/* Embed token t: widen row t of the FP16 embedding table into o (FP32). */
static void etok(const M *m, int t, float *o) {
    const uint16_t *r = m->emb + (size_t)t * HIDDEN;
    int i;
    for (i = 0; i+16 <= HIDDEN; i += 16)
        _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
    /* scalar tail: broadcast one half, convert, keep lane 0 */
    for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
}
|
| 202 |
+
|
| 203 |
+
static float* kvp(float *c, int l, int p, int h) {
|
| 204 |
+
return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
|
| 205 |
+
}
|
| 206 |
+
|
| 207 |
+
/* One attention block for layer l at position pos.
 * Input: normalized hidden state in m->h2. Output: attention result
 * written back into m->h2 (caller adds the residual).
 * K/V for this position are appended to the KV cache; GQA shares each
 * KV head across GQA_RATIO query heads. */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* QKV projections (unary matvecs) + optional biases */
    umv(&ly->qp, m->h2, m->sq);
    umv(&ly->kp, m->h2, m->sk);
    umv(&ly->vp, m->h2, m->sv);
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* RoPE on Q and K, then stash K/V at this position in the cache */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM); /* 1/sqrt(d_k) score scaling */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        int kvh = h / GQA_RATIO; /* shared KV head for this query head */
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* scores q·k / sqrt(d) over all cached positions (causal: t<=pos) */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* weighted sum of V; negligible weights are skipped */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* output projection back to HIDDEN, overwriting m->h2 */
    umv(&ly->op, m->ao, m->h2);
}
|
| 250 |
+
|
| 251 |
+
/* SwiGLU MLP for layer l: h2 <- down( silu(gate(h2)) * up(h2) ). */
static void do_mlp(M *m, int l) {
    Lay *layer = &m->lay[l];
    umv(&layer->up, m->h2, m->su); /* up projection */
    umv(&layer->gp, m->h2, m->sg); /* gate projection */
    silu(m->sg, INTER);
    emul(m->sg, m->su, m->sd, INTER);
    umv(&layer->dp, m->sd, m->h2); /* down projection back to HIDDEN */
}
|
| 259 |
+
|
| 260 |
+
/* One full forward pass for token `tid` at sequence position `pos`.
 * Returns a pointer to the VOCAB-sized logits buffer owned by m. */
float* forward_token(M *m, int tid, int pos) {
    etok(m, tid, m->h);
    for (int l = 0; l < N_LAYERS; l++) {
        /* pre-norm attention block + residual */
        rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
        do_attn(m, l, pos);
        va(m->h, m->h2, HIDDEN);
        /* pre-norm MLP block + residual */
        rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
        do_mlp(m, l);
        va(m->h, m->h2, HIDDEN);
    }
    rn(m->h, m->fnorm, m->h2, HIDDEN);
    fmv(&m->lmh, m->h2, m->lg);
    return m->lg;
}
|
| 274 |
+
|
| 275 |
+
/* Temperature + top-p sampling over logits lg[0..V-1] (modified in place:
 * scaled by 1/T then softmaxed). Candidates are peeled off by partial
 * selection sort until cumulative mass reaches tp, capped at 50 (an
 * implicit top-k). Returns the sampled token id. */
static int samp(float *lg, int V, float T, float tp) {
    if (T > 0) { float it = 1.0f/T; for (int i = 0; i < V; i++) lg[i] *= it; }
    sm(lg, V);
    float *pr = (float*)malloc(V*4); int *ix = (int*)malloc(V*4);
    memcpy(pr, lg, V*4);
    for (int i = 0; i < V; i++) ix[i] = i;
    float cum = 0; int nk = 0;
    /* move the next-largest probability into slot nk */
    while (cum < tp && nk < V && nk < 50) {
        int b = nk;
        for (int i = nk+1; i < V; i++) if (pr[i] > pr[b]) b = i;
        float t = pr[nk]; pr[nk] = pr[b]; pr[b] = t;
        int ti = ix[nk]; ix[nk] = ix[b]; ix[b] = ti;
        cum += pr[nk]; nk++;
    }
    /* renormalize over the kept nucleus and draw uniformly within it */
    float s = 0; for (int i = 0; i < nk; i++) s += pr[i];
    float r = (float)rand()/RAND_MAX * s, ac = 0;
    int ch = ix[0];
    for (int i = 0; i < nk; i++) { ac += pr[i]; if (ac >= r) { ch = ix[i]; break; } }
    free(pr); free(ix);
    return ch;
}
|
| 296 |
+
|
| 297 |
+
/* Autoregressive generation: runs the prompt, then emits up to mx tokens.
 * T<=0 selects greedy argmax; otherwise samp() does temperature/top-p.
 * Returns the number of tokens written to out (eos, if hit, included). */
int generate(M *m, const int *pr, int pl, int *out, int mx,
             float T, float tp, int eos) {
    srand(time(NULL)); /* NOTE(review): reseeds every call — calls within
                          the same second replay the same random stream */
    for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);
    int pos = pl, gen = 0;
    for (int t = 0; t < mx; t++) {
        int nx;
        if (T <= 0) {
            /* greedy: argmax over current logits */
            nx = 0;
            for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
        } else {
            nx = samp(m->lg, VOCAB, T, tp);
        }
        out[t] = nx; gen++;
        if (nx == eos) break;
        forward_token(m, nx, pos); pos++;
    }
    return gen;
}
|
| 316 |
+
|
| 317 |
+
/* ============================================================
 * ALLOCATION + WEIGHT SETTERS (called from Python)
 * ============================================================ */
/* Allocate the model shell, KV cache, and all scratch buffers.
 * Weight pointers are wired up later via model_set_* / layer_set_*.
 * np = number of unary planes. */
M* model_alloc(int np) {
    M *m = (M*)calloc(1, sizeof(M));
    m->np = np;
    size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
    m->h = (float*)aligned_alloc(64,HIDDEN*4);
    m->h2 = (float*)aligned_alloc(64,HIDDEN*4);
    m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sg = (float*)aligned_alloc(64,INTER*4);
    m->su = (float*)aligned_alloc(64,INTER*4);
    m->sd = (float*)aligned_alloc(64,INTER*4);
    m->lg = (float*)aligned_alloc(64,VOCAB*4);
    m->as = (float*)aligned_alloc(64,MAX_SEQ*4);
    m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);
    /* NOTE(review): allocation results are not checked for NULL */
    printf("Alloc: KV=%zuMB np=%d\n", kv*2*4/1024/1024, np);
    return m;
}
|
| 340 |
+
|
| 341 |
+
/* The setters below RETAIN the caller's pointers (no copy) — the Python
 * side must keep those buffers alive for the model's lifetime.
 * model_set_final_norm is the one exception: it memcpy's into m->fnorm. */
void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
    m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
}
void layer_set_norms(M *m, int l, float *i, float *p) {
    m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
}
void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
    m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
}
/* Fill one UL descriptor (pointers + dims); bias starts NULL. */
static void set_ul(UL *u, uint64_t *s, uint64_t *p, float *sc, int o, int i, int np) {
    u->sign_bits=s; u->mag_planes=p; u->scales=sc;
    u->out_dim=o; u->in_dim=i; u->n_planes=np; u->bias=NULL;
}
/* Attach all seven projections of layer l in one call
 * (order: q, k, v, o, gate, up, down). */
void layer_set_linears(M *m, int l,
    uint64_t*qs,uint64_t*qp,float*qc,int qo,int qi,
    uint64_t*ks,uint64_t*kp,float*kc,int ko,int ki,
    uint64_t*vs,uint64_t*vp,float*vc,int vo,int vi,
    uint64_t*os,uint64_t*op,float*oc,int oo,int oi,
    uint64_t*gs,uint64_t*gp,float*gc,int go,int gi,
    uint64_t*us,uint64_t*up,float*uc,int uo,int ui,
    uint64_t*ds,uint64_t*dp,float*dc,int doo,int di, int np) {
    set_ul(&m->lay[l].qp,qs,qp,qc,qo,qi,np);
    set_ul(&m->lay[l].kp,ks,kp,kc,ko,ki,np);
    set_ul(&m->lay[l].vp,vs,vp,vc,vo,vi,np);
    set_ul(&m->lay[l].op,os,op,oc,oo,oi,np);
    set_ul(&m->lay[l].gp,gs,gp,gc,go,gi,np);
    set_ul(&m->lay[l].up,us,up,uc,uo,ui,np);
    set_ul(&m->lay[l].dp,ds,dp,dc,doo,di,np);
}
|
| 372 |
+
/* Zero both KV caches so a new sequence can start from position 0. */
void model_reset_cache(M *m) {
    const size_t kv = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    memset(m->kc, 0, kv * sizeof(float));
    memset(m->vc, 0, kv * sizeof(float));
}
|
| 376 |
+
/* Release every engine-owned buffer, then the model itself. Weight arrays
 * handed in by the setters (emb, lm head, layer tensors) are caller-owned
 * and deliberately not freed here. */
void model_free(M *m) {
    free(m->kc); free(m->vc);
    free(m->h);  free(m->h2);
    free(m->sq); free(m->sk); free(m->sv);
    free(m->ao);
    free(m->sg); free(m->su); free(m->sd);
    free(m->lg); free(m->as); free(m->fnorm);
    free(m);
}
|
unary_engine_v2.c
ADDED
|
@@ -0,0 +1,629 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* UNARY TRANSFORMER ENGINE v2 - Configurable dimensions
|
| 3 |
+
*
|
| 4 |
+
* Full Qwen2/Qwen3 forward pass in C with AVX-512 + OpenMP.
|
| 5 |
+
* Supports any model size via runtime config.
|
| 6 |
+
*
|
| 7 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 8 |
+
*/
|
| 9 |
+
|
| 10 |
+
#include <immintrin.h>
|
| 11 |
+
#include <omp.h>
|
| 12 |
+
#include <stdint.h>
|
| 13 |
+
#include <stdlib.h>
|
| 14 |
+
#include <string.h>
|
| 15 |
+
#include <math.h>
|
| 16 |
+
#include <stdio.h>
|
| 17 |
+
#include <time.h>
|
| 18 |
+
|
| 19 |
+
#define MAX_SEQ 4096   /* hard cap on context length; KV cache is sized for this */
#define RMS_EPS 1e-6f  /* epsilon added inside RMSNorm's sqrt */

/* ============================================================
 * Config - set at init time
 * ============================================================ */
typedef struct {
    int hidden;          /* model (residual-stream) width */
    int inter;           /* MLP intermediate width */
    int n_heads;         /* number of query heads */
    int n_kv_heads;      /* number of key/value heads (GQA when < n_heads) */
    int head_dim;        /* per-head dimension */
    int n_layers;        /* number of transformer blocks */
    int vocab;           /* vocabulary size == logit count */
    float rope_theta;    /* RoPE base frequency */
    int has_attn_bias;   /* 1 for Qwen2 (1.5B), 0 for Qwen3 (4B) */
    int tie_embeddings;  /* 1 if lm_head shares embed weights */
} Config;
|
| 37 |
+
|
| 38 |
+
/* ============================================================
 * Unary linear layer
 * ============================================================ */
typedef struct {
    uint64_t *sign_bits;   /* per-row sign masks, 64 inputs per word:
                            * row i starts at sign_bits + i * chunks
                            * (chunks = ceil(in_dim / 64)); bit set = negative */
    uint64_t *mag_planes;  /* magnitude bit-planes, plane-major layout:
                            * plane p, row i at ((p * out_dim + i) * chunks) */
    float *scales;         /* [out_dim] per-row dequantization scale */
    float *bias;           /* [out_dim] optional bias, NULL when absent */
    int out_dim;
    int in_dim;
    int n_planes;          /* number of magnitude planes */
} UnaryLinear;

/* FP16 linear (for lm_head when not tied) */
typedef struct {
    uint16_t *weight;      /* [out_dim * in_dim] row-major IEEE half-floats */
    int out_dim;
    int in_dim;
} FP16Linear;
|
| 57 |
+
|
| 58 |
+
/* ============================================================
 * Transformer layer
 * ============================================================ */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;   /* attention projections */
    UnaryLinear gate_proj, up_proj, down_proj;    /* SwiGLU MLP projections */
    float *input_norm;          /* RMSNorm weight applied before attention */
    float *post_norm;           /* RMSNorm weight applied before the MLP */
    float *q_bias, *k_bias, *v_bias;  /* QKV biases (Qwen2); NULL when unused */
    float *q_norm, *k_norm;     /* QK-Norm weights (Qwen3); NULL when unused */
} Layer;
|
| 69 |
+
|
| 70 |
+
/* ============================================================
 * Full model
 * ============================================================ */
typedef struct {
    Config cfg;

    uint16_t *embed;     /* FP16 embedding table [vocab * hidden], caller-owned */
    Layer *layers;       /* cfg.n_layers blocks, allocated by model_alloc */
    float *final_norm;   /* RMSNorm weight before the LM head (copied in) */
    FP16Linear lm_head;  /* only used if !tie_embeddings */

    /* KV cache, layout [n_layers][MAX_SEQ][n_kv_heads][head_dim] floats */
    float *k_cache;
    float *v_cache;

    /* Scratch buffers, reused every token (single-token decode) */
    float *hidden;       /* residual stream              [hidden]            */
    float *hidden2;      /* normed input / sublayer out  [hidden]            */
    float *q;            /*                              [n_heads*head_dim]  */
    float *k;            /*                              [n_kv_heads*head_dim] */
    float *v;            /*                              [n_kv_heads*head_dim] */
    float *attn_out;     /* attention output             [n_heads*head_dim]  */
    float *gate;         /*                              [inter]             */
    float *up;           /*                              [inter]             */
    float *down_in;      /* silu(gate) * up              [inter]             */
    float *logits;       /*                              [vocab]             */
    float *attn_scores;  /* one head's score row         [MAX_SEQ]           */

    int n_planes;        /* weight magnitude planes per UnaryLinear */
} Model;
|
| 100 |
+
|
| 101 |
+
/* ============================================================
|
| 102 |
+
* AVX-512 Unary matvec: y = W @ x
|
| 103 |
+
* ============================================================ */
|
| 104 |
+
static void unary_matvec(
|
| 105 |
+
const UnaryLinear *layer, const float *x, float *y
|
| 106 |
+
) {
|
| 107 |
+
int out_dim = layer->out_dim;
|
| 108 |
+
int in_dim = layer->in_dim;
|
| 109 |
+
int n_planes = layer->n_planes;
|
| 110 |
+
int chunks = (in_dim + 63) / 64;
|
| 111 |
+
int in_padded = (in_dim + 15) & ~15;
|
| 112 |
+
|
| 113 |
+
#pragma omp parallel for schedule(dynamic, 64)
|
| 114 |
+
for (int i = 0; i < out_dim; i++) {
|
| 115 |
+
const uint64_t *row_sign = layer->sign_bits + (size_t)i * chunks;
|
| 116 |
+
float total = 0.0f;
|
| 117 |
+
|
| 118 |
+
/* Aligned local copy of input for this thread */
|
| 119 |
+
float x_local[in_padded] __attribute__((aligned(64)));
|
| 120 |
+
memcpy(x_local, x, in_dim * sizeof(float));
|
| 121 |
+
if (in_padded > in_dim)
|
| 122 |
+
memset(x_local + in_dim, 0, (in_padded - in_dim) * sizeof(float));
|
| 123 |
+
|
| 124 |
+
for (int p = 0; p < n_planes; p++) {
|
| 125 |
+
const uint64_t *plane_row = layer->mag_planes +
|
| 126 |
+
((size_t)p * out_dim + i) * chunks;
|
| 127 |
+
__m512 acc = _mm512_setzero_ps();
|
| 128 |
+
|
| 129 |
+
for (int c = 0; c < chunks; c++) {
|
| 130 |
+
uint64_t mbits = plane_row[c];
|
| 131 |
+
uint64_t sbits = row_sign[c];
|
| 132 |
+
uint64_t pos_bits = mbits & ~sbits;
|
| 133 |
+
uint64_t neg_bits = mbits & sbits;
|
| 134 |
+
|
| 135 |
+
for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
|
| 136 |
+
int offset = c * 64 + g * 16;
|
| 137 |
+
__m512 xv = _mm512_load_ps(x_local + offset);
|
| 138 |
+
__mmask16 pmask = (__mmask16)((pos_bits >> (g * 16)) & 0xFFFF);
|
| 139 |
+
__mmask16 nmask = (__mmask16)((neg_bits >> (g * 16)) & 0xFFFF);
|
| 140 |
+
acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
|
| 141 |
+
acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
|
| 142 |
+
}
|
| 143 |
+
}
|
| 144 |
+
total += _mm512_reduce_add_ps(acc);
|
| 145 |
+
}
|
| 146 |
+
y[i] = total * layer->scales[i];
|
| 147 |
+
if (layer->bias) y[i] += layer->bias[i];
|
| 148 |
+
}
|
| 149 |
+
}
|
| 150 |
+
|
| 151 |
+
/* FP16 matvec for lm_head: y = W @ x with W stored as row-major half floats.
 * Main loop converts 16 halves at a time (VCVTPH2PS) and FMA-accumulates;
 * scalar tail handles in_dim not divisible by 16. */
static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
    int out_dim = layer->out_dim;
    int in_dim = layer->in_dim;
    const uint16_t *w = layer->weight;

#pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);   /* 16 half -> 16 float */
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Tail: convert one half at a time through an XMM register */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
|
| 178 |
+
|
| 179 |
+
/* ============================================================
|
| 180 |
+
* Basic ops - all AVX-512 vectorized
|
| 181 |
+
* ============================================================ */
|
| 182 |
+
|
| 183 |
+
/* RMSNorm: y[i] = x[i] / sqrt(mean(x^2) + eps) * weight[i].
 * Safe when y aliases x (the sum pass completes before any write, and the
 * scale pass reads x[i] before writing y[i]). Callers rely on this for
 * in-place QK-Norm. */
static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
    /* Pass 1: sum of squares */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];
    float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);

    /* Pass 2: scale by 1/rms and the learned weight */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
|
| 202 |
+
|
| 203 |
+
static void silu_inplace(float *x, int n) {
|
| 204 |
+
int i;
|
| 205 |
+
for (i = 0; i + 16 <= n; i += 16) {
|
| 206 |
+
__m512 xv = _mm512_loadu_ps(x + i);
|
| 207 |
+
__m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), xv);
|
| 208 |
+
/* exp(-x) approximation not great with AVX, use scalar */
|
| 209 |
+
float tmp[16];
|
| 210 |
+
_mm512_storeu_ps(tmp, xv);
|
| 211 |
+
for (int j = 0; j < 16; j++)
|
| 212 |
+
tmp[j] = tmp[j] / (1.0f + expf(-tmp[j]));
|
| 213 |
+
_mm512_storeu_ps(x + i, _mm512_loadu_ps(tmp));
|
| 214 |
+
}
|
| 215 |
+
for (; i < n; i++)
|
| 216 |
+
x[i] = x[i] / (1.0f + expf(-x[i]));
|
| 217 |
+
}
|
| 218 |
+
|
| 219 |
+
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
|
| 220 |
+
int i;
|
| 221 |
+
for (i = 0; i + 16 <= n; i += 16) {
|
| 222 |
+
__m512 av = _mm512_loadu_ps(a + i);
|
| 223 |
+
__m512 bv = _mm512_loadu_ps(b + i);
|
| 224 |
+
_mm512_storeu_ps(c + i, _mm512_mul_ps(av, bv));
|
| 225 |
+
}
|
| 226 |
+
for (; i < n; i++) c[i] = a[i] * b[i];
|
| 227 |
+
}
|
| 228 |
+
|
| 229 |
+
static void vec_add(float *y, const float *x, int n) {
|
| 230 |
+
int i;
|
| 231 |
+
for (i = 0; i + 16 <= n; i += 16) {
|
| 232 |
+
__m512 yv = _mm512_loadu_ps(y + i);
|
| 233 |
+
__m512 xv = _mm512_loadu_ps(x + i);
|
| 234 |
+
_mm512_storeu_ps(y + i, _mm512_add_ps(yv, xv));
|
| 235 |
+
}
|
| 236 |
+
for (; i < n; i++) y[i] += x[i];
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
/* Rotary position embedding, applied in place to one head's vector.
 * Pair (vec[i], vec[i+1]) is rotated by angle pos * theta^(-i/dim).
 * NOTE(review): this is the interleaved-pair convention; HF Qwen
 * checkpoints store Q/K in rotate-half layout — presumably the conversion
 * script permutes the projection rows to match. Confirm against converter. */
static void apply_rope(float *vec, int pos, int dim, float theta) {
    for (int i = 0; i < dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / dim);
        float angle = pos * freq;
        float cos_a = cosf(angle);
        float sin_a = sinf(angle);
        float v0 = vec[i];
        float v1 = vec[i + 1];
        vec[i] = v0 * cos_a - v1 * sin_a;
        vec[i + 1] = v0 * sin_a + v1 * cos_a;
    }
}
|
| 251 |
+
|
| 252 |
+
static void softmax(float *x, int n) {
|
| 253 |
+
float max_val = x[0];
|
| 254 |
+
for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
|
| 255 |
+
float sum = 0.0f;
|
| 256 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
|
| 257 |
+
float inv = 1.0f / sum;
|
| 258 |
+
for (int i = 0; i < n; i++) x[i] *= inv;
|
| 259 |
+
}
|
| 260 |
+
|
| 261 |
+
/* ============================================================
|
| 262 |
+
* Embedding lookup (FP16 -> FP32)
|
| 263 |
+
* ============================================================ */
|
| 264 |
+
/* Copy one embedding row (FP16) into out (FP32).
 * Bulk path converts 16 halves per iteration; tail converts one at a time
 * through an XMM register. */
static void embed_token(const Model *m, int token_id, float *out) {
    int hidden = m->cfg.hidden;
    const uint16_t *row = m->embed + (size_t)token_id * hidden;
    int i;
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);   /* 16 half -> 16 float */
        _mm512_storeu_ps(out + i, fv);
    }
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
|
| 279 |
+
|
| 280 |
+
/* KV cache helpers */
/* Address of the head_dim-long K or V vector for (layer, pos, kv_head).
 * Cache layout: [n_layers][MAX_SEQ][n_kv_heads][head_dim] floats. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
|
| 285 |
+
|
| 286 |
+
/* ============================================================
|
| 287 |
+
* ATTENTION
|
| 288 |
+
* ============================================================ */
|
| 289 |
+
/* ============================================================
 * ATTENTION
 * One decode step for one layer. Reads the normed hidden state from
 * m->hidden2, writes the attended, o-projected result back to m->hidden2.
 * Appends this position's K/V to the cache and attends over [0, pos].
 * ============================================================ */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;  /* GQA group size */

    /* QKV projections from the normed residual */
    unary_matvec(&layer->q_proj, m->hidden2, m->q);
    unary_matvec(&layer->k_proj, m->hidden2, m->k);
    unary_matvec(&layer->v_proj, m->hidden2, m->v);

    if (c->has_attn_bias) {
        if (layer->q_bias) vec_add(m->q, layer->q_bias, c->n_heads * c->head_dim);
        if (layer->k_bias) vec_add(m->k, layer->k_bias, c->n_kv_heads * c->head_dim);
        if (layer->v_bias) vec_add(m->v, layer->v_bias, c->n_kv_heads * c->head_dim);
    }

    /* QK-Norm (Qwen3): RMSNorm each head's Q and K before RoPE
     * (rmsnorm supports y == x, used here for in-place normalization) */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q + h * c->head_dim, layer->q_norm, m->q + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k + h * c->head_dim, layer->k_norm, m->k + h * c->head_dim, c->head_dim);
    }

    /* Rotary position embedding on Q and K (per head) */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Append this position's K/V to the cache */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v + h * c->head_dim, c->head_dim * sizeof(float));
    }

    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    /* Per head: scores = softmax(q . K / sqrt(d)) over positions 0..pos,
     * then weighted sum of cached V. m->attn_scores is reused serially,
     * one head at a time. */
    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv;   /* which KV head this query head shares */
        float *q_head = m->q + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 qv = _mm512_loadu_ps(q_head + d);
                __m512 kv = _mm512_loadu_ps(k_cached + d);
                acc = _mm512_fmadd_ps(qv, kv, acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < c->head_dim; d++) dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;   /* skip negligible weights */
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 ov = _mm512_loadu_ps(out_head + d);
                __m512 vv = _mm512_loadu_ps(v_cached + d);
                _mm512_storeu_ps(out_head + d, _mm512_fmadd_ps(wv, vv, ov));
            }
            for (; d < c->head_dim; d++) out_head[d] += w * v_cached[d];
        }
    }

    /* Output projection back to model width, overwriting hidden2 */
    unary_matvec(&layer->o_proj, m->attn_out, m->hidden2);
}
|
| 367 |
+
|
| 368 |
+
/* ============================================================
|
| 369 |
+
* MLP - SwiGLU
|
| 370 |
+
* ============================================================ */
|
| 371 |
+
/* ============================================================
 * MLP - SwiGLU
 * Reads the post-attention normed state from m->hidden2 and writes the
 * down-projected result back to m->hidden2.
 * ============================================================ */
static void mlp(Model *m, int layer_idx) {
    Layer *blk = &m->layers[layer_idx];
    const int width = m->cfg.inter;

    /* gate = W_g x, up = W_u x */
    unary_matvec(&blk->gate_proj, m->hidden2, m->gate);
    unary_matvec(&blk->up_proj, m->hidden2, m->up);

    /* down_in = silu(gate) * up, then project back to model width */
    silu_inplace(m->gate, width);
    elemwise_mul(m->gate, m->up, m->down_in, width);
    unary_matvec(&blk->down_proj, m->down_in, m->hidden2);
}
|
| 383 |
+
|
| 384 |
+
/* ============================================================
|
| 385 |
+
* FORWARD ONE TOKEN
|
| 386 |
+
* ============================================================ */
|
| 387 |
+
/* ============================================================
 * FORWARD ONE TOKEN
 * Runs the full stack for token_id at sequence position pos and returns
 * a pointer to m->logits ([vocab], valid until the next call).
 * ============================================================ */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    embed_token(m, token_id, m->hidden);

    /* Pre-norm residual blocks: x += Attn(norm(x)); x += MLP(norm(x)) */
    for (int l = 0; l < c->n_layers; l++) {
        rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, c->hidden);
        attention(m, l, pos);                     /* result in hidden2 */
        vec_add(m->hidden, m->hidden2, c->hidden);
        rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, c->hidden);
        mlp(m, l);                                /* result in hidden2 */
        vec_add(m->hidden, m->hidden2, c->hidden);
    }

    rmsnorm(m->hidden, m->final_norm, m->hidden2, c->hidden);

    /* LM head - either tied embeddings or separate FP16 */
    if (c->tie_embeddings) {
        /* Wrap the embedding table as a temporary FP16Linear */
        FP16Linear tied;
        tied.weight = m->embed;
        tied.out_dim = c->vocab;
        tied.in_dim = c->hidden;
        fp16_matvec(&tied, m->hidden2, m->logits);
    } else {
        fp16_matvec(&m->lm_head, m->hidden2, m->logits);
    }

    return m->logits;
}
|
| 417 |
+
|
| 418 |
+
/* ============================================================
|
| 419 |
+
* TOP-P SAMPLING
|
| 420 |
+
* ============================================================ */
|
| 421 |
+
/* ============================================================
 * TOP-P SAMPLING
 * Temperature-scales the logits (in place!), softmaxes them, keeps the
 * highest-probability tokens until their cumulative mass reaches top_p
 * (hard-capped at 40 tokens, i.e. an implicit top-k=40), and samples
 * from the kept set proportionally.
 * NOTE(review): malloc results are unchecked, and rand() is a weak,
 * process-global RNG — acceptable for a demo sampler.
 * ============================================================ */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);   /* logits now hold probabilities */

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: move the next-largest prob to slot n_keep
     * until the kept mass reaches top_p (or the 40-token cap). */
    int n_keep = 0;
    float cum = 0.0f;
    while (cum < top_p && n_keep < vocab) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp_p = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp_p;
        int tmp_i = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = tmp_i;
        cum += probs[n_keep];
        n_keep++;
        if (n_keep >= 40) break;   /* hard top-k cap */
    }

    /* Draw uniformly in [0, kept mass) and walk the kept prefix.
     * chosen defaults to the top token if float rounding lets acc < r. */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0];
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i];
        if (acc >= r) { chosen = indices[i]; break; }
    }

    free(probs);
    free(indices);
    return chosen;
}
|
| 460 |
+
|
| 461 |
+
/* ============================================================
|
| 462 |
+
* GENERATE
|
| 463 |
+
* ============================================================ */
|
| 464 |
+
int generate(
|
| 465 |
+
Model *m,
|
| 466 |
+
const int *prompt_ids, int prompt_len,
|
| 467 |
+
int *out_tokens, int max_new_tokens,
|
| 468 |
+
float temperature, float top_p,
|
| 469 |
+
int eos_token
|
| 470 |
+
) {
|
| 471 |
+
srand(time(NULL));
|
| 472 |
+
|
| 473 |
+
for (int i = 0; i < prompt_len; i++) {
|
| 474 |
+
forward_token(m, prompt_ids[i], i);
|
| 475 |
+
}
|
| 476 |
+
|
| 477 |
+
int pos = prompt_len;
|
| 478 |
+
int generated = 0;
|
| 479 |
+
|
| 480 |
+
for (int t = 0; t < max_new_tokens; t++) {
|
| 481 |
+
float *logits = m->logits;
|
| 482 |
+
|
| 483 |
+
int next_token;
|
| 484 |
+
if (temperature <= 0) {
|
| 485 |
+
next_token = 0;
|
| 486 |
+
for (int i = 1; i < m->cfg.vocab; i++)
|
| 487 |
+
if (logits[i] > logits[next_token]) next_token = i;
|
| 488 |
+
} else {
|
| 489 |
+
next_token = sample_top_p(logits, m->cfg.vocab, temperature, top_p);
|
| 490 |
+
}
|
| 491 |
+
|
| 492 |
+
out_tokens[t] = next_token;
|
| 493 |
+
generated++;
|
| 494 |
+
|
| 495 |
+
if (next_token == eos_token) break;
|
| 496 |
+
|
| 497 |
+
forward_token(m, next_token, pos);
|
| 498 |
+
pos++;
|
| 499 |
+
}
|
| 500 |
+
|
| 501 |
+
return generated;
|
| 502 |
+
}
|
| 503 |
+
|
| 504 |
+
/* ============================================================
|
| 505 |
+
* MODEL ALLOCATION with config
|
| 506 |
+
* ============================================================ */
|
| 507 |
+
/* ============================================================
 * MODEL ALLOCATION with config
 * Allocates the Model, its layer array, the KV cache, and all scratch
 * buffers (64-byte aligned for AVX-512). Weights themselves are attached
 * later via the setter functions below.
 * NOTE(review): allocation results are unchecked; a failed calloc /
 * aligned_alloc here will crash at first use.
 * ============================================================ */
Model* model_alloc(
    int n_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int has_attn_bias, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    m->n_planes = n_planes;

    Config *c = &m->cfg;
    c->hidden = hidden;
    c->inter = inter;
    c->n_heads = n_heads;
    c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim;
    c->n_layers = n_layers;
    c->vocab = vocab;
    c->rope_theta = rope_theta;
    c->has_attn_bias = has_attn_bias;
    c->tie_embeddings = tie_embeddings;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* KV cache sized for the full MAX_SEQ context, zero-initialized */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* Scratch buffers, 64-byte aligned for aligned AVX-512 loads */
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->hidden2 = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->q = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up = (float *)aligned_alloc(64, inter * sizeof(float));
    m->down_in = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));

    size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);  /* K + V */
    printf("Model config: hidden=%d inter=%d heads=%d kv_heads=%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf("KV cache: %zu MB, tied_embed=%d, attn_bias=%d\n",
           kv_mb, tie_embeddings, has_attn_bias);

    return m;
}
|
| 555 |
+
|
| 556 |
+
/* Weight setters */
/* Embedding table: stores the caller's pointer (NOT copied); the caller
 * keeps ownership and must keep it alive for the model's lifetime. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
/* Final-norm weights: COPIED into the buffer model_alloc created. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
/* LM head weights (untied models): stores the caller's pointer, not copied. */
void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
    m->lm_head.weight = data;
    m->lm_head.out_dim = out_dim;
    m->lm_head.in_dim = in_dim;
}
|
| 564 |
+
|
| 565 |
+
/* Per-layer RMSNorm weights; pointers are stored, not copied. */
void layer_set_norms(Model *m, int l, float *input_norm, float *post_norm) {
    m->layers[l].input_norm = input_norm;
    m->layers[l].post_norm = post_norm;
}

/* QKV biases (Qwen2-style models); NULL entries are allowed and skipped
 * at runtime. Pointers are stored, not copied. */
void layer_set_bias(Model *m, int l, float *q_bias, float *k_bias, float *v_bias) {
    m->layers[l].q_bias = q_bias;
    m->layers[l].k_bias = k_bias;
    m->layers[l].v_bias = v_bias;
}

/* QK-Norm weights (Qwen3-style models); NULL disables QK-Norm for the
 * layer. Pointers are stored, not copied. */
void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
    m->layers[l].q_norm = q_norm;
    m->layers[l].k_norm = k_norm;
}
|
| 580 |
+
|
| 581 |
+
/* Attach pre-packed unary weight buffers to one UnaryLinear.
 * Pointers are stored, not copied; bias starts NULL (no bias). */
void layer_set_unary(
    UnaryLinear *ul,
    uint64_t *sign, uint64_t *planes, float *scales,
    int out_dim, int in_dim, int n_planes
) {
    ul->sign_bits = sign;
    ul->mag_planes = planes;
    ul->scales = scales;
    ul->out_dim = out_dim;
    ul->in_dim = in_dim;
    ul->n_planes = n_planes;
    ul->bias = NULL;   /* set separately if the checkpoint has one */
}
|
| 594 |
+
|
| 595 |
+
/* Attach all seven unary projections of layer l in one call.
 * Argument prefixes: q/k/v/o = attention projections, g/u/d =
 * gate/up/down MLP projections; each triple is (sign bits, magnitude
 * planes, per-row scales) plus its (out, in) dims. All projections share
 * the same plane count n_planes. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *q_sign, uint64_t *q_planes, float *q_scales, int q_out, int q_in,
    uint64_t *k_sign, uint64_t *k_planes, float *k_scales, int k_out, int k_in,
    uint64_t *v_sign, uint64_t *v_planes, float *v_scales, int v_out, int v_in,
    uint64_t *o_sign, uint64_t *o_planes, float *o_scales, int o_out, int o_in,
    uint64_t *g_sign, uint64_t *g_planes, float *g_scales, int g_out, int g_in,
    uint64_t *u_sign, uint64_t *u_planes, float *u_scales, int u_out, int u_in,
    uint64_t *d_sign, uint64_t *d_planes, float *d_scales, int d_out, int d_in,
    int n_planes
) {
    layer_set_unary(&m->layers[l].q_proj, q_sign, q_planes, q_scales, q_out, q_in, n_planes);
    layer_set_unary(&m->layers[l].k_proj, k_sign, k_planes, k_scales, k_out, k_in, n_planes);
    layer_set_unary(&m->layers[l].v_proj, v_sign, v_planes, v_scales, v_out, v_in, n_planes);
    layer_set_unary(&m->layers[l].o_proj, o_sign, o_planes, o_scales, o_out, o_in, n_planes);
    layer_set_unary(&m->layers[l].gate_proj, g_sign, g_planes, g_scales, g_out, g_in, n_planes);
    layer_set_unary(&m->layers[l].up_proj, u_sign, u_planes, u_scales, u_out, u_in, n_planes);
    layer_set_unary(&m->layers[l].down_proj, d_sign, d_planes, d_scales, d_out, d_in, n_planes);
}
|
| 614 |
+
|
| 615 |
+
/* Zero the KV cache so a fresh sequence can start at position 0. */
void model_reset_cache(Model *m) {
    size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
    memset(m->k_cache, 0, kv_size * sizeof(float));
    memset(m->v_cache, 0, kv_size * sizeof(float));
}
|
| 620 |
+
|
| 621 |
+
/* Free everything model_alloc created. Weight buffers attached through
 * the setters (embed, lm_head, per-layer sign/plane/scale/norm arrays)
 * are caller-owned pointers and are intentionally NOT freed here. */
void model_free(Model *m) {
    free(m->k_cache); free(m->v_cache);
    free(m->hidden); free(m->hidden2);
    free(m->q); free(m->k); free(m->v);
    free(m->attn_out); free(m->gate); free(m->up); free(m->down_in);
    free(m->logits); free(m->attn_scores); free(m->final_norm);
    free(m->layers);
    free(m);
}
|
unary_full.c
ADDED
|
@@ -0,0 +1,742 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* FULL UNARY ENGINE - Weights AND Activations in Base-1
|
| 3 |
+
*
|
| 4 |
+
* True unary: the entire matmul is popcount of ANDed bitplanes.
|
| 5 |
+
* No floating point in the inner loop. No multiplication anywhere.
|
| 6 |
+
*
|
| 7 |
+
* Weight w with magnitude M_w (thermometer: M_w planes with bit set)
|
| 8 |
+
* Activation x with magnitude M_x (thermometer: M_x planes with bit set)
|
| 9 |
+
*
|
| 10 |
+
* dot(w, x) for row i:
|
| 11 |
+
* For each weight plane p (0..W-1) and act plane q (0..A-1):
|
| 12 |
+
* contribution = popcount( w_plane_p[i] AND act_plane_q AND same_sign )
|
| 13 |
+
* - popcount( w_plane_p[i] AND act_plane_q AND diff_sign )
|
| 14 |
+
* y[i] = sum_of_contributions * w_scale[i] * act_scale
|
| 15 |
+
*
|
| 16 |
+
* The outer sum has W*A terms, each is a popcount over 64 elements.
|
| 17 |
+
* With W=4, A=4: 16 popcounts per 64 elements = insanely fast.
|
| 18 |
+
*
|
| 19 |
+
* AVX-512 VPOPCNTDQ: one instruction for 8x64-bit popcounts.
|
| 20 |
+
* On Skylake (no VPOPCNTDQ): use Harley-Seal or scalar POPCNT.
|
| 21 |
+
*
|
| 22 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 23 |
+
*/
|
| 24 |
+
|
| 25 |
+
#include <immintrin.h>
|
| 26 |
+
#include <stdint.h>
|
| 27 |
+
#include <stdlib.h>
|
| 28 |
+
#include <string.h>
|
| 29 |
+
#include <math.h>
|
| 30 |
+
#include <stdio.h>
|
| 31 |
+
#include <time.h>
|
| 32 |
+
#include <omp.h>
|
| 33 |
+
#include <x86intrin.h>
|
| 34 |
+
|
| 35 |
+
/* ============================================================
|
| 36 |
+
* Config (DeepSeek-R1-Distill-Qwen-1.5B)
|
| 37 |
+
* ============================================================ */
|
| 38 |
+
#define HIDDEN 1536
|
| 39 |
+
#define INTER 8960
|
| 40 |
+
#define N_HEADS 12
|
| 41 |
+
#define N_KV_HEADS 2
|
| 42 |
+
#define HEAD_DIM 128
|
| 43 |
+
#define N_LAYERS 28
|
| 44 |
+
#define VOCAB 151936
|
| 45 |
+
#define RMS_EPS 1e-6f
|
| 46 |
+
#define ROPE_THETA 1000000.0f
|
| 47 |
+
#define MAX_SEQ 4096
|
| 48 |
+
#define HEADS_PER_KV (N_HEADS / N_KV_HEADS)
|
| 49 |
+
|
| 50 |
+
/* Unary config */
|
| 51 |
+
#define W_PLANES 4 /* weight magnitude planes (5 levels: 0-4) */
|
| 52 |
+
#define A_PLANES 8 /* activation magnitude planes (9 levels: 0-8) */
|
| 53 |
+
|
| 54 |
+
/* ============================================================
|
| 55 |
+
* Portable popcount for 64-bit
|
| 56 |
+
* Uses hardware POPCNT (available on Skylake)
|
| 57 |
+
* ============================================================ */
|
| 58 |
+
/* Number of set bits in a 64-bit word; compiles to a single POPCNT
 * instruction on targets that have it (e.g. Skylake). */
static inline int popcnt64(uint64_t x) {
    const int n_bits = __builtin_popcountll(x);
    return n_bits;
}
|
| 61 |
+
|
| 62 |
+
/* ============================================================
|
| 63 |
+
* Unary Linear Layer (weight storage)
|
| 64 |
+
* ============================================================ */
|
| 65 |
+
/* Unary-quantized linear layer (weights only).
 * Weights are stored as a sign bitplane plus n_planes thermometer
 * magnitude bitplanes, with one float scale per output row.
 * chunks = ceil(in_dim / 64); all bitplane arrays are packed 64 inputs
 * per uint64_t word. */
typedef struct {
    uint64_t *sign_bits;  /* [out_dim * chunks] - bit set = weight is negative */
    uint64_t *mag_planes; /* [W_PLANES * out_dim * chunks] - plane-major thermometer code */
    float *scales;        /* [out_dim] per-row dequantization scale */
    float *bias;          /* [out_dim] or NULL when the layer has no bias */
    int out_dim;
    int in_dim;
    int n_planes;         /* number of magnitude planes actually used (<= W_PLANES) */
} UnaryLinear;
|
| 74 |
+
|
| 75 |
+
/* FP16 Linear (for lm_head and embed) */
|
| 76 |
+
/* Plain FP16 linear layer; used only where unary quantization would hurt
 * quality too much (lm_head) or is unnecessary (embedding lookup). */
typedef struct {
    uint16_t *weight;  /* [out_dim * in_dim] row-major IEEE half-precision */
    int out_dim;
    int in_dim;
} FP16Linear;
|
| 81 |
+
|
| 82 |
+
/* ============================================================
|
| 83 |
+
* Quantized Activation Buffer
|
| 84 |
+
* Activations quantized to unary thermometer on the fly.
|
| 85 |
+
* ============================================================ */
|
| 86 |
+
/* Activation vector quantized on the fly into unary bitplanes so the
 * matmul can run as pure popcounts. Unlike weights (per-row scales),
 * a whole activation vector shares one scale. */
typedef struct {
    uint64_t *sign_bits;  /* [chunks] - bit set = element is negative */
    uint64_t *mag_planes; /* [A_PLANES * chunks] thermometer magnitude planes */
    float scale;          /* single dequantization scale for the entire vector */
    int dim;              /* logical element count */
    int chunks;           /* ceil(dim / 64) packed 64-bit words */
} QuantAct;
|
| 93 |
+
|
| 94 |
+
/* ============================================================
|
| 95 |
+
* Transformer Layer
|
| 96 |
+
* ============================================================ */
|
| 97 |
+
/* One transformer decoder layer: attention + SwiGLU MLP projections,
 * the two RMSNorm weight vectors, and optional QKV biases.
 * Norm/bias pointers are loader-owned (set via layer_set_norms /
 * layer_set_bias); biases may be NULL. */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;   /* attention projections */
    UnaryLinear gate_proj, up_proj, down_proj;    /* SwiGLU MLP projections */
    float *input_norm;   /* RMSNorm weights before attention */
    float *post_norm;    /* RMSNorm weights before the MLP */
    float *q_bias, *k_bias, *v_bias;  /* optional QKV biases, NULL if absent */
} Layer;
|
| 104 |
+
|
| 105 |
+
/* ============================================================
|
| 106 |
+
* Full Model
|
| 107 |
+
* ============================================================ */
|
| 108 |
+
/* Whole-model state: weights, KV cache, and all scratch buffers needed
 * to run one token through the network without further allocation. */
typedef struct {
    uint16_t *embed;        /* [VOCAB * HIDDEN] FP16 token embedding table */
    Layer layers[N_LAYERS];
    float *final_norm;      /* [HIDDEN] RMSNorm weights before lm_head */
    FP16Linear lm_head;

    /* KV cache (kept as float - only used in attention dot products,
     * which are O(seq_len) and not the bottleneck) */
    float *k_cache;         /* [N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM] */
    float *v_cache;         /* same layout as k_cache */

    /* Float scratch buffers reused between operations */
    float *hidden;          /* [HIDDEN] residual stream */
    float *hidden2;         /* [HIDDEN] per-sublayer working buffer */
    float *q_buf;           /* [N_HEADS * HEAD_DIM] */
    float *k_buf;           /* [N_KV_HEADS * HEAD_DIM] */
    float *v_buf;           /* [N_KV_HEADS * HEAD_DIM] */
    float *attn_out;        /* [N_HEADS * HEAD_DIM] */
    float *gate_buf;        /* [INTER] */
    float *up_buf;          /* [INTER] */
    float *mlp_buf;         /* [INTER] for silu(gate)*up result */
    float *logits;          /* [VOCAB] */
    float *attn_scores;     /* [MAX_SEQ] per-head score row */

    /* Quantized activation buffers (reusable across layers) */
    QuantAct qa_hidden;     /* for HIDDEN-dim activations */
    QuantAct qa_inter;      /* for INTER-dim activations */

    int n_w_planes;         /* weight magnitude planes in the loaded model */
    int n_a_planes;         /* activation magnitude planes (== A_PLANES) */
} Model;
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
/* ============================================================
|
| 141 |
+
* QUANTIZE ACTIVATION TO UNARY (on the fly)
|
| 142 |
+
*
|
| 143 |
+
* Takes float vector, produces unary bitplanes.
|
| 144 |
+
* This is the key operation that enables full-unary matmul.
|
| 145 |
+
* ============================================================ */
|
| 146 |
+
/* Scalar reference quantizer: convert a float vector into a sign
 * bitplane plus A_PLANES thermometer magnitude planes.
 * The scale is absmax / A_PLANES, so the largest element lands
 * exactly on the top level. */
static void quantize_activation(const float *x, QuantAct *qa) {
    const int dim = qa->dim;
    const int chunks = qa->chunks;

    /* Per-vector scale from the largest magnitude (1.0 if all-zero). */
    float peak = 0.0f;
    for (int i = 0; i < dim; i++) {
        const float mag = fabsf(x[i]);
        if (mag > peak) peak = mag;
    }
    if (peak == 0.0f) peak = 1.0f;

    qa->scale = peak / A_PLANES;
    const float inv_scale = 1.0f / qa->scale;

    /* Start from all-clear bitplanes. */
    memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
    memset(qa->mag_planes, 0, (size_t)A_PLANES * chunks * sizeof(uint64_t));

    for (int i = 0; i < dim; i++) {
        const int word = i / 64;
        const uint64_t bit = (uint64_t)1 << (i % 64);

        float v = x[i];
        if (v < 0) {
            qa->sign_bits[word] |= bit;
            v = -v;
        }

        /* Round-to-nearest level, saturating at A_PLANES. */
        int level = (int)(v * inv_scale + 0.5f);
        if (level > A_PLANES) level = A_PLANES;

        /* Thermometer code: planes 0..level-1 carry the bit. */
        for (int p = 0; p < level; p++)
            qa->mag_planes[p * chunks + word] |= bit;
    }
}
|
| 187 |
+
|
| 188 |
+
/* Vectorized quantize - process 64 elements at a time */
|
| 189 |
+
static void quantize_activation_fast(const float *x, QuantAct *qa) {
|
| 190 |
+
int dim = qa->dim;
|
| 191 |
+
int chunks = qa->chunks;
|
| 192 |
+
int padded = chunks * 64;
|
| 193 |
+
int n_planes = A_PLANES;
|
| 194 |
+
|
| 195 |
+
/* Find absmax with AVX-512 */
|
| 196 |
+
__m512 vmax = _mm512_setzero_ps();
|
| 197 |
+
int i;
|
| 198 |
+
for (i = 0; i + 16 <= dim; i += 16) {
|
| 199 |
+
__m512 xv = _mm512_loadu_ps(x + i);
|
| 200 |
+
__m512 av = _mm512_abs_ps(xv);
|
| 201 |
+
vmax = _mm512_max_ps(vmax, av);
|
| 202 |
+
}
|
| 203 |
+
float absmax = _mm512_reduce_max_ps(vmax);
|
| 204 |
+
for (; i < dim; i++) {
|
| 205 |
+
float a = fabsf(x[i]);
|
| 206 |
+
if (a > absmax) absmax = a;
|
| 207 |
+
}
|
| 208 |
+
if (absmax == 0.0f) absmax = 1.0f;
|
| 209 |
+
|
| 210 |
+
qa->scale = absmax / n_planes;
|
| 211 |
+
float inv_scale = (float)n_planes / absmax;
|
| 212 |
+
|
| 213 |
+
/* Clear */
|
| 214 |
+
memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
|
| 215 |
+
memset(qa->mag_planes, 0, n_planes * chunks * sizeof(uint64_t));
|
| 216 |
+
|
| 217 |
+
/* Process 16 floats at a time, pack bits */
|
| 218 |
+
__m512 v_inv = _mm512_set1_ps(inv_scale);
|
| 219 |
+
__m512 v_half = _mm512_set1_ps(0.5f);
|
| 220 |
+
__m512 v_zero = _mm512_setzero_ps();
|
| 221 |
+
|
| 222 |
+
for (int c = 0; c < chunks; c++) {
|
| 223 |
+
uint64_t sign_word = 0;
|
| 224 |
+
uint64_t plane_words[A_PLANES];
|
| 225 |
+
memset(plane_words, 0, sizeof(plane_words));
|
| 226 |
+
|
| 227 |
+
for (int g = 0; g < 4; g++) {
|
| 228 |
+
int offset = c * 64 + g * 16;
|
| 229 |
+
if (offset >= dim) break;
|
| 230 |
+
|
| 231 |
+
/* Load 16 floats */
|
| 232 |
+
__m512 xv;
|
| 233 |
+
if (offset + 16 <= dim) {
|
| 234 |
+
xv = _mm512_loadu_ps(x + offset);
|
| 235 |
+
} else {
|
| 236 |
+
/* Partial load at end */
|
| 237 |
+
xv = _mm512_setzero_ps();
|
| 238 |
+
for (int j = 0; j < dim - offset; j++) {
|
| 239 |
+
((float*)&xv)[j] = x[offset + j];
|
| 240 |
+
}
|
| 241 |
+
}
|
| 242 |
+
|
| 243 |
+
/* Sign: negative mask */
|
| 244 |
+
__mmask16 neg_mask = _mm512_cmplt_ps_mask(xv, v_zero);
|
| 245 |
+
sign_word |= ((uint64_t)neg_mask << (g * 16));
|
| 246 |
+
|
| 247 |
+
/* Absolute value and quantize */
|
| 248 |
+
__m512 av = _mm512_abs_ps(xv);
|
| 249 |
+
__m512 qv = _mm512_fmadd_ps(av, v_inv, v_half);
|
| 250 |
+
|
| 251 |
+
/* Convert to int and clamp */
|
| 252 |
+
__m512i iv = _mm512_cvttps_epi32(qv);
|
| 253 |
+
__m512i v_max = _mm512_set1_epi32(n_planes);
|
| 254 |
+
iv = _mm512_min_epi32(iv, v_max);
|
| 255 |
+
|
| 256 |
+
/* Thermometer encode: plane p has bit set if magnitude > p */
|
| 257 |
+
for (int p = 0; p < n_planes; p++) {
|
| 258 |
+
__m512i vp = _mm512_set1_epi32(p + 1);
|
| 259 |
+
__mmask16 active = _mm512_cmpge_epi32_mask(iv, vp);
|
| 260 |
+
plane_words[p] |= ((uint64_t)active << (g * 16));
|
| 261 |
+
}
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
+
qa->sign_bits[c] = sign_word;
|
| 265 |
+
for (int p = 0; p < n_planes; p++) {
|
| 266 |
+
qa->mag_planes[p * chunks + c] = plane_words[p];
|
| 267 |
+
}
|
| 268 |
+
}
|
| 269 |
+
}
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
/* ============================================================
|
| 273 |
+
* FULL-UNARY MATVEC via POPCOUNT
|
| 274 |
+
*
|
| 275 |
+
* y[i] = w_scale[i] * act_scale *
|
| 276 |
+
* sum_{p=0}^{W-1} sum_{q=0}^{A-1}
|
| 277 |
+
* ( popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND ~a_sign) // both positive
|
| 278 |
+
* + popcount(w_plane_p[i] AND a_plane_q AND w_sign AND a_sign) // both negative (neg*neg=pos)
|
| 279 |
+
* - popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND a_sign) // pos weight * neg act
|
| 280 |
+
* - popcount(w_plane_p[i] AND a_plane_q AND w_sign AND ~a_sign) ) // neg weight * pos act
|
| 281 |
+
*
|
| 282 |
+
* Simplification: same_sign = ~(w_sign XOR a_sign), diff_sign = w_sign XOR a_sign
|
| 283 |
+
* contribution = popcount(w_plane AND a_plane AND same_sign)
|
| 284 |
+
* - popcount(w_plane AND a_plane AND diff_sign)
|
| 285 |
+
* ============================================================ */
|
| 286 |
+
/* Full-unary matvec: y = W x computed entirely with AND + popcount.
 * For each output row i, every (weight plane p, activation plane q)
 * pair contributes +1 per element where both thermometer bits are set
 * and the signs agree, -1 where they differ; the integer total is then
 * scaled by the row's weight scale and the activation scale.
 * Rows are independent, hence the per-row OpenMP parallelism. */
static void unary_matvec_popcount(
    const UnaryLinear *layer, const QuantAct *qa, float *y
) {
    int out_dim = layer->out_dim;
    int chunks = qa->chunks;
    int n_w = layer->n_planes;
    int n_a = A_PLANES;
    float act_scale = qa->scale;

#pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign = layer->sign_bits + (size_t)i * chunks;
        long total = 0; /* integer accumulator! no FP in the inner loop */

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign[c];
            uint64_t as = qa->sign_bits[c];
            uint64_t same_sign = ~(ws ^ as); /* bits where signs agree */
            uint64_t diff_sign = ws ^ as;    /* bits where signs differ */

            for (int p = 0; p < n_w; p++) {
                /* mag_planes layout is plane-major: [p][row][chunk] */
                uint64_t wp = layer->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < n_a; q++) {
                    uint64_t aq = qa->mag_planes[q * chunks + c];
                    uint64_t active = wp & aq; /* both have magnitude at this level */

                    /* +1 where signs agree, -1 where they differ */
                    total += popcnt64(active & same_sign);
                    total -= popcnt64(active & diff_sign);
                }
            }
        }

        /* Dequantize: product of the two thermometer step sizes. */
        y[i] = (float)total * layer->scales[i] * act_scale;
        if (layer->bias) y[i] += layer->bias[i];
    }
}
|
| 323 |
+
|
| 324 |
+
|
| 325 |
+
/* ============================================================
|
| 326 |
+
* FP16 matvec for lm_head (final projection to vocab)
|
| 327 |
+
* ============================================================ */
|
| 328 |
+
/* FP16-weight x float-activation matvec, used for the lm_head
 * projection to the vocabulary. Main loop converts 16 halves to float
 * per step with VCVTPH2PS and accumulates with FMA; the scalar tail
 * converts one half at a time through an SSE register. */
static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
    int out_dim = layer->out_dim;
    int in_dim = layer->in_dim;
    const uint16_t *w = layer->weight;

#pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            /* Load 16 FP16 weights, widen to 16 floats. */
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Tail: convert one half at a time via broadcast + cvtph. */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
|
| 354 |
+
|
| 355 |
+
/* ============================================================
|
| 356 |
+
* Basic ops (still float for norms, residuals, attention)
|
| 357 |
+
* ============================================================ */
|
| 358 |
+
/* RMSNorm: y = x / sqrt(mean(x^2) + eps) * weight, element-wise.
 * Two passes: an FMA sum-of-squares reduction, then a scaled multiply.
 * Both passes have a scalar tail for dim not a multiple of 16. */
static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];
    /* Reciprocal root of the mean square (with epsilon for stability). */
    float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
|
| 376 |
+
|
| 377 |
+
static void silu_inplace(float *x, int n) {
|
| 378 |
+
int i;
|
| 379 |
+
__m512 one = _mm512_set1_ps(1.0f);
|
| 380 |
+
/* SiLU vectorized: x / (1 + exp(-x)) */
|
| 381 |
+
for (i = 0; i < n; i++) {
|
| 382 |
+
x[i] = x[i] / (1.0f + expf(-x[i]));
|
| 383 |
+
}
|
| 384 |
+
}
|
| 385 |
+
|
| 386 |
+
/* Element-wise product c[i] = a[i] * b[i]; AVX-512 main loop with a
 * scalar tail. c may not alias a or b within a 16-lane step
 * (call sites use distinct scratch buffers). */
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
    int i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m512 av = _mm512_loadu_ps(a + i);
        __m512 bv = _mm512_loadu_ps(b + i);
        _mm512_storeu_ps(c + i, _mm512_mul_ps(av, bv));
    }
    for (; i < n; i++) c[i] = a[i] * b[i];
}
|
| 395 |
+
|
| 396 |
+
/* In-place accumulate y[i] += x[i]; AVX-512 main loop, scalar tail.
 * Used for residual connections and bias adds. */
static void vec_add(float *y, const float *x, int n) {
    int i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m512 yv = _mm512_loadu_ps(y + i);
        __m512 xv = _mm512_loadu_ps(x + i);
        _mm512_storeu_ps(y + i, _mm512_add_ps(yv, xv));
    }
    for (; i < n; i++) y[i] += x[i];
}
|
| 405 |
+
|
| 406 |
+
static void apply_rope(float *vec, int pos, int dim) {
|
| 407 |
+
for (int i = 0; i < dim; i += 2) {
|
| 408 |
+
float freq = 1.0f / powf(ROPE_THETA, (float)i / dim);
|
| 409 |
+
float angle = pos * freq;
|
| 410 |
+
float c = cosf(angle), s = sinf(angle);
|
| 411 |
+
float v0 = vec[i], v1 = vec[i + 1];
|
| 412 |
+
vec[i] = v0 * c - v1 * s;
|
| 413 |
+
vec[i + 1] = v0 * s + v1 * c;
|
| 414 |
+
}
|
| 415 |
+
}
|
| 416 |
+
|
| 417 |
+
static void softmax(float *x, int n) {
|
| 418 |
+
float max_val = x[0];
|
| 419 |
+
for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
|
| 420 |
+
float sum = 0.0f;
|
| 421 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
|
| 422 |
+
float inv = 1.0f / sum;
|
| 423 |
+
for (int i = 0; i < n; i++) x[i] *= inv;
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
/* Copy one FP16 embedding row into `out` as float32.
 * Main loop widens 16 halves per step; the scalar tail (never taken
 * since HIDDEN is a multiple of 16, but kept for safety) converts one
 * half at a time via an SSE register. */
static void embed_token(const Model *m, int token_id, float *out) {
    const uint16_t *row = m->embed + (size_t)token_id * HIDDEN;
    int i;
    for (i = 0; i + 16 <= HIDDEN; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        _mm512_storeu_ps(out + i, _mm512_cvtph_ps(h));
    }
    for (; i < HIDDEN; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        _mm_store_ss(out + i, _mm_cvtph_ps(hv));
    }
}
|
| 438 |
+
|
| 439 |
+
/* Address of the HEAD_DIM-float KV-cache slot for (layer, pos, kv_head).
 * Cache layout: [layer][pos][kv_head][HEAD_DIM], contiguous. */
static float* kv_ptr(float *cache, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * N_KV_HEADS +
                    (size_t)pos * N_KV_HEADS + kv_head) * HEAD_DIM;
}
|
| 443 |
+
|
| 444 |
+
/* ============================================================
|
| 445 |
+
* ATTENTION
|
| 446 |
+
*
|
| 447 |
+
* Q/K/V projections use full-unary popcount matmul.
|
| 448 |
+
* Attention scores and value accumulation stay float
|
| 449 |
+
* (these are O(seq_len) not O(dim²), not the bottleneck).
|
| 450 |
+
* ============================================================ */
|
| 451 |
+
/* Grouped-query attention for one token at position `pos`.
 * Input: normalized hidden state in m->hidden2. Output: attention
 * block result written back to m->hidden2 (residual add is the
 * caller's job). QKV and output projections run as unary popcount
 * matvecs; the score/value math stays float since it is O(seq_len),
 * not O(dim^2). */
static void attention(Model *m, int layer_idx, int pos) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 to unary bitplanes for the projections. */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* Project Q, K, V via popcount matvec. */
    unary_matvec_popcount(&layer->q_proj, &m->qa_hidden, m->q_buf);
    unary_matvec_popcount(&layer->k_proj, &m->qa_hidden, m->k_buf);
    unary_matvec_popcount(&layer->v_proj, &m->qa_hidden, m->v_buf);

    /* Optional QKV biases. */
    if (layer->q_bias) vec_add(m->q_buf, layer->q_bias, N_HEADS * HEAD_DIM);
    if (layer->k_bias) vec_add(m->k_buf, layer->k_bias, N_KV_HEADS * HEAD_DIM);
    if (layer->v_bias) vec_add(m->v_buf, layer->v_bias, N_KV_HEADS * HEAD_DIM);

    /* RoPE on queries and keys (values are not rotated). */
    for (int h = 0; h < N_HEADS; h++)
        apply_rope(m->q_buf + h * HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++)
        apply_rope(m->k_buf + h * HEAD_DIM, pos, HEAD_DIM);

    /* Append this position's K/V to the cache. */
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kv_ptr(m->k_cache, layer_idx, pos, h), m->k_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
        memcpy(kv_ptr(m->v_cache, layer_idx, pos, h), m->v_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
    }

    /* Scaled dot-product attention over positions 0..pos (causal). */
    float scale = 1.0f / sqrtf((float)HEAD_DIM);
    memset(m->attn_out, 0, N_HEADS * HEAD_DIM * sizeof(float));

    for (int h = 0; h < N_HEADS; h++) {
        int kv_h = h / HEADS_PER_KV; /* GQA: several Q heads share one KV head */
        float *qh = m->q_buf + h * HEAD_DIM;
        float *oh = m->attn_out + h * HEAD_DIM;

        /* Scores: q . k_t for every cached position t. */
        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                acc = _mm512_fmadd_ps(_mm512_loadu_ps(qh + d), _mm512_loadu_ps(kc + d), acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < HEAD_DIM; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        /* Weighted sum of cached values; skip near-zero weights. */
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;
            float *vc = kv_ptr(m->v_cache, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                __m512 ov = _mm512_loadu_ps(oh + d);
                _mm512_storeu_ps(oh + d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vc + d), ov));
            }
            for (; d < HEAD_DIM; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection: quantize attn_out, then popcount matvec into hidden2. */
    quantize_activation_fast(m->attn_out, &m->qa_hidden);
    unary_matvec_popcount(&layer->o_proj, &m->qa_hidden, m->hidden2);
}
|
| 520 |
+
|
| 521 |
+
/* ============================================================
|
| 522 |
+
* MLP - SwiGLU with unary matmuls
|
| 523 |
+
* ============================================================ */
|
| 524 |
+
/* SwiGLU MLP block: hidden2 <- down( silu(gate(hidden2)) * up(hidden2) ).
 * Reads the normalized hidden state from m->hidden2 and overwrites it
 * with the block output; the residual add happens in forward_token(). */
static void mlp(Model *m, int layer_idx) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 once; gate and up share the same quantized input. */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* gate and up projections via popcount */
    unary_matvec_popcount(&layer->gate_proj, &m->qa_hidden, m->gate_buf);
    unary_matvec_popcount(&layer->up_proj, &m->qa_hidden, m->up_buf);

    /* SwiGLU: silu(gate) * up */
    silu_inplace(m->gate_buf, INTER);
    elemwise_mul(m->gate_buf, m->up_buf, m->mlp_buf, INTER);

    /* Down projection: re-quantize the INTER-dim intermediate. */
    quantize_activation_fast(m->mlp_buf, &m->qa_inter);
    unary_matvec_popcount(&layer->down_proj, &m->qa_inter, m->hidden2);
}
|
| 542 |
+
|
| 543 |
+
/* ============================================================
|
| 544 |
+
* FORWARD ONE TOKEN
|
| 545 |
+
* ============================================================ */
|
| 546 |
+
/* Run one token through the full stack at KV-cache position `pos`.
 * Pre-norm residual architecture: m->hidden carries the residual
 * stream, m->hidden2 is the sublayer working buffer that attention()
 * and mlp() overwrite with their outputs.
 * Returns m->logits (VOCAB floats), valid until the next call. */
float* forward_token(Model *m, int token_id, int pos) {
    embed_token(m, token_id, m->hidden);

    for (int l = 0; l < N_LAYERS; l++) {
        /* Attention sublayer with residual. */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, HIDDEN);
        attention(m, l, pos);
        vec_add(m->hidden, m->hidden2, HIDDEN);

        /* MLP sublayer with residual. */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, HIDDEN);
        mlp(m, l);
        vec_add(m->hidden, m->hidden2, HIDDEN);
    }

    rmsnorm(m->hidden, m->final_norm, m->hidden2, HIDDEN);
    fp16_matvec(&m->lm_head, m->hidden2, m->logits);

    return m->logits;
}
|
| 564 |
+
|
| 565 |
+
/* ============================================================
|
| 566 |
+
* SAMPLING
|
| 567 |
+
* ============================================================ */
|
| 568 |
+
/* Nucleus (top-p) sampling over the logits.
 * Destructive: logits are temperature-scaled and softmaxed in place.
 * Candidates are drawn from at most the 40 highest-probability tokens
 * (an implicit top-k cap on top of the top_p mass cutoff). Caller is
 * responsible for seeding rand() and for handling temperature <= 0
 * (greedy) itself. Returns the sampled token id.
 * Fix vs original: malloc results were unchecked; on allocation
 * failure we now fall back to greedy argmax instead of crashing. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    if (!probs || !indices) {
        /* Out of memory: degrade gracefully to argmax. */
        free(probs); free(indices);
        int best = 0;
        for (int i = 1; i < vocab; i++)
            if (logits[i] > logits[best]) best = i;
        return best;
    }
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: repeatedly pull the largest remaining
     * probability to the front until the cumulative mass reaches
     * top_p, the whole vocab is consumed, or the 40-candidate cap. */
    int n_keep = 0;
    float cum = 0.0f;
    while (cum < top_p && n_keep < vocab && n_keep < 40) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp_p = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp_p;
        int tmp_i = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = tmp_i;
        cum += probs[n_keep];
        n_keep++;
    }

    /* Renormalize over the kept set and draw one sample. */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0];
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i];
        if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
|
| 603 |
+
|
| 604 |
+
/* ============================================================
|
| 605 |
+
* GENERATE
|
| 606 |
+
* ============================================================ */
|
| 607 |
+
int generate(
|
| 608 |
+
Model *m,
|
| 609 |
+
const int *prompt_ids, int prompt_len,
|
| 610 |
+
int *out_tokens, int max_new_tokens,
|
| 611 |
+
float temperature, float top_p,
|
| 612 |
+
int eos_token
|
| 613 |
+
) {
|
| 614 |
+
srand(time(NULL));
|
| 615 |
+
|
| 616 |
+
for (int i = 0; i < prompt_len; i++)
|
| 617 |
+
forward_token(m, prompt_ids[i], i);
|
| 618 |
+
|
| 619 |
+
int pos = prompt_len;
|
| 620 |
+
int generated = 0;
|
| 621 |
+
|
| 622 |
+
for (int t = 0; t < max_new_tokens; t++) {
|
| 623 |
+
float *logits = m->logits;
|
| 624 |
+
int next = (temperature <= 0) ? 0 : sample_top_p(logits, VOCAB, temperature, top_p);
|
| 625 |
+
if (temperature <= 0) {
|
| 626 |
+
for (int i = 1; i < VOCAB; i++)
|
| 627 |
+
if (logits[i] > logits[next]) next = i;
|
| 628 |
+
}
|
| 629 |
+
out_tokens[t] = next;
|
| 630 |
+
generated++;
|
| 631 |
+
if (next == eos_token) break;
|
| 632 |
+
forward_token(m, next, pos);
|
| 633 |
+
pos++;
|
| 634 |
+
}
|
| 635 |
+
return generated;
|
| 636 |
+
}
|
| 637 |
+
|
| 638 |
+
/* ============================================================
|
| 639 |
+
* ALLOCATE QUANTIZED ACTIVATION BUFFER
|
| 640 |
+
* ============================================================ */
|
| 641 |
+
/* Round n up to a multiple of 64. C11 aligned_alloc() requires the size
 * to be a multiple of the alignment; the original passed raw sizes,
 * which is undefined behavior for e.g. INTER=8960 -> 140 chunks ->
 * 1120 bytes (not a multiple of 64). */
static size_t qa_round_up_64(size_t n) {
    return (n + 63) & ~(size_t)63;
}

/* Allocate the sign and magnitude bitplane buffers for a dim-wide
 * quantized-activation scratch struct; scale starts at 1.0. */
static void qa_alloc(QuantAct *qa, int dim) {
    qa->dim = dim;
    qa->chunks = (dim + 63) / 64;
    qa->sign_bits = (uint64_t *)aligned_alloc(64,
        qa_round_up_64(qa->chunks * sizeof(uint64_t)));
    qa->mag_planes = (uint64_t *)aligned_alloc(64,
        qa_round_up_64((size_t)A_PLANES * qa->chunks * sizeof(uint64_t)));
    qa->scale = 1.0f;
}
|
| 648 |
+
|
| 649 |
+
/* ============================================================
|
| 650 |
+
* MODEL ALLOC
|
| 651 |
+
* ============================================================ */
|
| 652 |
+
/* Allocate a Model and all of its scratch state. Weight pointers are
 * NOT allocated here - they are wired in later via model_set_* /
 * layer_set_* and remain loader-owned.
 * NOTE(review): allocation results are unchecked; a failed alloc will
 * surface as a crash on first use. */
Model* model_alloc(int n_w_planes) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    m->n_w_planes = n_w_planes;
    m->n_a_planes = A_PLANES;

    /* KV cache: zeroed so position 0 starts from a clean state. */
    size_t kv_size = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* 64-byte-aligned scratch buffers for the AVX-512 kernels. */
    m->hidden = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->hidden2 = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->q_buf = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->k_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->v_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->gate_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->up_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->mlp_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->logits = (float *)aligned_alloc(64, VOCAB * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, HIDDEN * sizeof(float));

    /* Reusable quantized-activation scratch for both widths. */
    qa_alloc(&m->qa_hidden, HIDDEN);
    qa_alloc(&m->qa_inter, INTER);

    printf("Model allocated: KV=%zuMB, W_PLANES=%d, A_PLANES=%d\n",
           kv_size * 2 * sizeof(float) / (1024*1024), n_w_planes, A_PLANES);

    return m;
}
|
| 682 |
+
|
| 683 |
+
/* Wire loader-owned weight buffers into the model. The embed and
 * lm_head pointers are borrowed (not copied, not freed by model_free);
 * final_norm IS copied because model_alloc() owns that buffer. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, HIDDEN * sizeof(float)); }
void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
    m->lm_head.weight = data; m->lm_head.out_dim = out_dim; m->lm_head.in_dim = in_dim;
}
|
| 688 |
+
|
| 689 |
+
/* Attach loader-owned per-layer norm weights (borrowed pointers). */
void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
    m->layers[l].input_norm = in_norm;
    m->layers[l].post_norm = post_norm;
}

/* Attach loader-owned QKV biases; any of them may be NULL. */
void layer_set_bias(Model *m, int l, float *qb, float *kb, float *vb) {
    m->layers[l].q_bias = qb; m->layers[l].k_bias = kb; m->layers[l].v_bias = vb;
}
|
| 697 |
+
|
| 698 |
+
void layer_set_unary(
|
| 699 |
+
UnaryLinear *ul, uint64_t *sign, uint64_t *planes, float *scales,
|
| 700 |
+
int out_dim, int in_dim, int n_planes
|
| 701 |
+
) {
|
| 702 |
+
ul->sign_bits = sign; ul->mag_planes = planes; ul->scales = scales;
|
| 703 |
+
ul->out_dim = out_dim; ul->in_dim = in_dim; ul->n_planes = n_planes;
|
| 704 |
+
ul->bias = NULL;
|
| 705 |
+
}
|
| 706 |
+
|
| 707 |
+
/* Wire all seven unary projections of layer l in one call.
 * Parameter naming: per projection, s=sign planes, p=magnitude planes,
 * sc=row scales, o=out_dim, i=in_dim (down uses `doo` to avoid the
 * `do` keyword). All buffers are loader-owned and borrowed. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *qs, uint64_t *qp, float *qsc, int qo, int qi,
    uint64_t *ks, uint64_t *kp, float *ksc, int ko, int ki,
    uint64_t *vs, uint64_t *vp, float *vsc, int vo, int vi,
    uint64_t *os, uint64_t *op, float *osc, int oo, int oi,
    uint64_t *gs, uint64_t *gp, float *gsc, int go, int gi,
    uint64_t *us, uint64_t *up, float *usc, int uo, int ui,
    uint64_t *ds, uint64_t *dp, float *dsc, int doo, int di,
    int n_planes
) {
    layer_set_unary(&m->layers[l].q_proj, qs, qp, qsc, qo, qi, n_planes);
    layer_set_unary(&m->layers[l].k_proj, ks, kp, ksc, ko, ki, n_planes);
    layer_set_unary(&m->layers[l].v_proj, vs, vp, vsc, vo, vi, n_planes);
    layer_set_unary(&m->layers[l].o_proj, os, op, osc, oo, oi, n_planes);
    layer_set_unary(&m->layers[l].gate_proj, gs, gp, gsc, go, gi, n_planes);
    layer_set_unary(&m->layers[l].up_proj, us, up, usc, uo, ui, n_planes);
    layer_set_unary(&m->layers[l].down_proj, ds, dp, dsc, doo, di, n_planes);
}
|
| 726 |
+
|
| 727 |
+
/* Zero the entire KV cache so a fresh sequence can start at pos 0. */
void model_reset_cache(Model *m) {
    const size_t n_floats = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    memset(m->k_cache, 0, n_floats * sizeof(float));
    memset(m->v_cache, 0, n_floats * sizeof(float));
}
|
| 732 |
+
|
| 733 |
+
/* Release everything model_alloc() created. Buffers attached via
 * model_set_* / layer_set_* (embed, lm_head weights, norm vectors,
 * bitplanes, scales, biases) are loader-owned and are NOT freed here. */
void model_free(Model *m) {
    free(m->k_cache); free(m->v_cache);
    free(m->hidden); free(m->hidden2);
    free(m->q_buf); free(m->k_buf); free(m->v_buf);
    free(m->attn_out); free(m->gate_buf); free(m->up_buf); free(m->mlp_buf);
    free(m->logits); free(m->attn_scores); free(m->final_norm);
    free(m->qa_hidden.sign_bits); free(m->qa_hidden.mag_planes);
    free(m->qa_inter.sign_bits); free(m->qa_inter.mag_planes);
    free(m);
}
|
unary_group_convert.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert model to UNARY with GROUP quantization.
|
| 4 |
+
Each group of 32 weights gets its own scale factor.
|
| 5 |
+
This dramatically improves accuracy vs per-row scaling.
|
| 6 |
+
|
| 7 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 8 |
+
"""
|
| 9 |
+
import os, json, sys, time
|
| 10 |
+
import numpy as np
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
GROUP_SIZE = 32
|
| 14 |
+
|
| 15 |
+
def load_safetensors(model_dir):
    """Load every *.safetensors shard in `model_dir` into a dict of float32 numpy arrays.

    Keys are the tensor names as stored in the shards; fp16/bf16 tensors are
    upcast to float32 on the way out.
    """
    # NOTE: `import torch` was unused here — safetensors.torch pulls torch in itself.
    from safetensors.torch import load_file
    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {f.name}...")
        for key, val in load_file(str(f)).items():
            # torch tensor -> float32 numpy copy
            tensors[key] = val.float().numpy()
    return tensors
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def quantize_group_unary(weight, n_planes=7, group_size=32):
    """Quantize a 2-D weight matrix to signed unary bitplanes with per-group scales.

    Each run of `group_size` input columns shares one float32 scale, which is
    far more accurate than a single per-row scale.

    Args:
        weight: [out_dim, in_dim] float matrix.
        n_planes: number of unary (thermometer) magnitude planes; magnitudes
            are clipped to [0, n_planes].
        group_size: columns per scale group (default 32; generalizes the old
            hard-coded module-level GROUP_SIZE).

    Returns:
        sign_bits:    uint64 [out_dim, chunks] — bit set => negative weight.
        mag_planes:   uint64 [n_planes, out_dim, chunks] — plane p has a bit
            set where |q| >= p + 1 (true unary: each plane contributes 1).
        group_scales: float32 [out_dim, n_groups].
        sparsity:     fraction of quantized magnitudes that are exactly zero.
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    n_groups = (in_dim + group_size - 1) // group_size
    chunks = (in_dim + 63) // 64          # 64 columns packed per uint64
    padded = chunks * 64

    # Pad the input dimension up to a whole number of groups.
    if in_dim % group_size != 0:
        pad_w = group_size - (in_dim % group_size)
        w = np.concatenate([w, np.zeros((out_dim, pad_w), dtype=np.float32)], axis=1)

    # Reshape to groups: [out_dim, n_groups, group_size]
    w_grouped = w[:, :n_groups * group_size].reshape(out_dim, n_groups, group_size)

    # Per-group max absolute value; guard all-zero groups against divide-by-zero.
    group_max = np.max(np.abs(w_grouped), axis=2)  # [out_dim, n_groups]
    group_max = np.where(group_max == 0, 1.0, group_max)

    # One scale per group: the largest magnitude in the group maps to n_planes.
    group_scales = (group_max / n_planes).astype(np.float32)

    # Round to integer magnitudes in [0, n_planes]; signs come from the raw weights.
    w_scaled = w_grouped / group_scales[:, :, None]
    magnitudes = np.round(np.abs(w_scaled)).astype(np.int32)
    magnitudes = np.clip(magnitudes, 0, n_planes)
    signs = (w_grouped < 0)

    # Flatten back to [out_dim, n_groups * group_size]
    magnitudes = magnitudes.reshape(out_dim, -1)
    signs = signs.reshape(out_dim, -1)

    # Pad columns to a multiple of 64 so they bit-pack cleanly.
    if magnitudes.shape[1] < padded:
        extra = padded - magnitudes.shape[1]
        magnitudes = np.concatenate([magnitudes, np.zeros((out_dim, extra), dtype=np.int32)], axis=1)
        signs = np.concatenate([signs, np.zeros((out_dim, extra), dtype=bool)], axis=1)

    sparsity = np.mean(magnitudes == 0)

    # Pack 64 booleans per uint64 (bit b = column c * 64 + b).
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))

    signs_r = signs.reshape(out_dim, chunks, 64).astype(np.uint64)
    sign_bits = np.bitwise_or.reduce(signs_r * bit_positions, axis=2)

    mag_planes = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        # Thermometer code: plane p is active wherever magnitude >= p + 1.
        active = (magnitudes >= (p + 1)).reshape(out_dim, chunks, 64).astype(np.uint64)
        mag_planes[p] = np.bitwise_or.reduce(active * bit_positions, axis=2)

    return sign_bits, mag_planes, group_scales, sparsity
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def test_accuracy(weight, sign_bits, mag_planes, group_scales, n_planes):
|
| 82 |
+
"""Test reconstruction accuracy of a single layer."""
|
| 83 |
+
out_dim, in_dim = weight.shape
|
| 84 |
+
n_groups = group_scales.shape[1]
|
| 85 |
+
chunks = (in_dim + 63) // 64
|
| 86 |
+
|
| 87 |
+
np.random.seed(42)
|
| 88 |
+
x = np.random.randn(in_dim).astype(np.float32)
|
| 89 |
+
y_orig = weight @ x
|
| 90 |
+
|
| 91 |
+
# Reconstruct weights from unary format
|
| 92 |
+
w_recon = np.zeros((out_dim, chunks * 64), dtype=np.float32)
|
| 93 |
+
for p in range(n_planes):
|
| 94 |
+
for i in range(out_dim):
|
| 95 |
+
for c in range(chunks):
|
| 96 |
+
mbits = mag_planes[p, i, c]
|
| 97 |
+
sbits = sign_bits[i, c]
|
| 98 |
+
for b in range(64):
|
| 99 |
+
if mbits & (1 << b):
|
| 100 |
+
col = c * 64 + b
|
| 101 |
+
g = col // GROUP_SIZE
|
| 102 |
+
if g < n_groups:
|
| 103 |
+
sign = -1.0 if (sbits & (1 << b)) else 1.0
|
| 104 |
+
w_recon[i, col] += sign * group_scales[i, g]
|
| 105 |
+
|
| 106 |
+
y_recon = w_recon[:, :in_dim] @ x
|
| 107 |
+
cosim = np.dot(y_orig, y_recon) / (np.linalg.norm(y_orig) * np.linalg.norm(y_recon))
|
| 108 |
+
return cosim
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def convert(model_dir, output_dir, n_planes=7):
    """Convert an HF model directory to group-unary format on disk.

    Projection weights are quantized to unary bitplanes with per-group scales
    (.sign / .planes / .gscales files); everything else is stored as FP16.
    Writes config.json and manifest.json alongside the weight files.
    """
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    # Attention/MLP projections get quantized; all other tensors stay FP16.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight', 'k_proj.weight', 'v_proj.weight', 'o_proj.weight',
         'gate_proj.weight', 'up_proj.weight', 'down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]

    print(f"\nGroup-unary: {len(linear_keys)} layers, n_planes={n_planes}, group_size={GROUP_SIZE}")

    # Hard-coded for the DeepSeek-R1 1.5B (Qwen2) architecture — TODO confirm
    # against the source model's config.json before reusing for other models.
    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0, "rms_norm_eps": 1e-6,
        "n_planes": n_planes, "group_size": GROUP_SIZE,
        "quant_type": "unary_group",
    }
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    total_unary = 0
    total_orig = 0

    # Test accuracy on first layer (re-quantized again at the end).
    test_key = linear_keys[0]

    for key in linear_keys:
        w = tensors[key]
        total_orig += w.nbytes

        t0 = time.time()
        sign_bits, mag_planes, group_scales, sparsity = quantize_group_unary(w, n_planes)
        dt = time.time() - t0

        # Tensor name "model.layers.0.x" -> file prefix "model_layers_0_x".
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(prefix + ".sign")
        mag_planes.tofile(prefix + ".planes")
        group_scales.tofile(prefix + ".gscales")

        nbytes = sign_bits.nbytes + mag_planes.nbytes + group_scales.nbytes
        total_unary += nbytes

        print(f" {key}: {w.shape} -> {nbytes/1024:.0f}KB ({dt:.1f}s, {sparsity:.0%} sparse)")

    total_fp16 = 0
    for key in other_keys:
        # Embeddings, norms, biases, lm_head: keep at FP16 precision.
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16 += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Manifest records each tensor's original shape for the loader.
    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total = total_unary + total_fp16
    print(f"\n=== Summary ===")
    print(f"Original FP32: {total_orig/1e6:.0f} MB")
    print(f"Unary+group: {total_unary/1e6:.0f} MB")
    print(f"FP16 other: {total_fp16/1e6:.0f} MB")
    print(f"Total: {total/1e6:.0f} MB")

    # Quick accuracy test (re-quantizes the first layer once more).
    print(f"\nAccuracy test on {test_key}...")
    w = tensors[test_key]
    sign_bits, mag_planes, group_scales, _ = quantize_group_unary(w, n_planes)
    cosim = test_accuracy(w, sign_bits, mag_planes, group_scales, n_planes)
    print(f" Cosine similarity: {cosim:.4f}")
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
if __name__ == "__main__":
    # CLI: unary_group_convert.py [model_dir] [output_dir] [n_planes]
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "deepseek-r1-1.5b-gunary"
    n_planes = int(sys.argv[3]) if len(sys.argv) > 3 else 7
    convert(model_dir, output_dir, n_planes)
    print("Done!")
|
unary_kernel.c
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/*
|
| 2 |
+
* UNARY (Base-1) Neural Network Kernel - AVX-512
|
| 3 |
+
*
|
| 4 |
+
* Weights quantized to signed integers [-N..+N], stored as:
|
| 5 |
+
* sign_bits[row][chunks] - 1 = negative, 0 = positive
|
| 6 |
+
* mag_planes[plane][row][chunks] - unary thermometer bitplanes
|
| 7 |
+
* scales[row] - per-row float32 scale
|
| 8 |
+
*
|
| 9 |
+
* For magnitude M, the first M bitplanes have bit=1 at that position.
|
| 10 |
+
* E.g. magnitude 3 with max_planes=7: planes 0,1,2 have bit set.
|
| 11 |
+
*
|
| 12 |
+
* TRUE UNARY: each plane contributes equally (value 1 per plane).
|
| 13 |
+
* NOT binary (where plane p contributes 2^p).
|
| 14 |
+
*
|
| 15 |
+
* y[i] = scale[i] * sum_planes( signed_masked_sum(x, plane, sign) )
|
| 16 |
+
*
|
| 17 |
+
* (c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
#include <immintrin.h>
|
| 21 |
+
#include <stdint.h>
|
| 22 |
+
#include <stdlib.h>
|
| 23 |
+
#include <string.h>
|
| 24 |
+
#include <math.h>
|
| 25 |
+
#include <stdio.h>
|
| 26 |
+
|
| 27 |
+
/*
 * Unary matrix-vector product: for each output row,
 *   y[i] = scales[i] * sum over planes of (signed masked sum of x).
 *
 * Bit layouts (64 input columns packed per uint64 "chunk"):
 *   sign_bits[i*chunks + c]                bit b set => weight is negative
 *   mag_planes[(p*out_dim + i)*chunks + c] bit b set => |q| >= p + 1
 * True-unary (thermometer) coding: every plane contributes the same unit
 * value, so a weight of magnitude M adds x to the accumulator M times.
 */
void unary_matvec_avx512(
    const uint64_t *sign_bits,
    const uint64_t *mag_planes,
    const float *scales,
    const float *x,
    float *y,
    int out_dim,
    int in_dim,
    int n_planes
) {
    int chunks = (in_dim + 63) / 64;
    /* Round input up to a multiple of 16 floats; the copy is zero-padded and
       64-byte aligned so the aligned vector loads below are legal (the size
       passed to aligned_alloc is then also a multiple of 64 bytes). */
    int in_padded = (in_dim + 15) & ~15;
    float *x_pad = (float *)aligned_alloc(64, in_padded * sizeof(float));
    memcpy(x_pad, x, in_dim * sizeof(float));
    memset(x_pad + in_dim, 0, (in_padded - in_dim) * sizeof(float));

    for (int i = 0; i < out_dim; i++) {
        const uint64_t *row_sign = sign_bits + (size_t)i * chunks;
        float total = 0.0f;

        for (int p = 0; p < n_planes; p++) {
            const uint64_t *plane_row = mag_planes +
                ((size_t)p * out_dim + i) * chunks;

            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < chunks; c++) {
                uint64_t mbits = plane_row[c];
                uint64_t sbits = row_sign[c];
                /* Split active lanes by sign so we can masked-add / masked-sub. */
                uint64_t pos = mbits & ~sbits;
                uint64_t neg = mbits & sbits;

                /* 4 groups of 16 lanes per 64-bit chunk; the bound check skips
                   lanes past in_padded (the converter zero-pads those mask
                   bits, so skipping them is safe). */
                for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
                    int offset = c * 64 + g * 16;
                    __m512 xv = _mm512_load_ps(x_pad + offset);
                    __mmask16 pmask = (__mmask16)((pos >> (g * 16)) & 0xFFFF);
                    __mmask16 nmask = (__mmask16)((neg >> (g * 16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
                }
            }
            total += _mm512_reduce_add_ps(acc);
        }
        /* Single scale per output row (per-row quantization variant). */
        y[i] = total * scales[i];
    }
    free(x_pad);
}
|
| 74 |
+
|
| 75 |
+
void rmsnorm_avx512(
|
| 76 |
+
const float *x, const float *weight, float *y, int dim, float eps
|
| 77 |
+
) {
|
| 78 |
+
__m512 sum_sq = _mm512_setzero_ps();
|
| 79 |
+
int i;
|
| 80 |
+
for (i = 0; i + 16 <= dim; i += 16) {
|
| 81 |
+
__m512 xv = _mm512_loadu_ps(x + i);
|
| 82 |
+
sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
|
| 83 |
+
}
|
| 84 |
+
float ss = _mm512_reduce_add_ps(sum_sq);
|
| 85 |
+
for (; i < dim; i++) ss += x[i] * x[i];
|
| 86 |
+
float rms = 1.0f / sqrtf(ss / dim + eps);
|
| 87 |
+
for (i = 0; i + 16 <= dim; i += 16) {
|
| 88 |
+
__m512 xv = _mm512_loadu_ps(x + i);
|
| 89 |
+
__m512 wv = _mm512_loadu_ps(weight + i);
|
| 90 |
+
__m512 rv = _mm512_set1_ps(rms);
|
| 91 |
+
_mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
|
| 92 |
+
}
|
| 93 |
+
for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
void silu_avx512(float *x, int n) {
|
| 97 |
+
for (int i = 0; i < n; i++) {
|
| 98 |
+
float v = x[i];
|
| 99 |
+
x[i] = v / (1.0f + expf(-v));
|
| 100 |
+
}
|
| 101 |
+
}
|
| 102 |
+
|
| 103 |
+
void elemwise_mul_avx512(const float *a, const float *b, float *c, int n) {
|
| 104 |
+
int i;
|
| 105 |
+
for (i = 0; i + 16 <= n; i += 16) {
|
| 106 |
+
__m512 av = _mm512_loadu_ps(a + i);
|
| 107 |
+
__m512 bv = _mm512_loadu_ps(b + i);
|
| 108 |
+
_mm512_storeu_ps(c + i, _mm512_mul_ps(av, bv));
|
| 109 |
+
}
|
| 110 |
+
for (; i < n; i++) c[i] = a[i] * b[i];
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
void softmax_avx512(float *x, int n) {
|
| 114 |
+
float max_val = x[0];
|
| 115 |
+
for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
|
| 116 |
+
float sum = 0.0f;
|
| 117 |
+
for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
|
| 118 |
+
float inv = 1.0f / sum;
|
| 119 |
+
for (int i = 0; i < n; i++) x[i] *= inv;
|
| 120 |
+
}
|
unary_loader.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Thin Python loader for the Unary C Engine.
|
| 4 |
+
Loads weights from disk, passes pointers to C, calls C generate().
|
| 5 |
+
ZERO Python in the inference hot path.
|
| 6 |
+
|
| 7 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import ctypes
|
| 11 |
+
import numpy as np
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import time
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from transformers import AutoTokenizer
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class UnaryEngine:
    """ctypes wrapper around unary_engine.so.

    Loads unary-quantized weights from disk, hands raw numpy buffer pointers
    to the C model, and drives generation entirely in C. All numpy arrays
    passed to C are pinned in self._refs so the GC never frees memory the C
    side still points at.
    """

    def __init__(self, model_dir, so_path="unary_engine.so"):
        """Open the shared library, read config/manifest, and load all weights."""
        self.model_dir = Path(model_dir)
        self.lib = ctypes.CDLL(so_path)
        self._setup_ctypes()

        # Load config (n_planes etc. written by the converter)
        with open(self.model_dir / "config.json") as f:
            self.config = json.load(f)
        self.n_planes = self.config["n_planes"]

        # Load manifest (original tensor shapes, keyed by tensor name)
        with open(self.model_dir / "manifest.json") as f:
            self.manifest = json.load(f)

        # Allocate model in C
        self.model = self.lib.model_alloc(self.n_planes)

        # Keep references so GC doesn't free numpy arrays
        self._refs = []

        # Load all weights
        self._load_weights()

    def _setup_ctypes(self):
        """Declare argument/return types for every C entry point we call."""
        L = self.lib
        L.model_alloc.restype = ctypes.c_void_p
        L.model_alloc.argtypes = [ctypes.c_int]

        L.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
        L.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

        # layer_set_linears: model, layer_idx, then 7x (sign, planes, scales, out, in), plus n_planes
        args = [ctypes.c_void_p, ctypes.c_int]
        for _ in range(7):
            args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
        args.append(ctypes.c_int)
        L.layer_set_linears.argtypes = args

        L.model_reset_cache.argtypes = [ctypes.c_void_p]
        L.model_free.argtypes = [ctypes.c_void_p]

        L.forward_token.restype = ctypes.POINTER(ctypes.c_float)
        L.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.generate.restype = ctypes.c_int
        L.generate.argtypes = [
            ctypes.c_void_p,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_float, ctypes.c_float,
            ctypes.c_int
        ]

    def _keep(self, arr):
        """Keep reference to prevent GC."""
        self._refs.append(arr)
        return arr.ctypes.data

    def _load_fp16(self, key):
        # Raw float16 array; the C side consumes it as FP16 directly.
        path = self.model_dir / (key.replace(".", "_") + ".fp16")
        arr = np.fromfile(str(path), dtype=np.float16)
        return arr

    def _load_fp16_as_f32(self, key):
        # FP16 file upcast to float32 (for tensors the C side wants as float).
        arr = self._load_fp16(key).astype(np.float32)
        self._refs.append(arr)
        return arr

    def _load_unary(self, key):
        """Load one unary layer: sign bits, magnitude planes, per-row scales, dims."""
        prefix = str(self.model_dir / key.replace(".", "_"))
        sign = np.fromfile(prefix + ".sign", dtype=np.uint64)
        planes = np.fromfile(prefix + ".planes", dtype=np.uint64)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)
        self._refs.extend([sign, planes, scales])
        shape = self.manifest["unary"][key]
        return sign, planes, scales, shape[0], shape[1]

    def _load_weights(self):
        """Stream every tensor into the C model, pinning each buffer in _refs."""
        t0 = time.time()

        # Embeddings
        embed = self._load_fp16("model.embed_tokens.weight")
        self._refs.append(embed)
        self.lib.model_set_embed(self.model, embed.ctypes.data)
        print(f" Embeddings: {embed.nbytes/1024/1024:.1f} MB")

        # Final norm
        fnorm = self._load_fp16_as_f32("model.norm.weight")
        self.lib.model_set_final_norm(self.model, fnorm.ctypes.data)

        # LM head
        lm = self._load_fp16("lm_head.weight")
        self._refs.append(lm)
        shape = self.manifest["fp16"]["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, lm.ctypes.data, shape[0], shape[1])
        print(f" LM head: {lm.nbytes/1024/1024:.1f} MB")

        # Layers — hard-coded 28 for this 1.5B model; NOTE(review): should
        # probably come from config["num_hidden_layers"] — confirm.
        for l in range(28):
            prefix = f"model.layers.{l}"

            # Norms
            in_norm = self._load_fp16_as_f32(f"{prefix}.input_layernorm.weight")
            post_norm = self._load_fp16_as_f32(f"{prefix}.post_attention_layernorm.weight")
            self.lib.layer_set_norms(self.model, l, in_norm.ctypes.data, post_norm.ctypes.data)

            # Biases
            q_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.q_proj.bias")
            k_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.k_proj.bias")
            v_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.v_proj.bias")
            self.lib.layer_set_bias(self.model, l,
                q_bias.ctypes.data, k_bias.ctypes.data, v_bias.ctypes.data)

            # Unary linear layers (order must match the C-side signature)
            projs = ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj',
                     'self_attn.o_proj', 'mlp.gate_proj', 'mlp.up_proj', 'mlp.down_proj']

            linear_args = []
            for proj in projs:
                key = f"{prefix}.{proj}.weight"
                sign, planes, scales, out_d, in_d = self._load_unary(key)
                linear_args.extend([sign.ctypes.data, planes.ctypes.data,
                                    scales.ctypes.data, out_d, in_d])

            self.lib.layer_set_linears(self.model, l, *linear_args, self.n_planes)

            if (l + 1) % 7 == 0:
                print(f" Loaded {l+1}/28 layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self._refs) / 1024 / 1024
        print(f"\nModel loaded in {dt:.1f}s, {total:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95, eos_token=151643):
        """Run C-side generation from a token-id prompt.

        Returns (generated_token_ids, n_generated, elapsed_seconds).
        Resets the KV cache first, so each call is an independent sequence.
        """
        self.lib.model_reset_cache(self.model)

        prompt = np.array(token_ids, dtype=np.int32)
        output = np.zeros(max_new_tokens, dtype=np.int32)

        t0 = time.time()
        n_gen = self.lib.generate(
            self.model,
            prompt.ctypes.data, len(prompt),
            output.ctypes.data, max_new_tokens,
            ctypes.c_float(temperature), ctypes.c_float(top_p),
            eos_token
        )
        dt = time.time() - t0

        return output[:n_gen].tolist(), n_gen, dt
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def main():
    """CLI smoke test: load tokenizer and engine, run one short generation."""
    import sys
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "deepseek-r1-1.5b-unary"
    hf_dir = argv[2] if len(argv) > 2 else "deepseek-r1-1.5b-hf"

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(hf_dir, trust_remote_code=True)

    print("Loading unary engine...")
    engine = UnaryEngine(model_dir, "./unary_engine.so")

    # One fixed chat prompt exercises the whole pipeline.
    messages = [{"role": "user", "content": "What is 2+2?"}]
    prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    print(f"\nPrompt: {len(prompt)} tokens")
    print("Generating...")

    tokens, n_gen, dt = engine.generate(prompt, max_new_tokens=60, temperature=0.6)
    text = tokenizer.decode(tokens)

    print(f"\n--- Output ({n_gen} tokens in {dt:.2f}s = {n_gen/dt:.1f} tok/s) ---")
    print(text)
    print("---")
| 199 |
+
|
| 200 |
+
|
| 201 |
+
# Script entry point: run the CLI smoke test.
if __name__ == "__main__":
    main()
|
unary_run.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Unary Engine Runner - Loads weights into the C engine and generates text.
|
| 4 |
+
(c) 2026 OpenTransformers Ltd / Scott Bisset
|
| 5 |
+
"""
|
| 6 |
+
import ctypes, numpy as np, os, sys, time, struct
|
| 7 |
+
|
| 8 |
+
# Paths and model constants for the DeepSeek-R1 1.5B unary build.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"
ENGINE = "/root/ternary_engine/unary_engine.so"
N_PLANES = 7        # unary magnitude bitplanes per weight
N_LAYERS = 28       # transformer blocks
HIDDEN = 1536       # hidden size
VOCAB = 151936      # vocabulary size

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures (must match the C declarations exactly —
# wrong argtypes silently corrupt pointers on 64-bit).
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,                # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float, # temperature, top_p
    ctypes.c_int                    # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]
|
| 73 |
+
|
| 74 |
+
def load_fp16_as_uint16(path):
    """Load FP16 file as raw uint16 words (C reinterprets the bits as FP16)."""
    raw_words = np.fromfile(path, dtype=np.uint16)
    return raw_words
|
| 77 |
+
|
| 78 |
+
def load_fp16_as_f32(path):
    """Load FP16 file and upcast to FP32 (for norms/biases consumed as float)."""
    return np.fromfile(path, dtype=np.float16).astype(np.float32)
|
| 82 |
+
|
| 83 |
+
def load_unary(name):
    """Load the three packed tensors of one unary layer from MODEL_DIR.

    Returns (sign_bits, mag_planes, scales): uint64 sign bitmap, uint64
    thermometer bitplanes, and float32 per-row scales.
    """
    base = os.path.join(MODEL_DIR, name)
    sign_bits = np.fromfile(base + ".sign", dtype=np.uint64)
    mag_planes = np.fromfile(base + ".planes", dtype=np.uint64)
    row_scales = np.fromfile(base + ".scales", dtype=np.float32)
    return sign_bits, mag_planes, row_scales
|
| 90 |
+
|
| 91 |
+
# Keep references to prevent GC
# The C engine stores raw pointers into these numpy buffers; every array
# handed to C must stay referenced on the Python side for the process
# lifetime, or the GC would free memory C still reads.
_refs = []

def keep(arr):
    """Keep numpy array alive and return its ctypes pointer"""
    _refs.append(arr)
    return arr.ctypes.data
|
| 98 |
+
|
| 99 |
+
print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f" embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers — projection order must match the C-side layer_set_linears signature.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim] — hard-coded for this 1.5B architecture.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32)
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers: flatten 7 x (sign, planes, scales, out, in) into one call.
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate from the CLI prompt (or a default smoke-test question)
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 256
out_arr = np.zeros(max_new, dtype=np.int32)

lib.model_reset_cache(model)

print("Generating...")
t0 = time.time()
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)
|
unary_run16.py
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python3
"""
Unary Engine Runner - Loads weights into the C engine and generates text.
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import ctypes, numpy as np, os, sys, time, struct

# Hard-coded deployment paths -- adjust for your checkout.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"  # converted unary weight files
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"        # original HF checkpoint (tokenizer)
ENGINE = "/root/ternary_engine/unary_engine.so"            # compiled C engine
N_PLANES = 7      # bit-planes per unary weight; presumably must match the converter -- TODO confirm
N_LAYERS = 28
HIDDEN = 1536
VOCAB = 151936

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures.
# All pointer arguments are raw numpy data pointers (see keep() below);
# the Python side is responsible for keeping the backing arrays alive.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
# (38 total including the leading model pointer and layer index).
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,                # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float, # temperature, top_p
    ctypes.c_int                    # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]
def load_fp16_as_uint16(path):
    """Read a raw FP16 weight file as its uint16 bit patterns.

    No numeric conversion happens here: the C engine interprets the
    half-precision bits itself, so we hand them over untouched.
    """
    raw_bits = np.fromfile(path, dtype=np.uint16)
    return raw_bits
def load_fp16_as_f32(path):
    """Read a raw FP16 weight file and widen the values to float32."""
    return np.fromfile(path, dtype=np.float16).astype(np.float32)
def load_unary(name):
    """Load the three on-disk components of one unary-encoded weight.

    `name` is the flattened tensor name; the component files sit under
    MODEL_DIR as <name>.sign / <name>.planes / <name>.scales, holding
    uint64, uint64 and float32 data respectively. Returns the tuple
    (sign, planes, scales).
    """
    stem = os.path.join(MODEL_DIR, name)
    parts = [
        np.fromfile(stem + ext, dtype=dt)
        for ext, dt in ((".sign", np.uint64),
                        (".planes", np.uint64),
                        (".scales", np.float32))
    ]
    return tuple(parts)
|
| 91 |
+
# Keep references to prevent GC
|
| 92 |
+
_refs = []
|
| 93 |
+
|
| 94 |
+
def keep(arr):
|
| 95 |
+
"""Keep numpy array alive and return its ctypes pointer"""
|
| 96 |
+
_refs.append(arr)
|
| 97 |
+
return arr.ctypes.data
|
| 98 |
+
|
| 99 |
+
# NOTE(review): this script appears identical to unary_run.py except that
# max_new is capped at 16 -- presumably a quick smoke-test variant; confirm.
print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16 -- the engine reads the raw bits)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f"  embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers: the 7 quantized projections per transformer block,
# in the fixed order layer_set_linears expects.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim], hard-coded for this 1.5B geometry.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:  # progress line every 7 layers to keep output short
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    # keep() pins each array against GC and yields the raw pointer for C.
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32): only the q/k/v projections carry biases here.
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers: flatten (sign, planes, scales, out_dim, in_dim)
    # for all 7 projections; the order must match layer_set_linears' argtypes.
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer from the original HF checkpoint directory.
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate: prompt comes from argv[1], falling back to a small default.
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)  # plain Python list of ids
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 16  # short cap: this runner generates at most 16 tokens
out_arr = np.zeros(max_new, dtype=np.int32)  # C side writes generated ids here

lib.model_reset_cache(model)  # clear the KV cache before a fresh prompt

print("Generating...")
t0 = time.time()
# generate() returns the number of tokens actually produced (it may stop
# early on EOS). Sampling parameters: temperature=0.6, top_p=0.9.
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)  # keep special markers visible
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)