OpenTransformer committed on
Commit
19ed98b
·
verified ·
1 Parent(s): 0ebe638

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. README.md +66 -0
  2. bench_fwd.py +70 -0
  3. bench_gen.py +71 -0
  4. bench_prompt.py +86 -0
  5. build.sh +82 -0
  6. concat_unary +0 -0
  7. concat_unary.c +608 -0
  8. convert.py +205 -0
  9. convert_fast.py +226 -0
  10. convert_log_unary.py +159 -0
  11. convert_proper_unary.py +164 -0
  12. convert_proper_unary_v2.py +247 -0
  13. convert_qwen3.py +149 -0
  14. convert_qwen3_v2.py +161 -0
  15. deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales +0 -0
  16. deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16 +0 -0
  17. deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales +0 -0
  18. deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg +0 -0
  19. deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales +3 -0
  20. deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos +0 -0
  21. inference.py +503 -0
  22. log_unary_engine.c +598 -0
  23. logunary_tensor.c +534 -0
  24. packed_convert.py +79 -0
  25. packed_engine.c +408 -0
  26. packed_loader.py +134 -0
  27. proper_unary +0 -0
  28. proper_unary.c +563 -0
  29. pure_unary_engine.c +658 -0
  30. run_convert.py +76 -0
  31. run_log_unary.py +123 -0
  32. run_pure_unary.py +176 -0
  33. run_qwen3_4b.py +221 -0
  34. server.py +107 -0
  35. ternary_kernel.c +265 -0
  36. test_logunary +0 -0
  37. test_logunary.c +153 -0
  38. test_popcount.py +99 -0
  39. true_unary +0 -0
  40. true_unary.c +552 -0
  41. unary_convert.py +189 -0
  42. unary_convert_v2.py +134 -0
  43. unary_engine.c +381 -0
  44. unary_engine_v2.c +629 -0
  45. unary_full.c +742 -0
  46. unary_group_convert.py +192 -0
  47. unary_kernel.c +120 -0
  48. unary_loader.py +202 -0
  49. unary_run.py +203 -0
  50. unary_run16.py +203 -0
README.md ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - quantization
5
+ - unary
6
+ - thermometer-encoding
7
+ - inference-engine
8
+ - low-bit
9
+ language:
10
+ - en
11
+ ---
12
+
13
+ # Unary Quantization Research
14
+
15
+ True unary (base-1) quantization for neural network weights. NOT binary.
16
+
17
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+
19
+ ## Overview
20
+
21
+ Unary means magnitude N = N consecutive 1-bits across N bitplanes. Each bitplane contributes value=1, not binary powers. This eliminates multiplication from inference — only addition and popcount.
22
+
23
+ 7-plane unary gives 8 magnitude levels (15 distinct values with sign), achieving 0.97 cosine similarity per layer against FP32 originals.
24
+
25
+ ## Contents
26
+
27
+ ### Converters (Python)
28
+ - `unary_convert.py` / `unary_convert_v2.py` — Base unary thermometer conversion
29
+ - `convert_proper_unary.py` / `convert_proper_unary_v2.py` — Proper unary with group quantization
30
+ - `convert_log_unary.py` — Log-spaced unary variant
31
+ - `convert_fast.py` — Optimised conversion pipeline
32
+ - `packed_convert.py` / `packed_loader.py` — Packed binary format
33
+ - `convert_qwen3.py` / `convert_qwen3_v2.py` — Qwen3-4B specific converters
34
+
35
+ ### C Inference Engines (AVX-512 + POPCNT)
36
+ - `unary_engine.c` / `unary_engine_v2.c` — Core unary inference
37
+ - `pure_unary_engine.c` — Pure unary (no FP in linear layers)
38
+ - `log_unary_engine.c` — Log-unary engine
39
+ - `proper_unary.c` — Proper unary with group scales
40
+ - `true_unary.c` — True base-1 unary engine
41
+ - `concat_unary.c` — Concatenated unary engine
42
+ - `packed_engine.c` — Packed bitplane engine
43
+ - `unary_full.c` — Full forward pass engine
44
+
45
+ ### Converted Models
46
+ - `deepseek-r1-1.5b-*` — DeepSeek-R1-1.5B in multiple unary variants (4-plane, 7-plane, 31-plane, grouped, packed, ternary baseline)
47
+ - `qwen3-4b-*` — Qwen3-4B-Thinking in unary, log-unary, and proper-unary variants
48
+
49
+ ### Benchmarks and Runners
50
+ - `bench_fwd.py` / `bench_gen.py` / `bench_prompt.py` — Performance benchmarks
51
+ - `inference.py` / `server.py` — Python inference and API server
52
+ - Various `run_*.py` — Model-specific runners
53
+
54
+ ## Key Insight
55
+
56
+ Unary quantization trades bits-per-weight for computational simplicity. All multiply-accumulate operations become popcount + addition, making this particularly suited for edge/CPU inference where SIMD popcount is fast.
57
+
58
+ ## Building
59
+
60
+ ```bash
61
+ gcc -O3 -mavx512f -mavx512bw -mpopcnt -o unary_engine unary_engine.c -lm
62
+ ```
63
+
64
+ ## License
65
+
66
+ Apache 2.0
bench_fwd.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark a single forward pass of the unary C inference engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weight files from MODEL_DIR,
hands the raw buffers to unary_engine.so via ctypes, then times
forward_token() over a few runs and reports tokens/second.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

print("Loading model...")
m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Model loaded, benchmarking single forward pass...")
lib.model_reset_cache(m)

# Time single forward pass (token_id=1, pos=0). perf_counter() is
# monotonic and high-resolution — preferred over time.time() for timing.
times = []
for i in range(3):
    lib.model_reset_cache(m)
    t0 = time.perf_counter()
    lib.forward_token(m, 1, 0)
    dt = time.perf_counter() - t0
    times.append(dt)
    print(f"  forward_token run {i}: {dt:.3f}s")

avg = sum(times) / len(times)
print(f"\nAvg: {avg:.3f}s per token = {1/avg:.1f} tok/s")
print(f"OMP threads: {os.environ.get('OMP_NUM_THREADS', 'default')}")
bench_gen.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark token generation with the unary C inference engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weights, runs the C generate()
entry point greedily from a BOS token, and reports tokens/second plus
the decoded text.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.forward_token.restype = ctypes.c_void_p
lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

print("Loaded. Testing generate with greedy (temp=0)...")
lib.model_reset_cache(m)

inp = np.array([1], dtype=np.int32)  # just BOS token
out = np.zeros(8, dtype=np.int32)

# perf_counter() is monotonic and high-resolution — preferred over
# time.time() for timing.
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, 1, out.ctypes.data, 8,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), 151643)
dt = time.perf_counter() - t0
print(f"Generated {ng} tokens in {dt:.1f}s = {ng/dt:.1f} tok/s")
print(f"Token IDs: {out[:ng].tolist()}")

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
print(f"Text: {tok.decode(out[:ng].tolist())}")
bench_prompt.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Benchmark prompt-conditioned generation with the unary C engine.

Loads the 7-plane unary DeepSeek-R1-1.5B weights, tokenizes a small
prompt with the HF tokenizer, then runs the C generate() entry point
twice (greedy and temperature sampling) and reports tokens/second.
"""
import ctypes, numpy as np, os, time

MODEL_DIR = "deepseek-r1-1.5b-unary"
HF_DIR = "deepseek-r1-1.5b-hf"
lib = ctypes.CDLL("./unary_engine.so")

# ctypes signatures for the engine's C API.
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]*7 + [ctypes.c_int]
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_float, ctypes.c_float, ctypes.c_int]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

# The C engine keeps raw pointers into these numpy buffers, so every
# array passed in must stay alive for the lifetime of the model.
_refs = []
def keep(a):
    """Pin array `a` for the process lifetime and return its data pointer."""
    _refs.append(a)
    return a.ctypes.data

N_PLANES = 7
N_LAYERS = 28
PROJS = ['self_attn_q_proj','self_attn_k_proj','self_attn_v_proj','self_attn_o_proj','mlp_gate_proj','mlp_up_proj','mlp_down_proj']
# (out_features, in_features) per projection, from the 1.5B config.
DIMS = {'self_attn_q_proj':(1536,1536),'self_attn_k_proj':(256,1536),'self_attn_v_proj':(256,1536),'self_attn_o_proj':(1536,1536),'mlp_gate_proj':(8960,1536),'mlp_up_proj':(8960,1536),'mlp_down_proj':(1536,8960)}

m = lib.model_alloc(N_PLANES)
e = np.fromfile(os.path.join(MODEL_DIR, 'model_embed_tokens_weight.fp16'), dtype=np.uint16)
lib.model_set_embed(m, keep(e))
n = np.fromfile(os.path.join(MODEL_DIR, 'model_norm_weight.fp16'), dtype=np.float16).astype(np.float32)
lib.model_set_final_norm(m, keep(n))
h = np.fromfile(os.path.join(MODEL_DIR, 'lm_head_weight.fp16'), dtype=np.uint16)
lib.model_set_lm_head(m, keep(h), 151936, 1536)
for l in range(N_LAYERS):
    inorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_input_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    pnorm = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_post_attention_layernorm_weight.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_norms(m, l, keep(inorm), keep(pnorm))
    qb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_q_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    kb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_k_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    vb = np.fromfile(os.path.join(MODEL_DIR, f'model_layers_{l}_self_attn_v_proj_bias.fp16'), dtype=np.float16).astype(np.float32)
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
    pa = []
    for pn in PROJS:
        base = os.path.join(MODEL_DIR, f'model_layers_{l}_{pn}_weight')
        s = np.fromfile(base + '.sign', dtype=np.uint64)
        p = np.fromfile(base + '.planes', dtype=np.uint64)
        sc = np.fromfile(base + '.scales', dtype=np.float32)
        out_dim, in_dim = DIMS[pn]  # renamed from `od, id`: `id` shadows a builtin
        pa.extend([keep(s), keep(p), keep(sc), out_dim, in_dim])
    lib.layer_set_linears(m, l, *pa, N_PLANES)

from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)

# Test with actual prompt
prompt = "What is 2+2? Think step by step."
ids = tok.encode(prompt)
inp = np.array(ids, dtype=np.int32)
out = np.zeros(64, dtype=np.int32)
lib.model_reset_cache(m)

print(f"Prompt: {prompt} ({len(ids)} tokens)")

# Test greedy first.  perf_counter() is monotonic and high-resolution —
# preferred over time.time() for timing.
print("\n--- Greedy ---")
t0 = time.perf_counter()
ng = lib.generate(m, inp.ctypes.data, len(ids), out.ctypes.data, 64,
                  ctypes.c_float(0.0), ctypes.c_float(0.9), tok.eos_token_id)
dt = time.perf_counter() - t0
text = tok.decode(out[:ng].tolist(), skip_special_tokens=False)
print(f"{ng} tokens, {dt:.1f}s, {ng/dt:.1f} tok/s")
print(f"Output: {text}")

# Test with temperature
print("\n--- Temperature=0.6 ---")
lib.model_reset_cache(m)
out2 = np.zeros(64, dtype=np.int32)
t0 = time.perf_counter()
ng2 = lib.generate(m, inp.ctypes.data, len(ids), out2.ctypes.data, 64,
                   ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
dt2 = time.perf_counter() - t0
text2 = tok.decode(out2[:ng2].tolist(), skip_special_tokens=False)
print(f"{ng2} tokens, {dt2:.1f}s, {ng2/dt2:.1f} tok/s")
print(f"Output: {text2}")
build.sh ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Build and deploy ternary inference engine
# (c) 2026 OpenTransformers Ltd / Scott Bisset
#
# Pipeline: compile the AVX-512 kernel, fetch the HF checkpoint, convert
# it to the ternary on-disk format, then run a quick speed sanity check.
# All path expansions are quoted so the script is safe even if WORKDIR
# is ever moved to a path containing spaces.

set -e

WORKDIR=/root/ternary_engine
MODEL_HF=deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
MODEL_HF_DIR=$WORKDIR/deepseek-r1-1.5b-hf
TERNARY_DIR=$WORKDIR/deepseek-r1-1.5b-ternary

echo "=== Ternary Inference Engine Build ==="
echo "Target: AVX-512 Skylake"
echo ""

mkdir -p "$WORKDIR"
cd "$WORKDIR"

# Step 1: Compile C kernel with AVX-512
echo "[1/4] Compiling AVX-512 kernel..."
gcc -O3 -march=skylake-avx512 -mavx512f -mavx512bw -mavx512dq -mavx512vl \
    -shared -fPIC -lm \
    -o ternary_kernel.so ternary_kernel.c
echo "  -> ternary_kernel.so built"
ls -lh ternary_kernel.so

# Step 2: Download model from HuggingFace
echo ""
echo "[2/4] Downloading model weights..."
pip install --break-system-packages -q safetensors tokenizers 2>/dev/null
python3 -c "
from huggingface_hub import snapshot_download
snapshot_download('$MODEL_HF', local_dir='$MODEL_HF_DIR',
                  ignore_patterns=['*.md', '*.txt', 'figures/*'])
print('Download complete')
"

# Step 3: Convert to ternary
echo ""
echo "[3/4] Converting to ternary format..."
python3 convert.py "$MODEL_HF_DIR" "$TERNARY_DIR" 0.7

# Step 4: Verify
echo ""
echo "[4/4] Verifying..."
ls -lh "$TERNARY_DIR"/ | head -20
echo ""
du -sh "$TERNARY_DIR"/
echo ""

# Quick test
echo "Running speed test..."
python3 -c "
from inference import TernaryQwen, load_kernel
import time
import os

kernel = load_kernel('$WORKDIR/ternary_kernel.so')
model = TernaryQwen('$TERNARY_DIR', kernel)

# Warm up
import numpy as np
cache_module = __import__('inference')
cache = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
hidden = model.forward_token(9707, cache, 0)  # 'Hello'

# Benchmark single token
times = []
for i in range(5):
    cache2 = cache_module.KVCache(model.n_layers, model.n_kv, model.head_dim)
    t0 = time.time()
    h = model.forward_token(9707, cache2, 0)
    times.append(time.time() - t0)

avg = sum(times) / len(times)
print(f'Single token forward: {avg*1000:.1f}ms ({1/avg:.1f} tok/s)')
print(f'Times: {[f\"{t*1000:.1f}ms\" for t in times]}')
"

echo ""
echo "=== Build complete ==="
echo "To start server: cd $WORKDIR && TERNARY_MODEL_DIR=$TERNARY_DIR TOKENIZER_DIR=$MODEL_HF_DIR python3 server.py"
concat_unary ADDED
Binary file (26.1 kB). View file
 
concat_unary.c ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * CONCATENATIVE UNARY ENGINE
3
+ *
4
+ * In base-1, the value IS the count of ones.
5
+ * Addition = concatenation of bitstreams.
6
+ * Multiplication = AND + count.
7
+ *
8
+ * REPRESENTATION:
9
+ * Each element of a vector has:
10
+ * - A sign bit (positive/negative)
11
+ * - A magnitude = number of 1-bits across K "slots"
12
+ *
13
+ * But crucially, when we ADD two unary vectors (residual connection),
14
+ * we DON'T dequantize-add-requantize. We CONCATENATE the slots.
15
+ *
16
+ * If vector A has K_a slots and vector B has K_b slots,
17
+ * A + B has K_a + K_b slots. The magnitude of element j is
18
+ * just the total count of 1-bits at position j across ALL slots.
19
+ *
20
+ * This means the residual stream GROWS through the network:
21
+ * After embed: K_0 slots
22
+ * After layer 1: K_0 + K_attn + K_mlp slots
23
+ * After layer L: K_0 + L*(K_attn + K_mlp) slots
24
+ *
25
+ * No information is ever destroyed by requantization.
26
+ *
27
+ * MATMUL:
28
+ * y = W @ x where W has K_w slots and x has K_x slots.
29
+ * For each output element y[i]:
30
+ * For each slot pair (p from W, q from x):
31
+ * count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
32
+ * - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
33
+ * Output gets K_out = some fixed number of slots (requantized)
34
+ * because matmul output magnitude is in a different scale.
35
+ *
36
+ * SAME-SIGN ADD (residual):
37
+ * Just append slots. Zero compute.
38
+ * For different signs: need cancellation.
39
+ * In practice residual connections are same-sign-dominant,
40
+ * so we track sign separately and concat magnitudes,
41
+ * deferring cancellation to the next norm.
42
+ *
43
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
44
+ */
45
+
46
+ #define _POSIX_C_SOURCE 199309L
47
+ #include <immintrin.h>
48
+ #include <omp.h>
49
+ #include <stdint.h>
50
+ #include <stdlib.h>
51
+ #include <string.h>
52
+ #include <math.h>
53
+ #include <stdio.h>
54
+ #include <time.h>
55
+
56
+ /* ============================================================
57
+ * GROWABLE UNARY VECTOR
58
+ *
59
+ * The key data structure. Slots can be appended (concat = add).
60
+ * Each slot is a bitplane of dim bits packed into uint64 chunks.
61
+ *
62
+ * sign: uint64[chunks] — per-element sign
63
+ * slots: uint64[n_slots * chunks] — each slot is chunks uint64s
64
+ * n_slots: current number of slots (grows via concat)
65
+ * max_slots: allocated capacity
66
+ *
67
+ * For element j:
68
+ * magnitude = number of slots where bit j is set
69
+ * value = sign * magnitude * scale
70
+ *
71
+ * ============================================================ */
72
/* Growable sign-magnitude unary vector.
 *
 * Element j's magnitude is the number of slots whose bit j is set;
 * its value is sign(j) * magnitude * scale. Appending slots
 * (see gv_concat) implements addition without requantization. */
typedef struct {
    uint64_t *sign;   /* [chunks] per-element sign bits (set = negative) */
    uint64_t *slots;  /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
    float scale;      /* per-vector scale factor */
    int dim;          /* number of elements */
    int chunks;       /* (dim+63)/64 — uint64 words per slot */
    int n_slots;      /* current slot count */
    int max_slots;    /* allocated capacity */
} GrowVec;
81
+
82
+ /* Fixed-size unary matrix (weights don't grow) */
83
/* Fixed-size unary weight matrix with K bitplanes (does not grow).
 * Slot-major layout: bit (row r, chunk c) of slot s lives at
 * slots[((size_t)s * rows + r) * chunks + c] — see fm_from_float. */
typedef struct {
    uint64_t *sign;   /* [rows * chunks] per-weight sign bits */
    uint64_t *slots;  /* [K * rows * chunks] thermometer bitplanes */
    float *scales;    /* [rows] per-row scale */
    int rows, cols, chunks, K;  /* chunks = (cols+63)/64 */
} FixedMat;
89
+
90
+ /* ============================================================
91
+ * ALLOCATION
92
+ * ============================================================ */
93
+ GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
94
+ GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec));
95
+ v->dim = dim;
96
+ v->chunks = (dim + 63) / 64;
97
+ v->n_slots = 0;
98
+ v->max_slots = max_slots;
99
+ v->scale = 1.0f;
100
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
101
+ v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t));
102
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
103
+ memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
104
+ return v;
105
+ }
106
+
107
+ void gv_free(GrowVec *v) {
108
+ if (v) { free(v->sign); free(v->slots); free(v); }
109
+ }
110
+
111
+ FixedMat* fm_alloc(int rows, int cols, int K) {
112
+ FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat));
113
+ m->rows = rows; m->cols = cols; m->K = K;
114
+ m->chunks = (cols + 63) / 64;
115
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
116
+ m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
117
+ m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
118
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
119
+ memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
120
+ return m;
121
+ }
122
+
123
+ void fm_free(FixedMat *m) {
124
+ if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); }
125
+ }
126
+
127
+ /* ============================================================
128
+ * FLOAT → UNARY CONVERSION (only at boundaries)
129
+ * ============================================================ */
130
+ void gv_from_float(GrowVec *v, const float *x, int K) {
131
+ int dim = v->dim, chunks = v->chunks;
132
+
133
+ v->n_slots = K;
134
+ memset(v->sign, 0, chunks * sizeof(uint64_t));
135
+ memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));
136
+
137
+ float amax = 0.0f;
138
+ for (int i = 0; i < dim; i++) {
139
+ float a = fabsf(x[i]);
140
+ if (a > amax) amax = a;
141
+ }
142
+ if (amax == 0.0f) { v->scale = 1.0f; return; }
143
+ v->scale = amax / K;
144
+ float inv = K / amax;
145
+
146
+ for (int i = 0; i < dim; i++) {
147
+ int c = i / 64;
148
+ uint64_t bit = 1ULL << (i % 64);
149
+
150
+ if (x[i] < 0.0f) v->sign[c] |= bit;
151
+
152
+ int mag = (int)(fabsf(x[i]) * inv + 0.5f);
153
+ if (mag > K) mag = K;
154
+ for (int s = 0; s < mag; s++)
155
+ v->slots[(size_t)s * chunks + c] |= bit;
156
+ }
157
+ }
158
+
159
+ void gv_to_float(const GrowVec *v, float *out) {
160
+ int dim = v->dim, chunks = v->chunks;
161
+
162
+ for (int i = 0; i < dim; i++) {
163
+ int c = i / 64;
164
+ uint64_t bit = 1ULL << (i % 64);
165
+
166
+ int mag = 0;
167
+ for (int s = 0; s < v->n_slots; s++) {
168
+ if (v->slots[(size_t)s * chunks + c] & bit)
169
+ mag++;
170
+ }
171
+
172
+ float val = (float)mag * v->scale;
173
+ out[i] = (v->sign[c] & bit) ? -val : val;
174
+ }
175
+ }
176
+
177
/* Quantize a row-major float matrix into K thermometer-coded bitplanes.
 *
 * Each row gets an independent scale (per-row absmax / K). Storage is
 * slot-major: bit (r, c) of slot s lives at
 * slots[((size_t)s * rows + r) * chunks + c]. A row that is entirely
 * zero gets scale 1.0 and no set bits. */
void fm_from_float(FixedMat *m, const float *data) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        /* Per-row absmax fixes this row's quantization step. */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
        m->scales[r] = amax / K;
        float inv = K / amax;

        uint64_t *rs = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* Round magnitude to nearest of K levels, clamp, then
             * set bits in slots 0..mag-1 (thermometer encoding). */
            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
207
+
208
+ /* ============================================================
209
+ * CONCATENATION = ADDITION
210
+ *
211
+ * gv_concat(dst, src):
212
+ * Appends src's slots to dst.
213
+ * Same-sign: just append.
214
+ * Different-sign: cancel bits (remove from both).
215
+ *
216
+ * For efficiency with residual connections where scales differ:
217
+ * We track a "slot_scales" or use a single scale with normalization.
218
+ *
219
+ * SIMPLE VERSION: assumes same scale (works after norm).
220
+ * ============================================================ */
221
+
222
+ /* Simple concat: append src slots to dst. Handles sign cancellation. */
223
/* Unary addition via concatenation: dst += src.
 *
 * Appends each of src's slots to dst as one new slot. Where the
 * per-element signs agree, a set bit is simply copied into the new
 * slot; where they disagree, one unit of magnitude cancels, so a set
 * bit is cleared from an existing dst slot instead. If src's magnitude
 * exceeds dst's for some element, the leftover cancellation flips that
 * element's sign bit in dst and the remainder is appended.
 *
 * Assumes dst and src share the same scale (the comment block above
 * calls this the "SIMPLE VERSION" — valid after a norm). On capacity
 * exhaustion a warning is printed and the remaining src slots are
 * dropped. */
void gv_concat(GrowVec *dst, const GrowVec *src) {
    int chunks = dst->chunks;

    /* For each source slot, process element-wise:
     * Where signs agree: copy bit to new dst slot
     * Where signs differ: cancel - find a dst slot with that bit set and clear it
     *
     * Optimization: for most transformer residuals, signs mostly agree.
     * So we do the simple thing: compute per-element sign agreement,
     * then for agreeing elements just append, for disagreeing elements cancel.
     */

    /* Sign agreement mask */
    /* agree[c] = ~(dst_sign[c] ^ src_sign[c]) — bits where signs match */

    for (int s = 0; s < src->n_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

        /* Split into agree and disagree portions */
        int new_slot = dst->n_slots;
        if (new_slot >= dst->max_slots) {
            /* Out of room — would need realloc in production */
            printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
            return;
        }
        uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t src_bits = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: just append to new slot */
            uint64_t to_add = src_bits & agree;

            /* Different sign: cancel from existing dst slots */
            uint64_t to_cancel = src_bits & disagree;

            /* Cancel by walking backwards through dst slots; each
             * cleared bit removes one unit of dst magnitude. */
            for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
                uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *dslot & to_cancel;
                *dslot &= ~overlap; /* clear cancelled bits in dst */
                to_cancel &= ~overlap; /* mark as cancelled */
            }

            /* Any remaining to_cancel means src > dst for those elements
             * — flip the sign and add to new slot */
            if (to_cancel) {
                dst->sign[c] ^= to_cancel; /* flip sign for these elements */
                to_add |= to_cancel;
            }

            dst_new[c] = to_add;
        }

        /* Only increment if new slot is non-empty — keeps n_slots (and
         * therefore downstream matmul cost) from growing needlessly. */
        int non_empty = 0;
        for (int c = 0; c < chunks && !non_empty; c++)
            if (dst_new[c]) non_empty = 1;
        if (non_empty)
            dst->n_slots++;
    }
}
287
+
288
+ /* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */
289
+ void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
290
+ int chunks = dst->chunks;
291
+ int src_slots = src->n_slots;
292
+
293
+ if (dst->n_slots + src_slots > dst->max_slots) {
294
+ printf("WARNING: GrowVec overflow\n");
295
+ src_slots = dst->max_slots - dst->n_slots;
296
+ }
297
+
298
+ /* Just memcpy the slots — handles same-sign correctly,
299
+ * defers opposite-sign cancellation to next norm */
300
+ memcpy(dst->slots + (size_t)dst->n_slots * chunks,
301
+ src->slots,
302
+ (size_t)src_slots * chunks * sizeof(uint64_t));
303
+ dst->n_slots += src_slots;
304
+ }
305
+
306
+ /* ============================================================
307
+ * MATMUL: y = M @ x
308
+ *
309
+ * M is fixed (K_w slots), x is growable (n_slots slots).
310
+ * Output is a NEW GrowVec with K_out slots.
311
+ *
312
+ * Core: for each output element i, accumulate:
313
+ * acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
314
+ * - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
315
+ *
316
+ * Then quantize acc to K_out unary slots.
317
+ * ============================================================ */
318
/* y = M @ x over bitplane-decomposed operands.
 *
 * Per output row the dot product is accumulated as signed popcounts over
 * every (weight plane p, activation plane q) pair:
 *   acc += popcount(w_p[i] & x_q & same_sign)
 *        - popcount(w_p[i] & x_q & diff_sign)
 * Cost is O(out_dim * chunks * wK * xK); rows are OpenMP-parallel.
 * The float intermediate is requantized into y with K_out slots. */
void gv_matmul(
    const FixedMat *M,
    const GrowVec *x,
    GrowVec *y,       /* output — gets filled with K_out slots */
    int K_out         /* how many output slots */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    /* NOTE(review): C11 aligned_alloc requires the size to be a multiple of
     * the alignment; out_dim * 4 need not be a multiple of 64. glibc accepts
     * it, strict implementations may return NULL — confirm target libc. */
    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);  /* product positive where signs match */
            uint64_t diff = ws ^ xs;     /* product negative where signs differ */

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    acc += __builtin_popcountll(active & same)
                         - __builtin_popcountll(active & diff);
                }
            }
        }

        /* Combine the integer count with the per-row weight scale and the
         * activation scale to recover a float dot product. */
        y_float[i] = (float)acc * M->scales[i] * x->scale;
    }

    /* Quantize to K_out slots */
    gv_from_float(y, y_float, K_out);
    free(y_float);
}
361
+
362
+ /* ============================================================
363
+ * NORM: GrowVec → GrowVec with controlled slot count
364
+ *
365
+ * RMSNorm dequantizes (counting), normalizes (float),
366
+ * then requantizes to a fixed K.
367
+ * This is where slot count gets reset.
368
+ * ============================================================ */
369
+ void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
370
+ int dim = x->dim;
371
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
372
+ gv_to_float(x, xf);
373
+
374
+ float ss = 0.0f;
375
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
376
+ float rms = 1.0f / sqrtf(ss / dim + eps);
377
+ for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];
378
+
379
+ gv_from_float(out, xf, K_out);
380
+ free(xf);
381
+ }
382
+
383
+ /* ============================================================
384
+ * SILU_MUL: out = SiLU(gate) * up
385
+ * Dequant, compute, requant. O(dim).
386
+ * ============================================================ */
387
+ void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
388
+ int dim = gate->dim;
389
+ float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
390
+ float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
391
+ gv_to_float(gate, gf);
392
+ gv_to_float(up, uf);
393
+
394
+ for (int i = 0; i < dim; i++)
395
+ gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
396
+
397
+ gv_from_float(out, gf, K_out);
398
+ free(gf); free(uf);
399
+ }
400
+
401
+ /* ============================================================
402
+ * TEST: demonstrate growing residual stream
403
+ * ============================================================ */
404
/* Demonstrates that concatenating two quantized vectors' slots recovers
 * their elementwise sum (when scales match), printing float reference vs
 * the concat result for the first 8 of 16 elements. */
void test_concat_add() {
    printf("=== CONCATENATION = ADDITION TEST ===\n\n");

    int dim = 16;

    /* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */
    float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
    float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};

    GrowVec *a = gv_alloc(dim, 8, 64);
    GrowVec *b = gv_alloc(dim, 8, 64);
    gv_from_float(a, a_vals, 8);
    gv_from_float(b, b_vals, 8);

    printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
    float af[16], bf[16];
    gv_to_float(a, af);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n");

    printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
    gv_to_float(b, bf);
    printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n");
    printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n");

    /* Concatenate (= add) */
    printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);

    /* Need same scale for concat to work correctly */
    /* In a real network, both come from norm so they have comparable scale */
    /* For this test, use fast concat (no cancellation) */
    /* NOTE(review): the comment above says "fast concat" but the cancelling
     * gv_concat is what is actually called — confirm which was intended. */
    gv_concat(a, b);
    printf(" -> %d):\n", a->n_slots);

    float result[16], ref[16];
    gv_to_float(a, result);
    for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i];

    /* NOTE: concat addition only works correctly when scales match.
     * When scales differ, we'd need to adjust. In a transformer,
     * the norm before each sublayer ensures comparable scales. */

    printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n");
    printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n");

    gv_free(a); gv_free(b);
}
451
+
452
+ void test_growing_residual() {
453
+ printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
454
+ printf("Simulating 6 transformer layers with concat residuals\n\n");
455
+
456
+ int dim = 2560;
457
+ int K_embed = 16; /* initial embedding quantization */
458
+ int K_sublayer = 8; /* each sublayer output */
459
+ int n_layers = 6;
460
+
461
+ /* Create random embedding */
462
+ float *embed = (float *)malloc(dim * sizeof(float));
463
+ srand(42);
464
+ for (int i = 0; i < dim; i++) {
465
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
466
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
467
+ embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
468
+ }
469
+
470
+ /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */
471
+ int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;
472
+ GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
473
+ gv_from_float(residual, embed, K_embed);
474
+
475
+ printf("After embedding: %d slots (%.1f KB)\n",
476
+ residual->n_slots,
477
+ (float)residual->n_slots * residual->chunks * 8 / 1024);
478
+
479
+ for (int l = 0; l < n_layers; l++) {
480
+ /* Simulate attention output */
481
+ GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer);
482
+ float *fake_attn = (float *)malloc(dim * sizeof(float));
483
+ for (int i = 0; i < dim; i++) {
484
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
485
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
486
+ fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
487
+ }
488
+ gv_from_float(attn_out, fake_attn, K_sublayer);
489
+ /* Scale must match for concat to work — in real net, norm handles this */
490
+ attn_out->scale = residual->scale;
491
+
492
+ /* RESIDUAL ADD = CONCATENATION */
493
+ gv_concat_fast(residual, attn_out);
494
+
495
+ /* Simulate MLP output */
496
+ GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer);
497
+ float *fake_mlp = (float *)malloc(dim * sizeof(float));
498
+ for (int i = 0; i < dim; i++) {
499
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
500
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
501
+ fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
502
+ }
503
+ gv_from_float(mlp_out, fake_mlp, K_sublayer);
504
+ mlp_out->scale = residual->scale;
505
+
506
+ /* RESIDUAL ADD = CONCATENATION */
507
+ gv_concat_fast(residual, mlp_out);
508
+
509
+ printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
510
+ l + 1, residual->n_slots,
511
+ (float)residual->n_slots * residual->chunks * 8 / 1024,
512
+ K_sublayer, K_sublayer);
513
+
514
+ gv_free(attn_out); gv_free(mlp_out);
515
+ free(fake_attn); free(fake_mlp);
516
+ }
517
+
518
+ printf("\nResidual grew from %d to %d slots through %d layers\n",
519
+ K_embed, residual->n_slots, n_layers);
520
+ printf("Information accumulated, never lost to requantization\n");
521
+
522
+ gv_free(residual);
523
+ free(embed);
524
+ }
525
+
526
+ void test_matmul_accuracy() {
527
+ printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");
528
+
529
+ int rows = 512, cols = 2560;
530
+ int wK = 32;
531
+
532
+ printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
533
+ printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");
534
+
535
+ srand(42);
536
+ float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
537
+ float *xf = (float *)malloc(cols * sizeof(float));
538
+ float *y_ref = (float *)calloc(rows, sizeof(float));
539
+
540
+ for (size_t i = 0; i < (size_t)rows * cols; i++) {
541
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
542
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
543
+ Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
544
+ }
545
+ for (int i = 0; i < cols; i++) {
546
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
547
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
548
+ xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
549
+ }
550
+ for (int i = 0; i < rows; i++)
551
+ for (int j = 0; j < cols; j++)
552
+ y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
553
+
554
+ FixedMat *M = fm_alloc(rows, cols, wK);
555
+ fm_from_float(M, Mf);
556
+
557
+ /* Test with different x slot counts (simulating growing residual) */
558
+ int x_slots[] = {8, 16, 32, 48, 64, 96};
559
+ for (int t = 0; t < 6; t++) {
560
+ int xK = x_slots[t];
561
+ GrowVec *x = gv_alloc(cols, xK, xK);
562
+ GrowVec *y = gv_alloc(rows, xK, xK);
563
+ gv_from_float(x, xf, xK);
564
+
565
+ struct timespec t0, t1;
566
+ float *yf = (float *)malloc(rows * sizeof(float));
567
+
568
+ clock_gettime(CLOCK_MONOTONIC, &t0);
569
+ gv_matmul(M, x, y, xK);
570
+ clock_gettime(CLOCK_MONOTONIC, &t1);
571
+ double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;
572
+
573
+ gv_to_float(y, yf);
574
+
575
+ float dot = 0, na = 0, nb = 0, noise = 0;
576
+ for (int i = 0; i < rows; i++) {
577
+ dot += y_ref[i] * yf[i];
578
+ na += y_ref[i] * y_ref[i];
579
+ nb += yf[i] * yf[i];
580
+ float e = y_ref[i] - yf[i];
581
+ noise += e * e;
582
+ }
583
+ float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
584
+ float snr = 10.0f * log10f(na / (noise + 1e-10f));
585
+
586
+ printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);
587
+
588
+ gv_free(x); gv_free(y); free(yf);
589
+ }
590
+
591
+ fm_free(M);
592
+ free(Mf); free(xf); free(y_ref);
593
+ }
594
+
595
/* Entry point: runs the three demonstrations in sequence —
 * concat-as-addition, growing residual stream, and matmul accuracy. */
int main() {
    printf("========================================\n");
    printf(" CONCATENATIVE UNARY ENGINE TESTS\n");
    printf(" Addition = Concatenation\n");
    printf(" Value = Count of Ones\n");
    printf("========================================\n");

    test_concat_add();
    test_growing_residual();
    test_matmul_accuracy();

    printf("\n=== ALL TESTS DONE ===\n");
    return 0;
}
convert.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert DeepSeek-R1-Distill-Qwen-1.5B to ternary format.
4
+
5
+ Stores linear weights as bitplanes (pos_mask, neg_mask) + per-row scale.
6
+ Embeddings and layernorms stay FP16. LM head stays FP16.
7
+
8
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
9
+ """
10
+
11
+ import os
12
+ import json
13
+ import struct
14
+ import numpy as np
15
+ from pathlib import Path
16
+ import time
17
+
18
def load_safetensors(model_dir):
    """Collect every tensor from all *.safetensors shards under model_dir.

    Returns a dict mapping tensor name -> float32 numpy array.
    """
    import torch; from safetensors.torch import load_file

    out = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard.name}...")
        for name, tensor in load_file(str(shard)).items():
            out[name] = tensor.float().numpy()
    return out
29
+
30
def quantize_row_ternary(row, alpha=0.7):
    """Quantize a single row to ternary {-1, 0, +1}. Vectorized bitpacking.

    A weight maps to +1/-1 when |w| >= alpha * mean(|row|), else to 0.

    Returns (pos_bits, neg_bits, scale): two little-endian uint64 bitplane
    arrays of ceil(len(row)/64) chunks (bit j of chunk c = column c*64+j)
    and the float32 reconstruction scale (mean |w| over surviving weights,
    1.0 when the whole row quantizes to zero).

    Fix: the previous version had a duplicated, unreachable return statement.
    """
    row = row.astype(np.float32)
    mean_abs = np.mean(np.abs(row))
    threshold = alpha * mean_abs

    pos = row >= threshold
    neg = row <= -threshold

    nz_mask = pos | neg
    scale = np.mean(np.abs(row[nz_mask])) if nz_mask.any() else np.float32(1.0)

    # Pad to multiple of 64 so the row packs into whole uint64 chunks
    in_dim = len(row)
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros(pad, dtype=bool)])
        neg = np.concatenate([neg, np.zeros(pad, dtype=bool)])

    # Vectorized bitpack: reshape to [chunks, 64], multiply by bit positions, sum
    pos_r = pos.reshape(-1, 64).astype(np.uint64)
    neg_r = neg.reshape(-1, 64).astype(np.uint64)
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))
    pos_bits = np.bitwise_or.reduce(pos_r * bit_positions, axis=1)
    neg_bits = np.bitwise_or.reduce(neg_r * bit_positions, axis=1)

    return pos_bits, neg_bits, np.float32(scale)
59
+
60
def quantize_weight_matrix(weight, alpha=0.7):
    """Quantize entire weight matrix [out_dim, in_dim] to ternary. Fully vectorized.

    A weight survives as +1/-1 when |w| >= alpha * mean(|row|), else 0.

    Returns (pos_bits, neg_bits, scales, sparsity):
      pos_bits/neg_bits: [out_dim, ceil(in_dim/64)] uint64 bitplanes
        (bit j of chunk c = column c*64+j),
      scales: [out_dim] float32 per-row reconstruction scale,
      sparsity: fraction of weights quantized to zero.

    Fix: the per-row scale computation was a Python loop over out_dim; it is
    now vectorized (sum/count over the nonzero mask) with identical values.
    """
    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape

    # Per-row thresholds
    row_means = np.mean(np.abs(w), axis=1, keepdims=True)
    thresholds = alpha * row_means

    pos = w >= thresholds  # [out_dim, in_dim]
    neg = w <= -thresholds

    # Per-row scales: mean |w| over surviving weights, vectorized.
    # Rows with no survivors fall back to 1.0 (counts clamp avoids /0).
    nz = pos | neg
    counts = nz.sum(axis=1)
    sums = np.where(nz, np.abs(w), 0.0).sum(axis=1)
    scales = np.where(counts > 0, sums / np.maximum(counts, 1), 1.0).astype(np.float32)

    # Sparsity
    total = out_dim * in_dim
    sparsity = 1.0 - np.sum(nz) / total

    # Pad to multiple of 64
    pad = (64 - in_dim % 64) % 64
    if pad:
        pos = np.concatenate([pos, np.zeros((out_dim, pad), dtype=bool)], axis=1)
        neg = np.concatenate([neg, np.zeros((out_dim, pad), dtype=bool)], axis=1)

    padded_dim = pos.shape[1]
    chunks = padded_dim // 64

    # Vectorized bitpacking for entire matrix at once
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))  # [64]

    pos_r = pos.reshape(out_dim, chunks, 64).astype(np.uint64)  # [out, chunks, 64]
    neg_r = neg.reshape(out_dim, chunks, 64).astype(np.uint64)

    all_pos = np.bitwise_or.reduce(pos_r * bit_positions, axis=2)  # [out, chunks]
    all_neg = np.bitwise_or.reduce(neg_r * bit_positions, axis=2)

    return all_pos, all_neg, scales, sparsity
105
+
106
def save_ternary_model(tensors, output_dir, alpha=0.7):
    """Convert and save full model to ternary format.

    Linear projection weights are ternarized into .pos/.neg bitplanes plus
    per-row .scales; everything else (embeddings, norms, biases, lm_head)
    is written as raw .fp16 files.  A config.json and manifest.json are
    emitted alongside so the inference engine can reload the layout.

    NOTE(review): the config below is hard-coded for
    DeepSeek-R1-Distill-Qwen-1.5B — confirm before reusing on other models.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "alpha": alpha,
    }

    # Identify which tensors to ternarize vs keep as-is
    ternary_keys = []  # Linear weights to ternarize
    keep_keys = []     # Embeddings, norms, biases to keep as FP16

    for key in tensors:
        if any(p in key for p in ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                                  'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                                  'down_proj.weight']):
            ternary_keys.append(key)
        else:
            keep_keys.append(key)

    print(f"\nTernary layers: {len(ternary_keys)}")
    print(f"FP16 layers: {len(keep_keys)}")

    # Save config
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    # Save ternary weights, tracking byte totals for the summary
    total_ternary_bytes = 0
    total_original_bytes = 0

    for key in ternary_keys:
        w = tensors[key].astype(np.float32)
        out_dim, in_dim = w.shape
        total_original_bytes += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        # Save as binary (dots in tensor names become underscores on disk)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        ternary_bytes = pos.nbytes + neg.nbytes + scales.nbytes
        total_ternary_bytes += ternary_bytes
        ratio = w.nbytes / ternary_bytes

        print(f" {key}: {w.shape} -> ternary ({ternary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compression, {sparsity:.1%} sparse, {dt:.1f}s)")

    # Save FP16 weights
    total_fp16_bytes = 0
    for key in keep_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16_bytes += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Save tensor manifest (shapes per storage class, for the loader)
    manifest = {
        "ternary": {k: list(tensors[k].shape) for k in ternary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total_bytes = total_ternary_bytes + total_fp16_bytes
    orig_bytes = total_original_bytes + total_fp16_bytes
    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Ternary linear weights: {total_ternary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Compression vs FP32: {orig_bytes/total_bytes:.1f}x")
193
+
194
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [alpha]
    import sys
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = argv[2] if len(argv) > 2 else "deepseek-r1-1.5b-ternary"
    alpha = float(argv[3]) if len(argv) > 3 else 0.7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to ternary (alpha={alpha})...")
    save_ternary_model(tensors, output_dir, alpha)
    print("Done!")
convert_fast.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ FAST proper unary converter — vectorized bitpacking via numpy.
4
+
5
+ Instead of iterating columns one at a time, processes plane-by-plane
6
+ with vectorized comparisons, then packs to uint64 using np.packbits.
7
+
8
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
9
+ """
10
+
11
+ import torch, json, os, sys, gc, shutil
12
+ from safetensors import safe_open
13
+ import numpy as np
14
+
15
+
16
def pack_bits_to_uint64(bool_matrix):
    """
    Pack [rows, cols] boolean → [rows, chunks] uint64
    where chunks = ceil(cols/64).

    Bit j of element (r, c) corresponds to column c*64+j.
    Uses little-endian bit ordering within each uint64.

    Implementation: np.packbits(bitorder='little') followed by a uint64
    reinterpretation — one C-level pass instead of the previous
    64-iteration Python shift loop.  Assumes a little-endian host (the
    same assumption the log-unary converter already makes).
    """
    rows, cols = bool_matrix.shape
    chunks = (cols + 63) // 64

    # Pad cols to a multiple of 64 so each row is a whole number of chunks
    if cols % 64:
        padded = np.zeros((rows, chunks * 64), dtype=np.uint8)
        padded[:, :cols] = bool_matrix.astype(np.uint8)
    else:
        padded = np.ascontiguousarray(bool_matrix, dtype=np.uint8)

    # [rows, chunks*8] uint8 bytes, LSB-first → view as [rows, chunks] uint64
    packed = np.packbits(padded, axis=1, bitorder="little")
    return packed.view(np.uint64)
44
+
45
+
46
def encode_fast(weight_f32_np, quantum, K):
    """
    Fast vectorized proper unary (thermometer) encoding.

    weight_f32_np: [rows, cols] numpy float32
    Returns: sign [rows, chunks] uint64, slots [K, rows, chunks] uint64, clip_count
    """
    rows, cols = weight_f32_np.shape
    n_chunks = (cols + 63) // 64

    # Integer magnitude per weight, counted then clipped to K levels.
    inv_q = 1.0 / quantum
    mags = np.round(np.abs(weight_f32_np) * inv_q).astype(np.int32)
    clip_count = int(np.sum(mags > K))
    mags = np.clip(mags, 0, K)

    # One sign bitplane: bit set where the weight is negative.
    sign_packed = pack_bits_to_uint64(weight_f32_np < 0)

    # Thermometer planes: plane p is set wherever magnitude exceeds p.
    slots_packed = np.zeros((K, rows, n_chunks), dtype=np.uint64)
    for p in range(K):
        slots_packed[p] = pack_bits_to_uint64(mags > p)
        if (p + 1) % 8 == 0 or p == K - 1:
            print(f" plane {p+1}/{K}", end="\r", flush=True)

    print(f" {K}/{K} planes done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
77
+
78
+
79
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert a safetensors checkpoint to the proper-unary on-disk format.

    Two passes over the shards: first a sampling scan to pick a global
    quantum (clip_pct percentile of sampled |w| divided by K), then the
    actual encode — 2D non-norm/non-embed tensors become .usign/.uslots
    bitplane files, everything else raw .fp16.  Resumable: existing output
    files are skipped.  Writes manifest.json and copies tokenizer/config.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('model_type', '?')}")
    print(f" Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Index: sharded checkpoints carry a weight_map; single-file ones don't
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shards = sorted(set(index["weight_map"].values()))
        weight_map = index["weight_map"]
    else:
        shards = ["model.safetensors"]
        weight_map = None

    # Scan for quantum: sample 2000 |w| values per linear tensor
    print("\nScanning weights...")
    all_abs = []
    linear_names = []
    global_max = 0.0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        print(f" {shard}...")
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                t = f.get_tensor(name).float()
                # heuristic: 2D and neither a norm nor an embedding = linear
                if t.dim() == 2 and "norm" not in name and "embed" not in name:
                    linear_names.append(name)
                    am = t.abs().max().item()
                    if am > global_max: global_max = am
                    idx = torch.randint(0, t.numel(), (2000,))
                    all_abs.append(t.flatten()[idx].abs())

    # quantum = step size so that the clip_pct percentile maps to level K
    all_abs_t = torch.cat(all_abs)
    clip_val = torch.quantile(all_abs_t, clip_pct / 100.0).item()
    quantum = clip_val / K

    print(f"\n Absmax={global_max:.6f} P{clip_pct}={clip_val:.6f}")
    print(f" K={K} quantum={quantum:.8f}")

    # Report the magnitude distribution the chosen quantum induces
    mags = (all_abs_t / quantum).round().clamp(0, K)
    print(f" Mean mag={mags.mean():.1f} Median={mags.median():.1f} Zero={100*(mags==0).float().mean():.1f}% Clipped={100*(mags==K).float().mean():.1f}%")

    del all_abs, all_abs_t, mags
    gc.collect()

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    total_unary = 0
    total_fp16 = 0
    total_clip = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)

        # Get linear names in this shard
        shard_lins = [n for n in linear_names if (weight_map or {}).get(n, "model.safetensors") == shard]
        print(f"\n{shard}: {len(shard_lins)} linear layers")

        with safe_open(path, framework="pt") as f:
            # Non-linear → FP16
            for name in f.keys():
                if name in linear_names:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                # NOTE(review): on resume, pre-existing .fp16 files are not
                # re-registered in the manifest/byte totals — confirm intended.
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half().numpy()
                    t.view(np.uint16).tofile(out_path)
                    total_fp16 += os.path.getsize(out_path)
                    manifest["fp16"].append(name)
                    print(f" FP16: {name} {t.shape}")

            # Linear → proper unary
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                # Resume support: reuse files already written by a prior run
                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    t_shape = list(f.get_tensor(name).shape)
                    manifest["unary"][name] = t_shape
                    total_unary += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f" Skip: {name}")
                    continue

                t = f.get_tensor(name).float().numpy()
                rows, cols = t.shape
                print(f" {name} [{rows}x{cols}]", flush=True)

                sign_p, slots_p, clip_c = encode_fast(t, quantum, K)
                total_clip += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                mb = (s_sz + sl_sz) / 1e6
                print(f" → {mb:.1f} MB ({s_sz//1024}KB sign + {sl_sz//1024}KB slots)")

                # large ndarrays: drop references before the next tensor
                del t, sign_p, slots_p
                gc.collect()

    # Copy tokenizer/config files so the output dir is self-contained
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    total = total_unary + total_fp16
    print(f"\n{'='*60}")
    print(f"DONE: {done} layers, quantum={quantum:.8f}, K={K}")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e6:.1f} MB")
    print(f" Total: {total/1e9:.2f} GB (vs ~7.6 GB BF16 = {total/7.6e9:.1f}x)")
    print(f" Clipped: {total_clip} values")
    print(f"{'='*60}")
+
221
if __name__ == "__main__":
    # CLI: convert_fast.py [model_dir] [output_dir] [K] [clip_pct]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    clip = float(argv[4]) if len(argv) > 4 else 99.9
    convert(model_dir, output_dir, K=K, clip_pct=clip)
convert_log_unary.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Log-unary converter.
4
+ Instead of thermometer (plane p = mag > p), uses binary decomposition
5
+ (plane p = bit p of magnitude). Fewer planes, wider dynamic range.
6
+
7
+ 3 log-planes: 9 levels (-4 to +4), storage = 3 bitplanes
8
+ vs 7 linear planes: 15 levels (-7 to +7), storage = 7 bitplanes
9
+
10
+ 4 log-planes: 17 levels (-8 to +8), storage = 4 bitplanes <-- sweet spot
11
+ 5 log-planes: 33 levels (-16 to +16), storage = 5 bitplanes
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+ import numpy as np
16
+ import os, sys, json, time, gc
17
+
18
def quantize_log_unary(w_fp32, n_planes):
    """Quantize weight matrix to log-unary format (binary magnitude planes).

    Each row is scaled so its abs-max maps to 2**n_planes - 1; plane p then
    stores bit p of the integer magnitude, and a separate bitplane holds
    the signs.  Returns (sign [rows, chunks] uint64,
    planes [n_planes, rows, chunks] uint64, scales [rows] float32).
    """
    out_dim, in_dim = w_fp32.shape
    top = (1 << n_planes) - 1  # largest representable magnitude

    # Per-row scale from the row's absolute maximum (1.0 for all-zero rows)
    row_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.flatten() / top).astype(np.float32)

    # Signed integer levels in [-top, top]
    q = np.clip(np.round(w_fp32 / row_max * top), -top, top).astype(np.int32)
    neg = q < 0
    mag = np.abs(q)

    # Pad columns out to whole 64-bit chunks
    chunks = (in_dim + 63) // 64
    width = chunks * 64
    if width > in_dim:
        neg = np.pad(neg, ((0, 0), (0, width - in_dim)), constant_values=False)
        mag = np.pad(mag, ((0, 0), (0, width - in_dim)), constant_values=0)

    def _pack(bits):
        # LSB-first byte packing reinterpreted as little-endian uint64
        b = np.packbits(bits.astype(np.uint8), axis=1, bitorder='little')
        return b.view(np.uint64)[:, :chunks]

    sign_u64 = _pack(neg)

    planes = np.empty((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        planes[p] = _pack((mag >> p) & 1)  # bit p of every magnitude

    return sign_u64, planes, scales
54
+
55
def convert_model(model_dir, output_dir, n_planes=4):
    """Convert a safetensors checkpoint to log-unary on-disk format.

    2D "proj" weight tensors are encoded via quantize_log_unary into
    .sign/.planes/.scales files; all other tensors are written raw as
    .fp16.  Resumable (existing outputs are skipped), writes manifest.json
    and copies tokenizer/config files into output_dir.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    max_level = (1 << n_planes) - 1  # largest magnitude n_planes can encode

    # Sharded checkpoints list their shards in an index file
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"LOG-UNARY CONVERSION")
    print(f" Model: {n_layers} layers, hidden={hidden}")
    print(f" Log-planes: {n_planes} -> {2*max_level+1} levels (range -{max_level}..+{max_level})")
    print(f" Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers,
                "encoding": "log_unary", "config": config}

    # Progress denominator; 0 when no index is present (weight_map is None)
    total_linear = sum(1 for k in (weight_map or {}) if k.endswith(".weight") and "proj" in k)
    converted = 0

    import torch
    from safetensors import safe_open

    for si, shard in enumerate(shards):
        path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {si+1}/{len(shards)}: {shard} ===")

        with safe_open(path, framework="pt") as f:
            for key in sorted(f.keys()):
                fname = key.replace(".", "_")
                # heuristic: 2D ".weight" tensors under a "proj" module = linear
                is_linear = key.endswith(".weight") and "proj" in key and f.get_tensor(key).dim() == 2

                if is_linear:
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        # resume: keep manifest entry, skip re-encoding
                        manifest["unary"][key] = list(f.get_tensor(key).shape)
                        converted += 1
                        print(f" [SKIP] {key}")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_log_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    manifest["unary"][key] = list(w.shape)
                    converted += 1
                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    print(f" [{converted}/{total_linear}] {key}: {list(w.shape)} "
                          f"-> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")
                    del w, sign, planes, scales
                else:
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        manifest["fp16"][key] = list(f.get_tensor(key).shape)
                        print(f" [SKIP] {key}")
                        continue

                    # write raw fp16 bits (viewed as uint16 for tofile)
                    w = f.get_tensor(key).float().numpy()
                    w_fp16 = w.astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(w.shape)
                    print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del w, w_fp16

        gc.collect()

    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy tokenizer/config so output_dir is self-contained
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir) if f.endswith((".sign",".planes",".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir) if f.endswith(".fp16"))

    print(f"\n=== LOG-UNARY CONVERSION COMPLETE ===")
    print(f" Encoding: {n_planes} log-planes (binary magnitude)")
    print(f" Unary: {total_unary/1e9:.2f} GB")
    print(f" FP16: {total_fp16/1e9:.2f} GB")
    print(f" Total: {(total_unary+total_fp16)/1e9:.2f} GB")
+ print(f" Total: {(total_unary+total_fp16)/1e9:.2f} GB")
154
+
155
+ if __name__ == "__main__":
156
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-hf"
157
+ output_dir = sys.argv[2] if len(sys.argv) > 2 else "qwen3-4b-log-unary"
158
+ n_planes = int(sys.argv[3]) if len(sys.argv) > 3 else 4
159
+ convert_model(model_dir, output_dir, n_planes)
convert_proper_unary.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Qwen3-4B BF16 safetensors → Proper Unary.
4
+ Reads safetensors raw bytes (no framework dependency for BF16).
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import json, os, sys, gc, shutil, struct, time
9
+
10
class SafeTensorReader:
    """Read safetensors one tensor at a time (memory efficient).

    Parses the 8-byte little-endian header length and the JSON header up
    front, then seeks to each tensor's byte range only when requested.
    Supported payload dtypes: BF16, F16, F32 — all returned as float32.

    Also usable as a context manager so the file handle is released even
    when conversion code raises mid-shard.
    """

    def __init__(self, path):
        self.f = open(path, "rb")
        header_size = struct.unpack("<Q", self.f.read(8))[0]
        self.header = json.loads(self.f.read(header_size).decode("utf-8"))
        self.data_start = 8 + header_size
        # "__metadata__" is file-level bookkeeping, not a tensor entry.
        self._meta = {k: v for k, v in self.header.items() if k != "__metadata__"}

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def keys(self):
        """Return the list of tensor names stored in this shard."""
        return list(self._meta.keys())

    def get(self, name):
        """Load one tensor by name and return it as a float32 ndarray.

        Raises KeyError for unknown names and ValueError for dtypes other
        than BF16/F16/F32.
        """
        meta = self._meta[name]
        dtype = meta["dtype"]
        shape = tuple(meta["shape"])
        start, end = meta["data_offsets"]
        self.f.seek(self.data_start + start)
        raw = self.f.read(end - start)

        if dtype == "BF16":
            # BF16 is the top 16 bits of an IEEE-754 float32: widen by
            # shifting into the high half of a uint32 and reinterpret.
            u16 = np.frombuffer(raw, dtype=np.uint16)
            u32 = u16.astype(np.uint32) << 16
            return u32.view(np.float32).reshape(shape)
        elif dtype == "F16":
            return np.frombuffer(raw, dtype=np.float16).reshape(shape).astype(np.float32)
        elif dtype == "F32":
            # copy() detaches from the read-only frombuffer view.
            return np.frombuffer(raw, dtype=np.float32).reshape(shape).copy()
        else:
            raise ValueError(f"Unknown dtype {dtype}")

    def close(self):
        """Release the underlying file handle."""
        self.f.close()
43
+
44
def encode_proper_unary(weight_f32, K):
    """Encode a 2D float32 matrix to proper unary bitplanes.

    Each row is scaled so its abs-max maps to K; every weight becomes an
    integer magnitude in [0, K] plus a sign bit. Plane s has bit j set
    when |w[row, j]| quantizes to more than s slots.

    Returns:
        sign_bits:   uint64 [rows, chunks]    sign bit per weight (1 = negative)
        slot_planes: uint64 [K, rows, chunks] thermometer-coded magnitude planes
        row_scales:  float32 [rows]           per-row dequantization scale

    Packing is vectorized with np.packbits (little-endian bit order, so
    bit j of chunk c maps to column c*64 + j) instead of a per-column
    Python loop — identical bit layout, orders of magnitude faster.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    pad = chunks * 64 - cols

    row_absmax = np.abs(weight_f32).max(axis=1).astype(np.float32)
    row_absmax = np.maximum(row_absmax, 1e-10)  # guard all-zero rows
    row_scales = (row_absmax / K).astype(np.float32)

    inv_scales = K / row_absmax
    magnitudes = np.clip(
        np.round(np.abs(weight_f32) * inv_scales[:, None]).astype(np.int32), 0, K)

    def _pack(mask):
        # Pack a boolean [rows, cols] mask into little-endian uint64 words.
        if pad:
            mask = np.pad(mask, ((0, 0), (0, pad)))
        bits = np.packbits(mask.astype(np.uint8), axis=1, bitorder='little')
        return np.ascontiguousarray(bits).view(np.uint64)

    # Sign follows the raw float sign, even for weights that quantize to 0.
    sign_bits = _pack(weight_f32 < 0)

    slot_planes = np.zeros((K, rows, chunks), dtype=np.uint64)
    for s in range(K):
        active = magnitudes > s
        if not active.any():
            break  # magnitudes are monotone: no higher plane can be set
        slot_planes[s] = _pack(active)

    return sign_bits, slot_planes, row_scales
79
+
80
def convert_model(model_dir, output_dir, K=32):
    """Convert a safetensors model directory to proper-unary format on disk.

    Linear projection weights (q/k/v/o/gate/up/down) are quantized with
    encode_proper_unary and written as .sign/.slots/.scales files; every
    other tensor is dumped raw as FP16. manifest.json records which keys
    went where.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = json.load(open(os.path.join(model_dir, "config.json")))  # NOTE(review): loaded but never used below

    # Copy tokenizer/config files alongside the converted weights.
    for f in ["config.json", "tokenizer.json", "tokenizer_config.json",
              "special_tokens_map.json", "generation_config.json"]:
        src = os.path.join(model_dir, f)
        if os.path.exists(src):
            shutil.copy2(src, output_dir)

    # Multi-shard models carry an index file mapping keys to shards.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shard_files = sorted(set(index["weight_map"].values()))
    else:
        shard_files = ["model.safetensors"]

    # Substrings identifying the linear projections we quantize.
    linear_names = ["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"]

    manifest = {"K": K, "format": "proper_unary", "unary": {}, "fp16": []}
    total_linear = 0
    total_size = 0

    for shard_name in shard_files:
        shard_path = os.path.join(model_dir, shard_name)
        print(f"\n=== {shard_name} ===", flush=True)

        reader = SafeTensorReader(shard_path)
        print(f" {len(reader.keys())} tensors", flush=True)

        for key in sorted(reader.keys()):
            tensor = reader.get(key)
            fname = key.replace(".", "_")  # flatten dotted key for filesystem

            is_linear = any(ln + ".weight" in key for ln in linear_names)

            if is_linear and tensor.ndim == 2:
                rows, cols = tensor.shape
                t0 = time.time()
                print(f" {key}: {rows}x{cols} K={K}...", end="", flush=True)

                sign_bits, slot_planes, row_scales = encode_proper_unary(tensor, K)
                dt = time.time() - t0

                sign_bits.tofile(os.path.join(output_dir, fname + ".sign"))
                slot_planes.tofile(os.path.join(output_dir, fname + ".slots"))
                row_scales.tofile(os.path.join(output_dir, fname + ".scales"))

                manifest["unary"][key] = [rows, cols]
                sz = sign_bits.nbytes + slot_planes.nbytes + row_scales.nbytes
                total_size += sz
                total_linear += 1

                # ratio vs the original 2-byte (BF16) element size
                ratio = sz / (rows * cols * 2)
                print(f" {sz/1e6:.1f}MB ({ratio:.1f}x) [{dt:.0f}s]", flush=True)

                del sign_bits, slot_planes, row_scales
            else:
                # FP16 passthrough for norms, embeddings, biases, etc.
                t_f16 = tensor.astype(np.float16)
                out_data = t_f16.view(np.uint16)
                out_data.tofile(os.path.join(output_dir, fname + ".fp16"))
                manifest["fp16"].append(key)
                sz = out_data.nbytes
                total_size += sz
                print(f" {key}: {tensor.shape} -> FP16 ({sz/1e6:.1f}MB)", flush=True)
                del t_f16, out_data

            del tensor

        reader.close()
        gc.collect()  # release shard buffers before the next shard

    json.dump(manifest, open(os.path.join(output_dir, "manifest.json"), "w"), indent=2)

    print(f"\n{'='*50}", flush=True)
    print(f"DONE: {total_linear} layers, K={K}", flush=True)
    # "~7.6 GB" is the hard-coded size of the Qwen3-4B BF16 checkpoint.
    print(f"Total: {total_size/1e9:.2f} GB (orig ~7.6 GB, ratio {total_size/7.6e9:.1f}x)", flush=True)
159
+
160
if __name__ == "__main__":
    # CLI: convert_proper_unary.py [model_dir] [output_dir] [K]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "/root/ternary_engine/qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "/root/ternary_engine/qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    convert_model(model_dir, output_dir, K)
convert_proper_unary_v2.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ PROPER UNARY CONVERTER — Global quantum, torch-based, BF16 support
4
+
5
+ Clips at P99.9 of |weights| instead of absmax to avoid wasting
6
+ quantization range on rare outliers. Values above clip point
7
+ saturate at K (still represented, just capped).
8
+
9
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
10
+ """
11
+
12
+ import torch, json, os, sys, gc, shutil
13
+ from safetensors import safe_open
14
+ import numpy as np
15
+
16
def scan_all_linears(model_dir):
    """Scan linear layers, return global stats.

    Walks every shard, collects the names of 2D non-norm/non-embedding
    tensors, tracks the global abs-max and a random 2000-value |weight|
    sample per layer for percentile estimation.

    Returns (global_max, all_abs_samples_tensor, linear_names, shard_list).
    """
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        index = json.load(open(index_path))
        shards = sorted(set(index["weight_map"].values()))
    else:
        shards = ["model.safetensors"]

    samples = []
    linear_names = []
    global_max = 0.0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        print(f" Scanning {shard}...")
        with safe_open(path, framework="pt") as f:
            for name in f.keys():
                t = f.get_tensor(name).float()
                # Only 2D projection matrices; skip norms and embeddings.
                if t.dim() != 2 or "norm" in name or "embed" in name:
                    continue
                linear_names.append(name)
                am = t.abs().max().item()
                global_max = am if am > global_max else global_max
                # Sample 2000 values for distribution
                idx = torch.randint(0, t.numel(), (2000,))
                samples.append(t.flatten()[idx].abs())

    return global_max, torch.cat(samples), linear_names, shards
46
+
47
+
48
def encode_to_proper_unary_torch(weight_f32, quantum, K):
    """
    Encode a [rows, cols] float32 tensor to proper unary.

    Magnitudes are |w| / quantum, rounded and clamped to [0, K]; plane p
    has bit j set when the magnitude of column j exceeds p (thermometer
    code). Sign bits follow the raw float sign.

    Returns:
        sign_packed:  uint64 [rows, chunks]
        slots_packed: uint64 [K, rows, chunks]
        clip_count:   number of pre-clamp values that exceeded K

    Packing is vectorized with np.packbits (little-endian, so bit j of
    chunk c is column c*64 + j) — identical bit layout to the previous
    per-column Python loop, but vastly faster on real layer sizes.
    """
    rows, cols = weight_f32.shape
    chunks = (cols + 63) // 64
    pad = chunks * 64 - cols

    inv_q = 1.0 / quantum
    magnitudes = (weight_f32.abs() * inv_q).round().long().clamp(0, K)
    signs = weight_f32 < 0
    # Count pre-clamp overflows so callers can report saturation.
    clip_count = int((weight_f32.abs() * inv_q > K).sum().item())

    mags_np = magnitudes.numpy()
    signs_np = signs.numpy()

    def _pack(mask):
        # Pack a boolean [rows, cols] mask into little-endian uint64 words.
        if pad:
            mask = np.pad(mask, ((0, 0), (0, pad)))
        bits = np.packbits(mask.astype(np.uint8), axis=1, bitorder='little')
        return np.ascontiguousarray(bits).view(np.uint64)

    sign_packed = _pack(signs_np)

    slots_packed = np.zeros((K, rows, chunks), dtype=np.uint64)
    for p in range(K):
        active = mags_np > p
        if not active.any():
            break  # magnitudes are monotone: higher planes stay zero
        slots_packed[p] = _pack(active)

    print(f" {cols}/{cols} done, {clip_count} clipped")
    return sign_packed, slots_packed, clip_count
87
+
88
+
89
def convert(model_dir, output_dir, K=32, clip_pct=99.9):
    """Convert a model to proper-unary format using one GLOBAL quantum.

    Unlike the per-row variant, the quantization step size is derived from
    the P{clip_pct} percentile of sampled |weights| across ALL linear
    layers; values above the clip point saturate at K. Linear layers are
    written as .usign/.uslots pairs, everything else as raw FP16.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    print(f"Model: {config.get('_name_or_path', config.get('model_type', '?'))}")
    print(f" Layers={config['num_hidden_layers']} Hidden={config['hidden_size']} Inter={config['intermediate_size']}")

    # Scan
    print("\nScanning weights...")
    global_max, all_abs, linear_names, shards = scan_all_linears(model_dir)

    # Pick quantum from clip percentile (percentile of the SAMPLED values,
    # not the full weight population).
    clip_val = torch.quantile(all_abs, clip_pct / 100.0).item()
    quantum = clip_val / K

    print(f"\n Global absmax: {global_max:.6f}")
    print(f" P{clip_pct} clip: {clip_val:.6f}")
    print(f" K = {K}")
    print(f" Quantum = {quantum:.8f}")
    print(f" Values > clip ({clip_pct}%): saturate at K={K}")

    # Distribution with chosen quantum (diagnostics only)
    mags = (all_abs / quantum).round().clamp(0, K)
    print(f"\n Mean magnitude: {mags.mean():.1f} slots")
    print(f" Median: {mags.median():.1f} slots")
    print(f" Zero fraction: {100*(mags==0).float().mean():.1f}%")
    print(f" At K (clipped): {100*(mags==K).float().mean():.1f}%")
    print(f" Unique levels: {len(mags.unique())} / {K+1}")

    # Memory estimate
    # Per linear: sign=rows*chunks*8 bytes, slots=K*rows*chunks*8 bytes
    # Approx: (K+1) bits per element vs 16 bits BF16
    bits_per_elem = K + 1  # K slot bits + 1 sign bit (stored in uint64 chunks)
    ratio = bits_per_elem / 16.0
    print(f"\n Bits per weight: {bits_per_elem}")
    print(f" vs BF16 (16 bit): {ratio:.1f}x")
    print(f" Original: ~7.6 GB → Estimated: ~{7.6 * ratio:.1f} GB")

    # Build weight map (key -> shard filename) when the model is sharded.
    index_path = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_path):
        weight_map = json.load(open(index_path))["weight_map"]
    else:
        weight_map = None

    manifest = {
        "format": "proper_unary",
        "quantum": float(quantum),
        "K": K,
        "clip_pct": clip_pct,
        "clip_val": float(clip_val),
        "global_absmax": float(global_max),
        "unary": {},
        "fp16": [],
    }

    # Group linears by shard
    shard_linears = {}
    for name in linear_names:
        shard = weight_map[name] if weight_map else "model.safetensors"
        shard_linears.setdefault(shard, []).append(name)

    total_unary_bytes = 0
    total_fp16_bytes = 0
    total_clipped = 0
    done = 0

    for shard in shards:
        path = os.path.join(model_dir, shard)
        shard_lins = shard_linears.get(shard, [])
        print(f"\nProcessing {shard} ({len(shard_lins)} linear layers)...")

        with safe_open(path, framework="pt") as f:
            all_keys = list(f.keys())

            # Non-linear weights → FP16
            for name in all_keys:
                if name in linear_names:
                    continue
                fname = name.replace(".", "_") + ".fp16"
                out_path = os.path.join(output_dir, fname)
                # NOTE(review): pre-existing .fp16 files are skipped AND
                # excluded from the totals/manifest below — confirm this
                # resume behavior is intended.
                if not os.path.exists(out_path):
                    t = f.get_tensor(name).half()
                    t.numpy().view(np.uint16).tofile(out_path)
                    sz = os.path.getsize(out_path)
                    total_fp16_bytes += sz
                    manifest["fp16"].append(name)
                    print(f" FP16: {name} {list(t.shape)} ({sz//1024}KB)")

            # Linear weights → proper unary
            for name in shard_lins:
                fname = name.replace(".", "_")
                sign_path = os.path.join(output_dir, f"{fname}.usign")
                slots_path = os.path.join(output_dir, f"{fname}.uslots")

                # Resume support: re-use files from a previous partial run.
                if os.path.exists(sign_path) and os.path.exists(slots_path):
                    t = f.get_tensor(name)
                    manifest["unary"][name] = list(t.shape)
                    total_unary_bytes += os.path.getsize(sign_path) + os.path.getsize(slots_path)
                    done += 1
                    print(f" Skip: {name}")
                    continue

                t = f.get_tensor(name).float()
                rows, cols = t.shape
                print(f" Converting: {name} [{rows}x{cols}]...", flush=True)

                sign_p, slots_p, clip_c = encode_to_proper_unary_torch(t, quantum, K)
                total_clipped += clip_c

                sign_p.tofile(sign_path)
                slots_p.tofile(slots_path)

                s_sz = os.path.getsize(sign_path)
                sl_sz = os.path.getsize(slots_path)
                total_unary_bytes += s_sz + sl_sz

                manifest["unary"][name] = [rows, cols]
                done += 1
                print(f" sign={s_sz//1024}KB slots={sl_sz//1024}KB total={( s_sz+sl_sz)//1024//1024}MB")

                # Release the big buffers before the next layer.
                del t, sign_p, slots_p
                gc.collect()

    # Copy config and tokenizer
    for fname in os.listdir(model_dir):
        if fname.endswith(('.json', '.txt', '.model')) and not fname.startswith('model.safetensors'):
            src = os.path.join(model_dir, fname)
            dst = os.path.join(output_dir, fname)
            if not os.path.exists(dst):
                shutil.copy2(src, dst)

    manifest_path = os.path.join(output_dir, "manifest.json")
    json.dump(manifest, open(manifest_path, "w"), indent=2)

    total = total_unary_bytes + total_fp16_bytes
    print(f"\n{'='*60}")
    print(f"PROPER UNARY CONVERSION COMPLETE")
    print(f"{'='*60}")
    print(f" Quantum: {quantum:.8f}")
    print(f" K: {K}")
    print(f" Clip at P{clip_pct}: {clip_val:.6f}")
    print(f" Linear layers: {done}")
    print(f" Clipped vals: {total_clipped}")
    print(f" Unary: {total_unary_bytes/1e9:.2f} GB")
    print(f" FP16 (norms): {total_fp16_bytes/1e6:.1f} MB")
    print(f" Total: {total/1e9:.2f} GB")
    print(f" Original BF16: ~7.6 GB")
    print(f" Ratio: {total/7.6e9:.1f}x")
    print(f" Output dir: {output_dir}")
239
+
240
+
241
if __name__ == "__main__":
    # CLI: convert_proper_unary_v2.py [model_dir] [output_dir] [K] [clip_pct]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-proper-unary"
    K = int(argv[3]) if len(argv) > 3 else 32
    clip = float(argv[4]) if len(argv) > 4 else 99.9

    convert(model_dir, output_dir, K=K, clip_pct=clip)
convert_qwen3.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unary converter for Qwen3 models.
4
+ Converts safetensors to unary bitplane format.
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import os, sys, json, time
9
+
10
def load_safetensors_torch(model_dir):
    """Load all safetensors shards using torch backend"""
    import torch
    from safetensors import safe_open

    shard_files = sorted(f for f in os.listdir(model_dir) if f.endswith('.safetensors'))
    print(f"Loading {len(shard_files)} shard(s)...")

    weights = {}
    for sf in shard_files:
        print(f" {sf}...")
        with safe_open(os.path.join(model_dir, sf), framework="pt") as f:
            for key in f.keys():
                # BF16/F16 tensors are widened to FP32 numpy arrays.
                weights[key] = f.get_tensor(key).float().numpy()
    return weights
27
+
28
def quantize_unary_vectorized(w_fp32, n_planes):
    """Quantize a weight matrix to unary bitplane format using vectorized numpy"""
    out_dim, in_dim = w_fp32.shape
    max_val = n_planes  # integer levels span [-n_planes, +n_planes]

    # Per-row scaling so the largest |weight| in a row maps to max_val.
    abs_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    abs_max = np.where(abs_max == 0, 1.0, abs_max)
    rounded = np.clip(np.round(w_fp32 / abs_max * max_val), -max_val, max_val).astype(np.int32)
    scales = (abs_max.flatten() / max_val).astype(np.float32)

    signs = rounded < 0  # True = negative
    magnitudes = np.abs(rounded)  # 0 .. n_planes

    chunks = (in_dim + 63) // 64
    padded = chunks * 64
    pad = padded - in_dim
    if pad:
        signs = np.pad(signs, ((0, 0), (0, pad)), constant_values=False)
        magnitudes = np.pad(magnitudes, ((0, 0), (0, pad)), constant_values=0)

    # Little-endian bit packing: bit j of chunk c is input column c*64 + j.
    sign_u64 = np.packbits(signs.astype(np.uint8), axis=1,
                           bitorder='little').view(np.uint64)[:, :chunks]

    # Thermometer planes: plane p has the bit set wherever magnitude > p.
    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        packed = np.packbits((magnitudes > p).astype(np.uint8), axis=1, bitorder='little')
        plane_bits[p] = packed.view(np.uint64)[:, :chunks]

    return sign_u64, plane_bits, scales
67
+
68
def convert_model(model_dir, output_dir, n_planes=7):
    """Convert a Qwen3 model to unary format.

    Loads the whole checkpoint into RAM (see convert_qwen3_v2 for the
    shard-at-a-time variant), quantizes every 2D *proj* weight with
    quantize_unary_vectorized into .sign/.planes/.scales files, and stores
    everything else as FP16. Writes manifest.json and copies config.json.

    Fixes vs the previous revision: removed a dead, expensive per-layer
    `sparsity` computation (its result was never used and its re-scaling
    could emit divide warnings), and linear-key membership now uses a set
    instead of an O(n) list scan.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Load config
    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")

    # Load weights
    weights = load_safetensors_torch(model_dir)
    print(f"Loaded {len(weights)} tensors")

    # Identify linear layers (2D weight matrices in attn/mlp)
    linear_keys = [k for k in weights if k.endswith(".weight") and weights[k].ndim == 2
                   and ("proj" in k)]
    linear_set = set(linear_keys)  # O(1) membership checks below

    manifest = {"unary": {}, "fp16": {}}

    # Convert linear layers to unary
    total = len(linear_keys)
    for idx, key in enumerate(sorted(linear_keys)):
        w = weights[key]
        t0 = time.time()
        sign, planes, scales = quantize_unary_vectorized(w, n_planes)
        dt = time.time() - t0

        # Flatten name for filesystem
        fname = key.replace(".", "_")
        np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
        np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
        np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

        manifest["unary"][key] = list(w.shape)
        orig_mb = w.nbytes / 1e6
        comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
        print(f" [{idx+1}/{total}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

    # Save FP16 weights (norms, embeddings, QK-norms)
    fp16_keys = [k for k in weights if k not in linear_set]
    for key in sorted(fp16_keys):
        w = weights[key]
        fname = key.replace(".", "_")
        w_fp16 = w.astype(np.float16)
        w_fp16.view(np.uint16).tofile(os.path.join(output_dir, f"{fname}.fp16"))
        manifest["fp16"][key] = list(w.shape)
        print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")

    # Save manifest and config
    manifest["n_planes"] = n_planes
    manifest["n_layers"] = n_layers
    manifest["config"] = config
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy config
    import shutil
    shutil.copy(os.path.join(model_dir, "config.json"), os.path.join(output_dir, "config.json"))

    # Size summary
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))
    orig_total = sum(w.nbytes for w in weights.values())

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Original FP32: {orig_total/1e9:.2f} GB")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
    print(f"Compression: {orig_total/(total_unary+total_fp16):.1f}x")
144
+
145
if __name__ == "__main__":
    # CLI: convert_qwen3.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-thinking-unary"
    n_planes = int(argv[3]) if len(argv) > 3 else 7
    convert_model(model_dir, output_dir, n_planes)
convert_qwen3_v2.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Memory-efficient unary converter for Qwen3.
4
+ Processes one safetensors shard at a time to avoid OOM.
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import numpy as np
8
+ import os, sys, json, time, gc
9
+
10
def quantize_unary(w_fp32, n_planes):
    """Quantize weight matrix to unary bitplane format"""
    out_dim, in_dim = w_fp32.shape
    max_val = n_planes

    # Per-row abs-max scaling; zero rows get a dummy scale of 1 to avoid /0.
    abs_max = np.abs(w_fp32).max(axis=1, keepdims=True)
    abs_max = np.where(abs_max == 0, 1.0, abs_max)
    rounded = np.clip(np.round(w_fp32 / abs_max * max_val),
                      -max_val, max_val).astype(np.int32)

    scales = (abs_max.flatten() / max_val).astype(np.float32)
    signs = rounded < 0
    magnitudes = np.abs(rounded)

    chunks = (in_dim + 63) // 64
    padded = chunks * 64
    extra = padded - in_dim
    if extra:
        signs = np.pad(signs, ((0, 0), (0, extra)), constant_values=False)
        magnitudes = np.pad(magnitudes, ((0, 0), (0, extra)), constant_values=0)

    # Little-endian bit order: bit j of chunk c maps to column c*64 + j.
    sign_u64 = np.packbits(signs.astype(np.uint8), axis=1,
                           bitorder='little').view(np.uint64)[:, :chunks]

    plane_bits = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        packed = np.packbits((magnitudes > p).astype(np.uint8),
                             axis=1, bitorder='little')
        plane_bits[p] = packed.view(np.uint64)[:, :chunks]

    return sign_u64, plane_bits, scales
41
+
42
def convert_model(model_dir, output_dir, n_planes=7):
    """Shard-at-a-time unary conversion (memory-efficient variant).

    Iterates safetensors shards one by one, quantizing *proj* weights to
    unary bitplanes and storing everything else as FP16. Already-converted
    files on disk are skipped, so interrupted runs can resume.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = json.load(open(os.path.join(model_dir, "config.json")))
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]

    # Load index to know which keys are in which shard
    index_file = os.path.join(model_dir, "model.safetensors.index.json")
    if os.path.exists(index_file):
        index = json.load(open(index_file))
        weight_map = index["weight_map"]
        shards = sorted(set(weight_map.values()))
    else:
        # Single shard
        shards = [f for f in os.listdir(model_dir) if f.endswith('.safetensors')]
        weight_map = None

    print(f"Model: {n_layers} layers, hidden={hidden}, n_planes={n_planes}")
    print(f"Shards: {len(shards)}")

    manifest = {"unary": {}, "fp16": {}, "n_planes": n_planes, "n_layers": n_layers, "config": config}
    total_converted = 0
    total_linear = 0

    # Count total linear layers (only possible when an index file exists;
    # otherwise the progress denominator stays 0)
    if weight_map:
        total_linear = sum(1 for k in weight_map if k.endswith(".weight") and "proj" in k)
    print(f"Total linear layers to convert: {total_linear}")

    import torch
    from safetensors import safe_open

    for shard_idx, shard in enumerate(shards):
        shard_path = os.path.join(model_dir, shard)
        print(f"\n=== Shard {shard_idx+1}/{len(shards)}: {shard} ===")

        with safe_open(shard_path, framework="pt") as f:
            keys = list(f.keys())
            print(f" {len(keys)} tensors in shard")

            for key in sorted(keys):
                fname = key.replace(".", "_")
                # NOTE(review): get_tensor() here materializes the full
                # tensor just to check its rank — a slice API would avoid
                # the extra load.
                is_linear = key.endswith(".weight") and "proj" in key and f.get_tensor(key).dim() == 2

                if is_linear:
                    # Check if already converted (resume support)
                    sign_path = os.path.join(output_dir, f"{fname}.sign")
                    if os.path.exists(sign_path):
                        w = f.get_tensor(key)
                        manifest["unary"][key] = list(w.shape)
                        total_converted += 1
                        print(f" [SKIP] {key} already converted")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    t0 = time.time()
                    sign, planes, scales = quantize_unary(w, n_planes)
                    dt = time.time() - t0

                    np.array(sign).tofile(os.path.join(output_dir, f"{fname}.sign"))
                    np.array(planes).tofile(os.path.join(output_dir, f"{fname}.planes"))
                    np.array(scales).tofile(os.path.join(output_dir, f"{fname}.scales"))

                    orig_mb = w.nbytes / 1e6
                    comp_mb = (sign.nbytes + planes.nbytes + scales.nbytes) / 1e6
                    total_converted += 1
                    manifest["unary"][key] = list(w.shape)
                    print(f" [{total_converted}/{total_linear}] {key}: {list(w.shape)} -> {comp_mb:.1f}MB ({orig_mb/comp_mb:.1f}x) [{dt:.1f}s]")

                    del w, sign, planes, scales
                else:
                    # FP16 weight (norms, embeddings, etc)
                    fp16_path = os.path.join(output_dir, f"{fname}.fp16")
                    if os.path.exists(fp16_path):
                        w = f.get_tensor(key)
                        manifest["fp16"][key] = list(w.shape)
                        print(f" [SKIP] {key} already saved")
                        continue

                    w = f.get_tensor(key).float().numpy()
                    w_fp16 = w.astype(np.float16)
                    w_fp16.view(np.uint16).tofile(fp16_path)
                    manifest["fp16"][key] = list(w.shape)
                    print(f" [FP16] {key}: {list(w.shape)} ({w_fp16.nbytes/1e6:.1f}MB)")
                    del w, w_fp16

        # Force GC between shards
        gc.collect()
        print(f" Shard done, memory freed")

    # Save manifest
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    # Copy config
    import shutil
    for cf in ["config.json", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json"]:
        src = os.path.join(model_dir, cf)
        if os.path.exists(src):
            shutil.copy(src, os.path.join(output_dir, cf))

    # Summary
    total_unary = sum(os.path.getsize(os.path.join(output_dir, f))
                      for f in os.listdir(output_dir)
                      if f.endswith((".sign", ".planes", ".scales")))
    total_fp16 = sum(os.path.getsize(os.path.join(output_dir, f))
                     for f in os.listdir(output_dir)
                     if f.endswith(".fp16"))

    print(f"\n=== CONVERSION COMPLETE ===")
    print(f"Unary linear: {total_unary/1e9:.2f} GB")
    print(f"FP16 other: {total_fp16/1e9:.2f} GB")
    print(f"Total: {(total_unary+total_fp16)/1e9:.2f} GB")
156
+
157
if __name__ == "__main__":
    # CLI: convert_qwen3_v2.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    model_dir = argv[1] if len(argv) > 1 else "qwen3-4b-thinking-hf"
    output_dir = argv[2] if len(argv) > 2 else "qwen3-4b-thinking-unary"
    n_planes = int(argv[3]) if len(argv) > 3 else 7
    convert_model(model_dir, output_dir, n_planes)
deepseek-r1-1.5b-ternary/model_layers_10_mlp_up_proj_weight.scales ADDED
Binary file (35.8 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_10_self_attn_q_proj_bias.fp16 ADDED
Binary file (3.07 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_14_self_attn_v_proj_weight.scales ADDED
Binary file (1.02 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_25_self_attn_v_proj_weight.neg ADDED
Binary file (49.2 kB). View file
 
deepseek-r1-1.5b-ternary/model_layers_27_self_attn_v_proj_weight.scales ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ E�=�r�=ě�=�6�=e��=)�= ڍ=Q�=룋=�d�=n��=L��=q2�=
2
+ �=D�=��=�=S�=���=�A�=�=\%�=�F�= J�=��=6v�=bՃ=a
3
+ �=���=ƞn=�P�=~j�=��=�*�=ő=�(�=���=J6�=N+�=Cs�=�S�=Uu�=Vԋ=�і=� �=�ɓ=Zړ=^;�=�1�=s�=޷�=Uj�= �v=ڱ�=���=b�=c@�=W��=C)�=�܋=�,�=ᵓ=Z~�=�`�=��=i��=0W�=�.�=���=�َ=S��=�H�=0�=���=�0�=6'�=��=�<�=沕=��=���=�Љ=ޏ=H5�=f�~=z;�=�u�=���=�"�=��=�i�=+~�<�-�=�g�=�)�=�u�=��=��=ܨ�=�k�=~[�=�=�l�=���=G(�=��=�/�=��=6Y�=1 �=�= Ջ=2�=��=蓏=)G�=.l�=N�=:��=.��=���=���=WA�=�>�=ѡ�=�ӏ=��=U�=㟙=q:�=F�=T��=���=o�y=�Ѝ=+
deepseek-r1-1.5b-ternary/model_layers_5_self_attn_v_proj_weight.pos ADDED
Binary file (49.2 kB). View file
 
inference.py ADDED
@@ -0,0 +1,503 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Ternary Transformer Inference Engine
4
+
5
+ Full Qwen2 architecture inference using ternary (1.58-bit) linear layers
6
+ with AVX-512 optimized kernels. Zero multiplications in linear layers.
7
+
8
+ Architecture: DeepSeek-R1-Distill-Qwen-1.5B
9
+ - 28 layers, hidden=1536, intermediate=8960
10
+ - GQA: 12 heads, 2 KV heads, head_dim=128
11
+ - SwiGLU MLP, RoPE, RMSNorm
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import ctypes
19
+ import numpy as np
20
+ from pathlib import Path
21
+ import time
22
+
23
+ # ============================================================
24
+ # Load C kernel
25
+ # ============================================================
26
def load_kernel(so_path="ternary_kernel.so"):
    """Load the compiled AVX-512 kernel shared library and declare the
    ctypes signatures for every exported function.

    Args:
        so_path: path to the compiled shared object.

    Returns:
        The ctypes.CDLL handle with restype/argtypes configured for
        ternary_matvec_avx512, rmsnorm_avx512, silu_avx512,
        elemwise_mul_avx512, softmax and apply_rope.
    """
    lib = ctypes.CDLL(so_path)

    # ternary_matvec_avx512: y = W @ x with bit-packed ternary weights
    lib.ternary_matvec_avx512.restype = None
    lib.ternary_matvec_avx512.argtypes = [
        ctypes.c_void_p,  # pos_bits
        ctypes.c_void_p,  # neg_bits
        ctypes.c_void_p,  # scales
        ctypes.c_void_p,  # x
        ctypes.c_void_p,  # y
        ctypes.c_int,     # out_dim
        ctypes.c_int,     # in_dim
    ]

    # rmsnorm
    lib.rmsnorm_avx512.restype = None
    lib.rmsnorm_avx512.argtypes = [
        ctypes.c_void_p,  # x
        ctypes.c_void_p,  # weight
        ctypes.c_void_p,  # y
        ctypes.c_int,     # dim
        ctypes.c_float,   # eps
    ]

    # silu (in-place over a float32 buffer)
    lib.silu_avx512.restype = None
    lib.silu_avx512.argtypes = [ctypes.c_void_p, ctypes.c_int]

    # elemwise_mul
    lib.elemwise_mul_avx512.restype = None
    lib.elemwise_mul_avx512.argtypes = [
        ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int
    ]

    # softmax (in-place)
    lib.softmax.restype = None
    lib.softmax.argtypes = [ctypes.c_void_p, ctypes.c_int]

    # rope
    lib.apply_rope.restype = None
    lib.apply_rope.argtypes = [
        ctypes.c_void_p, ctypes.c_void_p,
        ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_int, ctypes.c_float
    ]

    return lib
74
+
75
+ # ============================================================
76
+ # Ternary Linear Layer
77
+ # ============================================================
78
class TernaryLinear:
    """A ternary (1.58-bit) linear layer backed by the C AVX-512 kernel.

    Weights live in two packed uint64 bitplanes (a +1 mask and a -1 mask,
    64 input columns per word) plus one float32 scale per output row.
    """

    def __init__(self, pos_bits, neg_bits, scales, out_dim, in_dim, kernel):
        self.pos = pos_bits      # uint64 bitplane of +1 weights, contiguous
        self.neg = neg_bits      # uint64 bitplane of -1 weights, contiguous
        self.scales = scales     # float32[out_dim] per-row scales
        self.out_dim = out_dim
        self.in_dim = in_dim
        self.kernel = kernel     # ctypes handle to the compiled kernel

    def forward(self, x):
        """Compute y = W @ x. x: float32[in_dim] -> float32[out_dim]."""
        result = np.zeros(self.out_dim, dtype=np.float32)
        # The C kernel writes the output in place through raw pointers.
        self.kernel.ternary_matvec_avx512(
            self.pos.ctypes.data,
            self.neg.ctypes.data,
            self.scales.ctypes.data,
            x.ctypes.data,
            result.ctypes.data,
            self.out_dim,
            self.in_dim,
        )
        return result
100
+
101
+ # ============================================================
102
+ # KV Cache
103
+ # ============================================================
104
class KVCache:
    """Pre-allocated per-layer key/value cache for autoregressive decoding.

    `seq_len` counts *committed* positions; `append` writes at the current
    (uncommitted) position and `advance` commits it.
    """

    def __init__(self, n_layers, n_kv_heads, head_dim, max_seq=4096):
        self.n_layers = n_layers
        self.max_seq = max_seq
        shape = (max_seq, n_kv_heads, head_dim)
        # One full-size buffer per layer, allocated up front.
        self.k = [np.zeros(shape, dtype=np.float32) for _ in range(n_layers)]
        self.v = [np.zeros(shape, dtype=np.float32) for _ in range(n_layers)]
        self.seq_len = 0

    def append(self, layer, k, v):
        """Store k, v ([n_kv_heads, head_dim]) at the current position."""
        slot = self.seq_len
        self.k[layer][slot] = k
        self.v[layer][slot] = v

    def get(self, layer):
        """Return K, V history including the just-appended position.

        Shapes: [seq_len + 1, n_kv_heads, head_dim].
        """
        end = self.seq_len + 1
        return self.k[layer][:end], self.v[layer][:end]

    def advance(self):
        """Commit the current position."""
        self.seq_len += 1
125
+
126
+ # ============================================================
127
+ # Model
128
+ # ============================================================
129
+ class TernaryQwen:
130
    def __init__(self, model_dir, kernel):
        """Load config.json + manifest.json from `model_dir`, then all weights.

        model_dir: directory produced by the conversion script (contains
                   config.json, manifest.json and the .pos/.neg/.scales/.fp16
                   tensor files).
        kernel:    ctypes handle returned by load_kernel().
        """
        self.kernel = kernel
        self.model_dir = model_dir

        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)
        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)

        # Architecture hyper-parameters (comments show the 1.5B defaults).
        self.hidden = self.config["hidden_size"]  # 1536
        self.inter = self.config["intermediate_size"]  # 8960
        self.n_heads = self.config["num_attention_heads"]  # 12
        self.n_kv = self.config["num_key_value_heads"]  # 2
        self.head_dim = self.config["head_dim"]  # 128
        self.n_layers = self.config["num_hidden_layers"]  # 28
        self.vocab = self.config["vocab_size"]  # 151936
        self.rope_theta = self.config["rope_theta"]
        self.eps = self.config["rms_norm_eps"]

        print(f"Loading ternary model: {self.n_layers} layers, "
              f"hidden={self.hidden}, heads={self.n_heads}/{self.n_kv}")

        t0 = time.time()
        self._load_weights()
        print(f"Model loaded in {time.time()-t0:.1f}s")

        # Print a memory breakdown once everything is resident.
        self._compute_memory()
157
+
158
    def _load_ternary(self, key):
        """Load a ternary linear layer.

        Reads <model_dir>/<key with '.'->'_'>.{pos,neg,scales}; the shape
        [out_dim, in_dim] comes from manifest["ternary"][key]. pos/neg are
        uint64 bitplanes packing 64 input columns per word; scales is one
        float32 per output row.
        """
        prefix = os.path.join(self.model_dir, key.replace(".", "_"))
        shape = self.manifest["ternary"][key]
        out_dim, in_dim = shape
        chunks = (in_dim + 63) // 64  # 64 input columns per uint64 word

        pos = np.fromfile(prefix + ".pos", dtype=np.uint64).reshape(out_dim, chunks)
        neg = np.fromfile(prefix + ".neg", dtype=np.uint64).reshape(out_dim, chunks)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)

        # Make contiguous — the C kernel walks these with raw pointers.
        pos = np.ascontiguousarray(pos)
        neg = np.ascontiguousarray(neg)

        return TernaryLinear(pos, neg, scales, out_dim, in_dim, self.kernel)
174
+
175
+ def _load_fp16(self, key):
176
+ """Load an FP16 tensor."""
177
+ prefix = os.path.join(self.model_dir, key.replace(".", "_"))
178
+ shape = self.manifest["fp16"][key]
179
+ return np.fromfile(prefix + ".fp16", dtype=np.float16).reshape(shape).astype(np.float32)
180
+
181
    def _load_weights(self):
        """Load all weights: embedding, final norm, lm_head, and per-layer tensors.

        Linear projections are ternary; norms/biases/embeddings stay FP16 on
        disk and are widened to FP32 here.
        """
        # Embedding (FP16)
        self.embed = self._load_fp16("model.embed_tokens.weight")  # [vocab, hidden]

        # Final norm
        self.final_norm = self._load_fp16("model.norm.weight")  # [hidden]

        # LM head — check if it exists as ternary or fp16
        if "lm_head.weight" in self.manifest.get("ternary", {}):
            self.lm_head = self._load_ternary("lm_head.weight")
            self.lm_head_ternary = True
        elif "lm_head.weight" in self.manifest.get("fp16", {}):
            self.lm_head_w = self._load_fp16("lm_head.weight")
            self.lm_head_ternary = False
        else:
            # Tied embeddings: reuse the embedding matrix as the output head.
            self.lm_head_w = self.embed
            self.lm_head_ternary = False

        # Per-layer weights, stored as dicts keyed by role.
        self.layers = []
        for i in range(self.n_layers):
            layer = {}
            prefix = f"model.layers.{i}"

            # Attention
            layer["q_proj"] = self._load_ternary(f"{prefix}.self_attn.q_proj.weight")
            layer["k_proj"] = self._load_ternary(f"{prefix}.self_attn.k_proj.weight")
            layer["v_proj"] = self._load_ternary(f"{prefix}.self_attn.v_proj.weight")
            layer["o_proj"] = self._load_ternary(f"{prefix}.self_attn.o_proj.weight")

            # MLP
            layer["gate_proj"] = self._load_ternary(f"{prefix}.mlp.gate_proj.weight")
            layer["up_proj"] = self._load_ternary(f"{prefix}.mlp.up_proj.weight")
            layer["down_proj"] = self._load_ternary(f"{prefix}.mlp.down_proj.weight")

            # Norms (FP16 -> FP32)
            layer["input_norm"] = self._load_fp16(f"{prefix}.input_layernorm.weight")
            layer["post_norm"] = self._load_fp16(f"{prefix}.post_attention_layernorm.weight")

            # Load biases if they exist (Qwen2 uses QKV biases).
            for proj in ["q_proj", "k_proj", "v_proj"]:
                bias_key = f"{prefix}.self_attn.{proj}.bias"
                if bias_key in self.manifest.get("fp16", {}):
                    layer[f"{proj}_bias"] = self._load_fp16(bias_key)

            self.layers.append(layer)
            # Progress report every 7 layers (28 layers -> 4 lines).
            if (i + 1) % 7 == 0:
                print(f" Loaded {i+1}/{self.n_layers} layers")

        print(f" Loaded {self.n_layers}/{self.n_layers} layers")
233
+
234
    def _compute_memory(self):
        """Report memory usage, split into ternary bitplanes vs float tensors."""
        ternary_bytes = 0
        fp_bytes = 0

        for layer in self.layers:
            # Ternary layers: two uint64 bitplanes + one scale per row.
            for key in ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"]:
                tl = layer[key]
                ternary_bytes += tl.pos.nbytes + tl.neg.nbytes + tl.scales.nbytes
            for key in ["input_norm", "post_norm"]:
                fp_bytes += layer[key].nbytes

        fp_bytes += self.embed.nbytes + self.final_norm.nbytes
        # Tied-embedding case double-counts self.embed here (lm_head_w IS
        # embed); harmless for a rough report but worth knowing.
        if not self.lm_head_ternary:
            fp_bytes += self.lm_head_w.nbytes if hasattr(self, 'lm_head_w') else 0

        total = ternary_bytes + fp_bytes
        print(f"\nMemory: ternary={ternary_bytes/1024/1024:.1f}MB, "
              f"fp={fp_bytes/1024/1024:.1f}MB, total={total/1024/1024:.1f}MB")
254
+
255
+ def _rmsnorm(self, x, weight):
256
+ """RMSNorm using C kernel."""
257
+ y = np.zeros_like(x)
258
+ self.kernel.rmsnorm_avx512(
259
+ x.ctypes.data, weight.ctypes.data, y.ctypes.data,
260
+ len(x), ctypes.c_float(self.eps)
261
+ )
262
+ return y
263
+
264
    def _attention(self, x, layer, cache, layer_idx, pos):
        """Grouped-Query Attention for a single token.

        x:     normed hidden state, float32[hidden]
        pos:   absolute position of this token (used by RoPE)
        Returns the o_proj output, float32[hidden].
        """
        h = self.hidden  # NOTE(review): unused local, kept for readability
        n_h = self.n_heads
        n_kv = self.n_kv
        hd = self.head_dim

        # Project Q, K, V through the ternary layers.
        q = layer["q_proj"].forward(x)  # [n_heads * head_dim]
        k = layer["k_proj"].forward(x)  # [n_kv * head_dim]
        v = layer["v_proj"].forward(x)  # [n_kv * head_dim]

        # Add biases if present (Qwen2 has QKV biases).
        if "q_proj_bias" in layer:
            q += layer["q_proj_bias"]
        if "k_proj_bias" in layer:
            k += layer["k_proj_bias"]
        if "v_proj_bias" in layer:
            v += layer["v_proj_bias"]

        # Reshape to [heads, head_dim]
        q = q.reshape(n_h, hd)
        k = k.reshape(n_kv, hd)
        v = v.reshape(n_kv, hd)

        # RoPE — the C kernel rotates q and k in place via raw pointers.
        self.kernel.apply_rope(
            q.ctypes.data, k.ctypes.data,
            n_h, n_kv, hd, pos,
            ctypes.c_float(self.rope_theta)
        )

        # Update KV cache (written at the current uncommitted slot).
        cache.append(layer_idx, k, v)

        # Get full K, V history
        k_all, v_all = cache.get(layer_idx)  # [seq_len, n_kv, head_dim]
        seq_len = k_all.shape[0]  # NOTE(review): unused below

        # GQA: each group of Q heads shares one KV head.
        heads_per_kv = n_h // n_kv

        # Compute attention for each head
        output = np.zeros(n_h * hd, dtype=np.float32)
        scale = 1.0 / np.sqrt(hd)

        for head in range(n_h):
            kv_head = head // heads_per_kv
            q_h = q[head]  # [head_dim]

            # Attention scores: q @ K^T
            scores = np.dot(k_all[:, kv_head, :], q_h) * scale  # [seq_len]

            # Causal mask (all visible for single token generation)
            # Numerically stable softmax over the history.
            scores_max = np.max(scores)
            scores = np.exp(scores - scores_max)
            scores /= np.sum(scores)

            # Weighted sum of values
            out_h = np.dot(scores, v_all[:, kv_head, :])  # [head_dim]
            output[head * hd:(head + 1) * hd] = out_h

        # Output projection
        return layer["o_proj"].forward(output)
329
+
330
    def _mlp(self, x, layer):
        """SwiGLU MLP: down_proj( SiLU(gate_proj(x)) * up_proj(x) ).

        x: normed hidden state, float32[hidden]; returns float32[hidden].
        """
        gate = layer["gate_proj"].forward(x)
        up = layer["up_proj"].forward(x)

        # SiLU on gate — kernel mutates `gate` in place.
        self.kernel.silu_avx512(gate.ctypes.data, len(gate))

        # gate * up — output pointer aliases `gate`, so the product lands
        # back in `gate` (intentional reuse, saves an allocation).
        self.kernel.elemwise_mul_avx512(
            gate.ctypes.data, up.ctypes.data, gate.ctypes.data, len(gate)
        )

        # Down projection back to hidden size.
        return layer["down_proj"].forward(gate)
345
+
346
+ def forward_token(self, token_id, cache, pos):
347
+ """Forward pass for a single token."""
348
+ # Embedding lookup
349
+ x = self.embed[token_id].copy() # [hidden]
350
+
351
+ # Transformer layers
352
+ for i, layer in enumerate(self.layers):
353
+ # Pre-attention norm
354
+ normed = self._rmsnorm(x, layer["input_norm"])
355
+
356
+ # Self-attention + residual
357
+ attn_out = self._attention(normed, layer, cache, i, pos)
358
+ x = x + attn_out
359
+
360
+ # Pre-MLP norm
361
+ normed = self._rmsnorm(x, layer["post_norm"])
362
+
363
+ # MLP + residual
364
+ mlp_out = self._mlp(normed, layer)
365
+ x = x + mlp_out
366
+
367
+ # Final norm
368
+ x = self._rmsnorm(x, self.final_norm)
369
+
370
+ return x
371
+
372
+ def logits(self, hidden):
373
+ """Compute logits from hidden state."""
374
+ if self.lm_head_ternary:
375
+ return self.lm_head.forward(hidden)
376
+ else:
377
+ return hidden @ self.lm_head_w.T
378
+
379
    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95):
        """Generate tokens autoregressively.

        token_ids:       prompt token ids (NOTE(review): an empty prompt
                         raises NameError below — `hidden` is only bound
                         inside the prefill loop).
        temperature:     < 0.01 means greedy argmax; otherwise top-p sampling.
        Returns (generated_token_ids, stats_dict).
        """
        cache = KVCache(self.n_layers, self.n_kv, self.head_dim)

        generated = []
        all_tokens = list(token_ids)

        t_start = time.time()

        # Prefill: process all input tokens sequentially. The last prompt
        # position is left uncommitted so decode step 0 sees it via get().
        for i, tid in enumerate(token_ids):
            hidden = self.forward_token(tid, cache, i)
            if i < len(token_ids) - 1:
                cache.advance()

        t_prefill = time.time() - t_start

        # Decode loop: sample, then feed the sampled token back in.
        t_decode_start = time.time()
        for step in range(max_new_tokens):
            # Get logits
            logit_vec = self.logits(hidden)

            # Sample
            if temperature < 0.01:
                next_token = int(np.argmax(logit_vec))
            else:
                logit_vec = logit_vec / temperature
                # Top-p (nucleus) sampling over the sorted distribution.
                sorted_idx = np.argsort(logit_vec)[::-1]
                sorted_logits = logit_vec[sorted_idx]

                # Numerically stable softmax.
                max_l = sorted_logits[0]
                probs = np.exp(sorted_logits - max_l)
                probs /= probs.sum()

                # Smallest prefix whose cumulative mass reaches top_p.
                cumsum = np.cumsum(probs)
                cutoff = np.searchsorted(cumsum, top_p) + 1

                top_probs = probs[:cutoff]
                top_probs /= top_probs.sum()
                top_idx = sorted_idx[:cutoff]

                next_token = int(np.random.choice(top_idx, p=top_probs))

            generated.append(next_token)
            all_tokens.append(next_token)

            # Check stop tokens (the EOS token is included in `generated`).
            if next_token in [151643, 151644, 151645]:  # Qwen EOS tokens
                break

            # Commit the previous position, then run the new token.
            cache.advance()
            hidden = self.forward_token(next_token, cache, len(all_tokens) - 1)

        t_total = time.time() - t_start
        t_decode = time.time() - t_decode_start
        n_gen = len(generated)

        stats = {
            "prefill_ms": t_prefill * 1000,
            "decode_ms": t_decode * 1000,
            "total_ms": t_total * 1000,
            "tokens_generated": n_gen,
            "tok_per_sec": n_gen / t_decode if t_decode > 0 else 0,
            "prefill_tokens": len(token_ids),
        }

        return generated, stats
449
+
450
+ # ============================================================
451
+ # Tokenizer wrapper
452
+ # ============================================================
453
class Tokenizer:
    """Thin wrapper over either a fast `tokenizers` tokenizer (tokenizer.json
    on disk) or a `transformers` AutoTokenizer fallback loaded from the hub.
    """

    def __init__(self, model_dir):
        from tokenizers import Tokenizer as HFTokenizer
        tok_path = os.path.join(model_dir, "tokenizer.json")
        if not os.path.exists(tok_path):
            # No local tokenizer.json — fall back to transformers.
            from transformers import AutoTokenizer
            self.tok = AutoTokenizer.from_pretrained(model_dir)
            self._is_transformers = True
            return
        self.tok = HFTokenizer.from_file(tok_path)
        self._is_transformers = False

    def encode(self, text):
        """Return a list of token ids for `text`."""
        encoded = self.tok.encode(text)
        # Fast tokenizers return an Encoding object; transformers a list.
        return encoded if self._is_transformers else encoded.ids

    def decode(self, ids):
        """Return the text for a list of token ids."""
        if self._is_transformers:
            return self.tok.decode(ids, skip_special_tokens=True)
        return self.tok.decode(ids)

    def apply_chat_template(self, messages):
        """Build the Qwen ChatML prompt string from role/content dicts."""
        rendered = [
            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>"
            for msg in messages
        ]
        rendered.append("<|im_start|>assistant\n")
        return "".join(rendered)
486
+
487
if __name__ == "__main__":
    import sys

    # Usage: python inference.py [model_dir]
    model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-ternary"
    # Kernel .so is expected next to this script.
    kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

    model = TernaryQwen(model_dir, kernel)

    # Quick smoke test: a hand-built ChatML prompt ("Hello" as user turn).
    test_ids = [151644, 8948, 198, 151645, 198, 151644, 872, 198, 9707, 151645, 198, 151644, 77091, 198]

    print("\nGenerating...")
    tokens, stats = model.generate(test_ids, max_new_tokens=50, temperature=0.6)
    print(f"Generated {stats['tokens_generated']} tokens")
    print(f"Speed: {stats['tok_per_sec']:.1f} tok/s")
    print(f"Prefill: {stats['prefill_ms']:.0f}ms, Decode: {stats['decode_ms']:.0f}ms")
    print(f"Token IDs: {tokens}")
log_unary_engine.c ADDED
@@ -0,0 +1,598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * LOG-UNARY TRANSFORMER ENGINE
3
+ *
4
+ * Unary base-1 with logarithmic compression:
5
+ * Linear unary: value 7 = 1111111 (7 planes, each = +1)
6
+ * Log unary: value 8 = 111 (3 planes, plane p = 2^p)
7
+ *
8
+ * Matmul kernel: acc += popcount(w_plane[p] AND x_plane[q]) << (p+q)
9
+ * Still pure AND+popcount+shift, no float in hot path.
10
+ *
11
+ * 3 log-planes = values {0,1,2,4} with sign = {-4..+4} = 9 levels
12
+ * 4 log-planes = values {0,1,2,4,8} with sign = {-8..+8} = 17 levels
13
+ * 5 log-planes = values {0,1,2,4,8,16} with sign = {-16..+16} = 33 levels
14
+ *
15
+ * vs linear 7 planes = {-7..+7} = 15 levels using 7 planes
16
+ *
17
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+ */
19
+
20
+ #include <immintrin.h>
21
+ #include <omp.h>
22
+ #include <stdint.h>
23
+ #include <stdlib.h>
24
+ #include <string.h>
25
+ #include <math.h>
26
+ #include <stdio.h>
27
+ #include <time.h>
28
+
29
+ #define MAX_SEQ 4096
30
+ #define RMS_EPS 1e-6f
31
+
32
/* ============================================================
 * Config
 * ============================================================ */
typedef struct {
    int hidden;          /* model (residual) width */
    int inter;           /* MLP intermediate width */
    int n_heads;         /* query heads */
    int n_kv_heads;      /* key/value heads (GQA) */
    int head_dim;
    int n_layers;
    int vocab;
    float rope_theta;    /* RoPE base frequency */
    int tie_embeddings;  /* nonzero: embedding matrix doubles as lm_head */
    int w_planes; /* weight log-planes */
    int a_planes; /* activation log-planes */
} Config;

/* Log-unary weight matrix: sign bitplane + n_planes magnitude bitplanes,
 * where plane p contributes 2^p, plus one float scale per output row. */
typedef struct {
    uint64_t *sign_bits; /* [out_dim * chunks] */
    uint64_t *log_planes; /* [n_planes][out_dim * chunks] - plane p = 2^p */
    float *scales; /* [out_dim] */
    int out_dim;
    int in_dim;
    int n_planes;
    int chunks;          /* (in_dim + 63) / 64 */
} LogUnaryWeight;

/* Transformer layer */
typedef struct {
    LogUnaryWeight q_proj, k_proj, v_proj, o_proj;
    LogUnaryWeight gate_proj, up_proj, down_proj;
    float *input_norm;   /* pre-attention RMSNorm weights */
    float *post_norm;    /* pre-MLP RMSNorm weights */
    float *q_norm, *k_norm; /* optional QK-norm weights (may be NULL) */
} Layer;

/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed;     /* FP16 embedding table [vocab * hidden] */
    Layer *layers;
    float *final_norm;

    /* KV cache — laid out [layer][pos][kv_head][head_dim], see kv_ptr() */
    float *k_cache;
    float *v_cache;

    /* Float scratch (O(dim) ops only) */
    float *hidden;
    float *normed;
    float *q_float;
    float *k_float;
    float *v_float;
    float *attn_out;
    float *gate_float;
    float *up_float;
    float *mlp_act;
    float *logits;
    float *attn_scores;

    /* Unary scratch for activation quantization (hidden-sized) */
    uint64_t *act_sign;
    uint64_t *act_planes;

    /* Larger scratch for intermediate dim */
    uint64_t *mlp_act_sign;
    uint64_t *mlp_act_planes;
} Model;
101
+
102
/* ============================================================
 * LOG-UNARY ACTIVATION QUANTIZATION
 *
 * Encode float value as sign + log-magnitude planes
 * Plane p is set if |x| >= threshold_p
 * threshold_p = scale * 2^p / max_level
 *
 * Effectively: compute integer magnitude = round(|x|/scale * max_level)
 * Then decompose into binary: if bit p is set in magnitude, plane p is set
 *
 * Wait — that's just BINARY encoding of the magnitude!
 * Log-unary IS binary representation stored as separate bitplanes.
 * The magic is that AND+popcount+shift MULTIPLIES them.
 * ============================================================ */
static void quantize_log_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    /* sign_out: 1 bit per element (set = negative).
     * planes_out: n_planes consecutive bitplanes of `chunks` words each.
     * scale_out: per-tensor absmax scale so dequant = mag * scale. */
    int chunks = (dim + 63) / 64;
    int max_level = (1 << n_planes) - 1; /* 2^n - 1 */

    /* Find absmax */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) amax = 1.0f; /* avoid div-by-zero on all-zero input */
    *scale_out = amax / max_level;

    memset(sign_out, 0, chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

    float inv_scale = max_level / amax;
    for (int i = 0; i < dim; i++) {
        int chunk = i / 64;
        int bit = i % 64;
        uint64_t mask = 1ULL << bit;

        if (x[i] < 0.0f)
            sign_out[chunk] |= mask;

        /* Round-to-nearest magnitude, clamped to max_level. */
        int mag = (int)(fabsf(x[i]) * inv_scale + 0.5f);
        if (mag > max_level) mag = max_level;

        /* Binary decomposition: plane p gets bit p of magnitude */
        for (int p = 0; p < n_planes; p++) {
            if (mag & (1 << p))
                planes_out[(size_t)p * chunks + chunk] |= mask;
        }
    }
}
154
+
155
/* ============================================================
 * LOG-UNARY MATVEC: y = W @ x
 *
 * W: log-unary (sign + wp log-planes, scales)
 * x: log-unary (sign + xp log-planes, scale)
 *
 * For each output element i:
 *   acc = 0
 *   for each chunk c:
 *     same = ~(w_sign[c] ^ x_sign[c])
 *     diff = w_sign[c] ^ x_sign[c]
 *     for p in 0..wp-1:
 *       for q in 0..xp-1:
 *         active = w_plane[p][c] & x_plane[q][c]
 *         pos = popcount(active & same)
 *         neg = popcount(active & diff)
 *         acc += (pos - neg) << (p + q)   <-- THE KEY: shift by p+q
 *   y[i] = acc * w_scale[i] * x_scale
 * ============================================================ */
static void log_unary_matvec(
    const LogUnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

    /* Rows are independent — parallelize over outputs. */
    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;
        long long acc = 0; /* 64-bit: popcount<<shift sums can exceed int */

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            /* same: product positive; diff: product negative. */
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                /* Plane p of row i; layout is [plane][row][chunk]. */
                uint64_t w_mag = W->log_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;
                    if (!active) continue; /* skip zero — common with log encoding */

                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    int shift = p + q; /* 2^p * 2^q contribution */
                    acc += (long long)(__builtin_popcountll(pos) -
                                       __builtin_popcountll(neg)) << shift;
                }
            }
        }

        /* Dequantize: combine row scale with activation scale. */
        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
216
+
217
/* ============================================================
 * FP16 ops (embedding, lm_head) — not in the critical per-layer path
 * ============================================================ */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    /* Copy one FP16 embedding row into `out` as float32. */
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    /* Main loop: convert 16 half floats per iteration (AVX-512 + F16C). */
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    /* Scalar tail: broadcast one half, convert, store a single lane. */
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
234
+
235
/* y = W @ x with FP16 weights and float32 activations (used for lm_head). */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        /* 16-wide FMA over half-precision weights widened on the fly. */
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Scalar tail for in_dim not a multiple of 16. */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf; _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
256
+
257
+ /* ============================================================
258
+ * O(dim) float ops — RMSNorm, SiLU, Softmax, RoPE, residual
259
+ * ============================================================ */
260
+ static void rmsnorm(const float *x, const float *w, float *y, int dim) {
261
+ float ss = 0.0f;
262
+ for (int i = 0; i < dim; i++) ss += x[i] * x[i];
263
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
264
+ for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
265
+ }
266
+
267
+ static void silu_mul(const float *gate, const float *up, float *out, int n) {
268
+ for (int i = 0; i < n; i++)
269
+ out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
270
+ }
271
+
272
/* y += x, elementwise residual accumulate. */
static void vec_add(float *y, const float *x, int n) {
    int i = 0;
    while (i < n) {
        y[i] += x[i];
        i++;
    }
}
275
+
276
/* Rotary position embedding applied in place to one head vector.
 * Rotates ADJACENT pairs (i, i+1) by angle pos / theta^(i/dim).
 * NOTE(review): HF Qwen2/Llama rotate half-split pairs (i, i+dim/2);
 * this adjacent-pair form is only equivalent if the converter permuted
 * the q/k weight rows accordingly — confirm against the conversion script. */
static void apply_rope(float *vec, int pos, int dim, float theta) {
    for (int i = 0; i < dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / dim);
        float angle = pos * freq;
        float co = cosf(angle), si = sinf(angle);
        float v0 = vec[i], v1 = vec[i+1];
        vec[i] = v0*co - v1*si;
        vec[i+1] = v0*si + v1*co;
    }
}
286
+
287
+ static void softmax(float *x, int n) {
288
+ float mx = x[0];
289
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
290
+ float sum = 0.0f;
291
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
292
+ float inv = 1.0f / sum;
293
+ for (int i = 0; i < n; i++) x[i] *= inv;
294
+ }
295
+
296
/* Address of one head's cached vector; cache layout is
 * [layer][pos][kv_head][head_dim] with pos capacity MAX_SEQ. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
300
+
301
/* ============================================================
 * ATTENTION
 * ============================================================ */
/* One token of GQA attention for layer `layer_idx` at position `pos`.
 * Reads m->normed (pre-normed hidden); writes the result to m->attn_out. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;
    int hidden_chunks = (c->hidden + 63) / 64; /* NOTE(review): unused */
    float act_scale;

    /* Quantize normed hidden -> log-unary */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Q, K, V — log-unary matmul */
    log_unary_matvec(&L->q_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->q_float);
    log_unary_matvec(&L->k_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->k_float);
    log_unary_matvec(&L->v_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->v_float);

    /* QK-Norm (optional, per-head RMSNorm; NULL when the model has none) */
    if (L->q_norm)
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q_float + h*c->head_dim, L->q_norm, m->q_float + h*c->head_dim, c->head_dim);
    if (L->k_norm)
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k_float + h*c->head_dim, L->k_norm, m->k_float + h*c->head_dim, c->head_dim);

    /* RoPE */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h*c->head_dim, pos, c->head_dim, c->rope_theta);

    /* KV cache store */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k_float + h*c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v_float + h*c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention dot products + softmax + weighted sum */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv; /* GQA: q-head group shares one kv head */
        float *qh = m->q_float + h*c->head_dim;
        float *oh = m->attn_out + h*c->head_dim;

        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }
        softmax(m->attn_scores, pos + 1);
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue; /* skip negligible weights */
            float *vc = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection — quantize attn_out, then log-unary matmul.
     * NOTE(review): these two aligned_alloc calls happen on every token of
     * every layer; hoisting them into Model scratch would avoid the churn. */
    int o_dim = c->n_heads * c->head_dim;
    int o_chunks = (o_dim + 63) / 64;
    uint64_t *o_sign = (uint64_t *)aligned_alloc(64, o_chunks * sizeof(uint64_t));
    uint64_t *o_planes = (uint64_t *)aligned_alloc(64, (size_t)c->a_planes * o_chunks * sizeof(uint64_t));
    float o_scale;
    quantize_log_unary(m->attn_out, o_dim, c->a_planes, o_sign, o_planes, &o_scale);

    float *o_tmp = m->normed; /* reuse — normed is dead after the Q/K/V quantize */
    log_unary_matvec(&L->o_proj, o_sign, o_planes, o_scale, c->a_planes, o_tmp);
    memcpy(m->attn_out, o_tmp, c->hidden * sizeof(float));

    free(o_sign); free(o_planes);
}
380
+
381
/* ============================================================
 * MLP
 * ============================================================ */
/* SwiGLU MLP: reads m->normed, leaves the down_proj result in m->normed. */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *L = &m->layers[layer_idx];
    int hidden_chunks = (c->hidden + 63) / 64; /* NOTE(review): unused */
    int inter_chunks = (c->inter + 63) / 64;   /* NOTE(review): unused */
    float act_scale, mlp_scale;

    /* Quantize normed input */
    quantize_log_unary(m->normed, c->hidden, c->a_planes,
                       m->act_sign, m->act_planes, &act_scale);

    /* Gate + Up — log-unary */
    log_unary_matvec(&L->gate_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->gate_float);
    log_unary_matvec(&L->up_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->up_float);

    /* SiLU(gate) * up */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Quantize for down projection (uses the larger inter-sized scratch) */
    quantize_log_unary(m->mlp_act, c->inter, c->a_planes,
                       m->mlp_act_sign, m->mlp_act_planes, &mlp_scale);

    /* Down — log-unary; result overwrites m->normed for the caller */
    log_unary_matvec(&L->down_proj, m->mlp_act_sign, m->mlp_act_planes, mlp_scale, c->a_planes, m->normed);
}
409
+
410
/* ============================================================
 * FORWARD
 * ============================================================ */
/* Full forward pass for one token; returns a pointer to m->logits.
 * NOTE(review): logits are only recomputed when cfg.tie_embeddings is set —
 * for untied models the returned buffer is stale unless a separate lm_head
 * path fills it elsewhere. Confirm against the loader. */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    /* Embedding lookup into the float hidden-state buffer. */
    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Attention sub-block with residual add. */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);
        /* MLP sub-block with residual add (mlp leaves output in m->normed). */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    if (c->tie_embeddings)
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);

    return m->logits;
}
434
+
435
/* ============================================================
 * SAMPLING
 * ============================================================ */
/* Nucleus (top-p) sampling. Applies temperature (divide by T), then
 * softmax IN PLACE over logits, partially selection-sorts the
 * probabilities until cumulative mass >= top_p (hard-capped at 40
 * candidates, i.e. an implicit top-k of 40), renormalizes over the
 * kept candidates and draws one index.
 * NOTE: logits[] is clobbered (scaled + softmaxed) by this call. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: move the next-largest prob into slot n */
    int n = 0; float cum = 0.0f;
    while (cum < top_p && n < vocab) {
        int best = n;
        for (int i = n+1; i < vocab; i++) if (probs[i] > probs[best]) best = i;
        float t = probs[n]; probs[n] = probs[best]; probs[best] = t;
        int ti = indices[n]; indices[n] = indices[best]; indices[best] = ti;
        cum += probs[n]; n++;
        if (n >= 40) break;     /* cap the O(vocab) scans: effective top-k 40 */
    }
    /* Renormalize over the kept candidates and sample proportionally */
    float sum = 0; for (int i = 0; i < n; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float a = 0; int ch = indices[0];
    for (int i = 0; i < n; i++) { a += probs[i]; if (a >= r) { ch = indices[i]; break; } }
    free(probs); free(indices);
    return ch;
}
466
+
467
+ int generate(Model *m, const int *prompt, int plen, int *out, int max_new,
468
+ float temperature, float top_p, int eos) {
469
+ srand(time(NULL));
470
+ for (int i = 0; i < plen; i++) forward_token(m, prompt[i], i);
471
+ int pos = plen, gen = 0;
472
+ for (int t = 0; t < max_new; t++) {
473
+ int next;
474
+ if (temperature <= 0) {
475
+ next = 0;
476
+ for (int i = 1; i < m->cfg.vocab; i++)
477
+ if (m->logits[i] > m->logits[next]) next = i;
478
+ } else {
479
+ next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
480
+ }
481
+ out[t] = next; gen++;
482
+ if (next == eos) break;
483
+ forward_token(m, next, pos); pos++;
484
+ }
485
+ return gen;
486
+ }
487
+
488
/* ============================================================
 * ALLOCATION
 * ============================================================ */
/* Allocate a Model plus all per-token scratch buffers and the KV cache,
 * store the configuration, and print a summary banner.
 * w_planes / a_planes: number of log-unary bitplanes for weights and
 * activations. Returns a heap-allocated Model; weights are attached
 * later via the model_set_* / layer_set_* setters.
 * NOTE(review): allocation results are not checked for NULL, and
 * aligned_alloc sizes are not rounded to a multiple of the 64-byte
 * alignment (C11 requires that) — confirm target platforms accept it. */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* Full-length float KV cache: [layers][MAX_SEQ][kv_heads][head_dim] */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* m->normed is shared between hidden- and inter-sized uses */
    int max_dim = inter > hidden ? inter : hidden;
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->normed = (float *)aligned_alloc(64, max_dim * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));

    /* Unary scratch for hidden dim */
    int h_chunks = (hidden + 63) / 64;
    m->act_sign = (uint64_t *)aligned_alloc(64, h_chunks * sizeof(uint64_t));
    m->act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * h_chunks * sizeof(uint64_t));

    /* Unary scratch for intermediate dim */
    int i_chunks = (inter + 63) / 64;
    m->mlp_act_sign = (uint64_t *)aligned_alloc(64, i_chunks * sizeof(uint64_t));
    m->mlp_act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * i_chunks * sizeof(uint64_t));

    /* Largest representable magnitude level per plane count */
    int w_max = (1 << w_planes) - 1;
    int a_max = (1 << a_planes) - 1;

    printf("LOG-UNARY ENGINE\n");
    printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf(" Weight: %d log-planes -> %d levels (range -%d..+%d)\n",
           w_planes, 2*w_max+1, w_max, w_max);
    printf(" Activation: %d log-planes -> %d levels (range -%d..+%d)\n",
           a_planes, 2*a_max+1, a_max, a_max);
    printf(" Plane pairs per element: %d (vs %d linear)\n",
           w_planes * a_planes, 7 * 4);
    printf(" KV cache: %zu MB\n", kv_size * 2 * sizeof(float) / (1024*1024));

    return m;
}
552
+
553
/* Weight setters */
/* Attach the fp16 embedding table (borrowed pointer, not copied). */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
555
/* Copy the final RMSNorm weights into the model-owned buffer. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
556
+
557
+ void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
558
+ m->layers[l].input_norm = in_norm;
559
+ m->layers[l].post_norm = post_norm;
560
+ }
561
+
562
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
563
+ m->layers[l].q_norm = q_norm;
564
+ m->layers[l].k_norm = k_norm;
565
+ }
566
+
567
+ static void init_weight(LogUnaryWeight *w, uint64_t *sign, uint64_t *planes, float *scales,
568
+ int out_dim, int in_dim, int n_planes) {
569
+ w->sign_bits = sign; w->log_planes = planes; w->scales = scales;
570
+ w->out_dim = out_dim; w->in_dim = in_dim; w->n_planes = n_planes;
571
+ w->chunks = (in_dim + 63) / 64;
572
+ }
573
+
574
/* Wire up all seven linear projections of one layer from pre-packed
 * sign bits (*_s), log planes (*_p) and per-row scales (*_sc), with
 * their (out, in) dimensions. n_planes is shared by all projections.
 * All pointers are borrowed, not copied. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
    uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
    uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
    uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
    uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
    uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
    uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
    int n_planes
) {
    init_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
    init_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
    init_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
    init_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
    init_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
    init_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
    init_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
}
593
+
594
+ void model_reset_cache(Model *m) {
595
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
596
+ memset(m->k_cache, 0, kv_size * sizeof(float));
597
+ memset(m->v_cache, 0, kv_size * sizeof(float));
598
+ }
logunary_tensor.c ADDED
@@ -0,0 +1,534 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #define _POSIX_C_SOURCE 199309L
2
+ /*
3
+ * LOG-UNARY TENSOR LIBRARY
4
+ *
5
+ * Native tensor type where values are represented as:
6
+ * sign (1 bit) + log-magnitude bitplanes
7
+ *
8
+ * Plane p is set if |value| >= 2^(p - bias)
9
+ * With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B)
10
+ *
11
+ * ALL arithmetic stays in this representation:
12
+ * - matmul: AND + weighted_popcount (shift by p+q-2*bias)
13
+ * - add: bitwise merge with carry propagation
14
+ * - scale: shift planes up/down
15
+ * - negate: flip sign bits
16
+ *
17
+ * Float conversion only at boundaries (embed lookup, final logits)
18
+ *
19
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
20
+ */
21
+
22
+ #include <immintrin.h>
23
+ #include <omp.h>
24
+ #include <stdint.h>
25
+ #include <stdlib.h>
26
+ #include <string.h>
27
+ #include <math.h>
28
+ #include <stdio.h>
29
+ #include <time.h>
30
+
31
+ /* ============================================================
32
+ * LOG-UNARY TENSOR
33
+ *
34
+ * For a vector of length `dim`:
35
+ * sign: uint64[chunks] - 1 bit per element
36
+ * planes: uint64[n_planes][chunks] - 1 bit per element per plane
37
+ * chunks = (dim + 63) / 64
38
+ *
39
+ * Plane p is set if |value| >= threshold[p]
40
+ * threshold[p] = base_scale * 2^(p - bias)
41
+ *
42
+ * This is a LOG thermometer code:
43
+ * value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001)
44
+ * value=1.0 with bias=10 -> planes 0-10 set
45
+ * value=64.0 with bias=10 -> planes 0-16 set
46
+ *
47
+ * ============================================================ */
48
/* One log-unary encoded vector: a sign bitmap plus n_planes thermometer
 * bitplanes. Element i lives at bit (i % 64) of chunk i / 64. */
typedef struct {
    uint64_t *sign;     /* [chunks] - 1 bit per element, set = negative */
    uint64_t *planes;   /* [n_planes * chunks] contiguous, plane-major */
    int dim;            /* logical element count */
    int chunks;         /* (dim + 63) / 64 */
    int n_planes;
    int bias;           /* log2 offset: threshold[p] = base * 2^(p-bias) */
    float base_scale;   /* per-tensor scale factor */
} LogUnaryTensor;
57
+
58
/* 2D tensor (matrix) - row-major */
/* Plane p of row r starts at planes[((size_t)p * rows + r) * chunks]. */
typedef struct {
    uint64_t *sign;     /* [rows * chunks_per_row] */
    uint64_t *planes;   /* [n_planes * rows * chunks_per_row] */
    float *row_scales;  /* [rows] per-row base scales */
    int rows;
    int cols;
    int chunks;         /* chunks per row = (cols+63)/64 */
    int n_planes;
    int bias;
} LogUnaryMatrix;
69
+
70
+ /* ============================================================
71
+ * ALLOCATION
72
+ * ============================================================ */
73
+ LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) {
74
+ LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor));
75
+ t->dim = dim;
76
+ t->n_planes = n_planes;
77
+ t->bias = bias;
78
+ t->chunks = (dim + 63) / 64;
79
+ t->base_scale = 1.0f;
80
+ t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t));
81
+ t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t));
82
+ memset(t->sign, 0, t->chunks * sizeof(uint64_t));
83
+ memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t));
84
+ return t;
85
+ }
86
+
87
+ LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) {
88
+ LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix));
89
+ m->rows = rows;
90
+ m->cols = cols;
91
+ m->n_planes = n_planes;
92
+ m->bias = bias;
93
+ m->chunks = (cols + 63) / 64;
94
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
95
+ m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
96
+ m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float));
97
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
98
+ memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
99
+ for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f;
100
+ return m;
101
+ }
102
+
103
+ void lut_free(LogUnaryTensor *t) {
104
+ if (t) { free(t->sign); free(t->planes); free(t); }
105
+ }
106
+ void lum_free(LogUnaryMatrix *m) {
107
+ if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); }
108
+ }
109
+
110
/* ============================================================
 * FLOAT <-> LOG-UNARY CONVERSION
 * Only used at boundaries (embedding, final output)
 * ============================================================ */
/* Quantize a float vector into t (sign + thermometer log planes).
 * base_scale is chosen per call so the vector's absmax lands exactly
 * on the highest plane's threshold. Values below the lowest threshold
 * set no planes and decode back to 0. An all-zero input leaves the
 * planes cleared with base_scale = 1. */
void lut_from_float(LogUnaryTensor *t, const float *x) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(t->sign, 0, chunks * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t));

    /* Find absmax for base_scale */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) { t->base_scale = 1.0f; return; }

    /* Set base_scale so that max value uses the highest plane */
    /* threshold[np-1] = base_scale * 2^(np-1-bias) should equal amax */
    t->base_scale = amax / ldexpf(1.0f, np - 1 - bias);

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Note: -0.0f compares equal to 0.0f, so it stays positive */
        if (x[i] < 0.0f) t->sign[c] |= bit;

        float mag = fabsf(x[i]);
        /* Set planes from low to high: plane p set if mag >= base * 2^(p-bias) */
        for (int p = 0; p < np; p++) {
            float thresh = t->base_scale * ldexpf(1.0f, p - bias);
            if (mag >= thresh)
                t->planes[(size_t)p * chunks + c] |= bit;
            else
                break; /* thermometer: once we stop, all higher planes are 0 */
        }
    }
}
152
+
153
/* Dequantize t into out[dim] floats. Each element decodes from its
 * highest set plane; interior levels use the midpoint between the
 * plane's threshold and the next one to reduce rounding bias. An
 * element with no planes set decodes to exactly 0. */
void lut_to_float(const LogUnaryTensor *t, float *out) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(out, 0, dim * sizeof(float));

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Find highest set plane */
        int highest = -1;
        for (int p = np - 1; p >= 0; p--) {
            if (t->planes[(size_t)p * chunks + c] & bit) {
                highest = p;
                break;
            }
        }

        if (highest < 0) {
            out[i] = 0.0f;
        } else {
            /* Value is approximately base * 2^(highest - bias) */
            /* More precise: midpoint between this threshold and next */
            float val = t->base_scale * ldexpf(1.0f, highest - bias);
            if (highest < np - 1) {
                float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias);
                val = (val + next) * 0.5f; /* midpoint reconstruction */
            }
            out[i] = (t->sign[c] & bit) ? -val : val;
        }
    }
}
188
+
189
/* Convert float matrix to log-unary matrix (per-row scaling) */
/* `data` is row-major [rows * cols]. Each row gets its own base scale
 * (absmax-anchored to the top plane, like lut_from_float); an all-zero
 * row keeps scale 1.0 and no set planes. Plane layout is plane-major:
 * planes[((size_t)p * rows + r) * chunks + c]. */
void lum_from_float(LogUnaryMatrix *m, const float *data) {
    int rows = m->rows, cols = m->cols;
    int np = m->n_planes, bias = m->bias;
    int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;

        /* Per-row absmax */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; }
        m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias);

        uint64_t *row_sign = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);

            if (row[j] < 0.0f) row_sign[c] |= bit;

            float mag = fabsf(row[j]);
            /* Thermometer fill from the lowest plane upward */
            for (int p = 0; p < np; p++) {
                float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias);
                if (mag >= thresh)
                    m->planes[((size_t)p * rows + r) * chunks + c] |= bit;
                else
                    break;
            }
        }
    }
}
229
+
230
/* ============================================================
 * LOG-UNARY MATMUL: y = M @ x
 *
 * Both M (matrix) and x (vector) are log-unary encoded.
 *
 * For each output element y[i]:
 *   For each weight plane p, activation plane q:
 *     active = M.planes[p][i] AND x.planes[q]
 *     same   = active AND ~(M.sign[i] XOR x.sign)
 *     diff   = active AND  (M.sign[i] XOR x.sign)
 *     contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias)
 *
 * Output is a LogUnaryTensor (converted from integer accumulator)
 * ============================================================ */
/* Rows are parallelized with OpenMP; each row accumulates into a
 * private int64, so no synchronization is needed. The result is
 * materialized to float once, then requantized into y_out (which must
 * have dim == M->rows). */
void lum_matvec(
    const LogUnaryMatrix *M,
    const LogUnaryTensor *x,
    LogUnaryTensor *y_out /* output: log-unary encoded result */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wp = M->n_planes;
    int xp = x->n_planes;
    int w_bias = M->bias;
    int x_bias = x->bias;

    /* Accumulate to float temporarily, then requantize to log-unary.
     * The accumulator is integer shifts (2^(p+q-2bias)), which
     * we can do as int64 left-shifts for small exponents.
     *
     * For the exponent range we're in (p+q in [0,14] with bias ~4),
     * net shift is [-8, 6], so we use a fixed-point int64 accumulator
     * with a base shift to keep everything positive.
     */
    int base_shift = w_bias + x_bias; /* shift to add to make all exponents >= 0 */

    /* We'll accumulate as int64 with implicit 2^(-base_shift) factor */
    /* Then convert: float_val = acc * row_scale * x_scale * 2^(-base_shift) */

    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_plane = x->planes[(size_t)q * chunks + c];
                    uint64_t active = w_plane & x_plane;
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;

                    int count = __builtin_popcountll(pos) - __builtin_popcountll(neg);

                    /* Weighted by 2^(p + q) relative to base */
                    int shift = p + q; /* relative to 2^(-base_shift) */
                    if (count != 0)
                        acc += (long long)count << shift;
                }
            }
        }

        /* Convert: val = acc * row_scale * x_scale * 2^(-base_shift) */
        y_float[i] = (float)acc * M->row_scales[i] * x->base_scale
                     * ldexpf(1.0f, -base_shift);
    }

    /* Requantize float result to log-unary */
    lut_from_float(y_out, y_float);
    free(y_float);
}
310
+
311
+ /* ============================================================
312
+ * LOG-UNARY ELEMENT-WISE ADD: z = a + b
313
+ *
314
+ * Dequant both, add as float, requant.
315
+ * This is O(dim) so not the bottleneck.
316
+ * Future: direct bitwise add with carry chains.
317
+ * ============================================================ */
318
+ void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) {
319
+ int dim = a->dim;
320
+ float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
321
+ float *fb = (float *)aligned_alloc(64, dim * sizeof(float));
322
+
323
+ lut_to_float(a, fa);
324
+ lut_to_float(b, fb);
325
+
326
+ for (int i = 0; i < dim; i++) fa[i] += fb[i];
327
+
328
+ lut_from_float(out, fa);
329
+ free(fa); free(fb);
330
+ }
331
+
332
+ /* In-place add: a += b (dequant a, add float b, requant) */
333
+ void lut_add_float(LogUnaryTensor *a, const float *b) {
334
+ int dim = a->dim;
335
+ float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
336
+ lut_to_float(a, fa);
337
+ for (int i = 0; i < dim; i++) fa[i] += b[i];
338
+ lut_from_float(a, fa);
339
+ free(fa);
340
+ }
341
+
342
+ /* ============================================================
343
+ * LOG-UNARY RMSNORM
344
+ *
345
+ * Needs float for the sqrt/reciprocal, but O(dim).
346
+ * Input: log-unary, Output: log-unary
347
+ * ============================================================ */
348
+ void lut_rmsnorm(
349
+ const LogUnaryTensor *x,
350
+ const float *weight, /* norm weights stay float (tiny) */
351
+ LogUnaryTensor *out,
352
+ float eps
353
+ ) {
354
+ int dim = x->dim;
355
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
356
+ lut_to_float(x, xf);
357
+
358
+ float ss = 0.0f;
359
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
360
+ float rms = 1.0f / sqrtf(ss / dim + eps);
361
+
362
+ for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i];
363
+
364
+ lut_from_float(out, xf);
365
+ free(xf);
366
+ }
367
+
368
+ /* ============================================================
369
+ * LOG-UNARY SILU_MUL: out = SiLU(gate) * up
370
+ *
371
+ * O(dim), not bottleneck. Dequant, compute, requant.
372
+ * ============================================================ */
373
+ void lut_silu_mul(
374
+ const LogUnaryTensor *gate,
375
+ const LogUnaryTensor *up,
376
+ LogUnaryTensor *out
377
+ ) {
378
+ int dim = gate->dim;
379
+ float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
380
+ float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
381
+
382
+ lut_to_float(gate, gf);
383
+ lut_to_float(up, uf);
384
+
385
+ for (int i = 0; i < dim; i++)
386
+ gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
387
+
388
+ lut_from_float(out, gf);
389
+ free(gf); free(uf);
390
+ }
391
+
392
/* ============================================================
 * LOG-UNARY ROPE
 *
 * O(dim), dequant-compute-requant per head.
 * ============================================================ */
/* Apply rotary position embedding to one head-sized slice of t.
 * offset: token position; start: slice start index; head_dim: slice
 * length; theta: RoPE base. NOTE(review): the whole tensor is
 * requantized afterwards, so base_scale (and thus rounding) can shift
 * slightly even outside [start, start+head_dim) — confirm acceptable. */
void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) {
    /* Dequant the relevant slice, apply RoPE, requant */
    float *f = (float *)aligned_alloc(64, head_dim * sizeof(float));

    /* Extract slice */
    float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
    lut_to_float(t, full);
    memcpy(f, full + start, head_dim * sizeof(float));

    /* Rotate consecutive (even, odd) pairs by angle = pos * theta^(-i/head_dim) */
    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / head_dim);
        float angle = offset * freq;
        float c = cosf(angle), s = sinf(angle);
        float v0 = f[i], v1 = f[i + 1];
        f[i] = v0 * c - v1 * s;
        f[i + 1] = v0 * s + v1 * c;
    }

    memcpy(full + start, f, head_dim * sizeof(float));
    lut_from_float(t, full);
    free(f); free(full);
}
419
+
420
+ /* ============================================================
421
+ * UTILITY: Get float slice from log-unary tensor
422
+ * (for attention scores which need float softmax)
423
+ * ============================================================ */
424
+ void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) {
425
+ float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
426
+ lut_to_float(t, full);
427
+ memcpy(out, full + start, len * sizeof(float));
428
+ free(full);
429
+ }
430
+
431
/* ============================================================
 * BENCHMARK: measure matvec throughput
 * ============================================================ */
typedef struct {
    double total_and_ops;      /* AND ops issued per matvec call */
    double total_popcount_ops; /* popcount ops issued per matvec call */
    double wall_time_s;        /* mean wall time per matvec call */
    double elements_per_sec;   /* rows*cols elements processed per second */
    double gops;               /* giga-operations per second */
} BenchResult;
442
/* Benchmark lum_matvec on random bits: allocates a rows x cols matrix
 * with w_planes planes and a cols vector with x_planes planes, runs one
 * warmup call, then `iters` timed calls under CLOCK_MONOTONIC.
 * NOTE(review): the random fill uses ((uint64_t)rand() << 32) | rand();
 * with a 31-bit rand() the top bit of each word is never set — fine for
 * throughput, but the bit density is slightly below 50%. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* Fill with random bits */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warmup */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; /* AND + popcount pairs */

    BenchResult r;
    r.wall_time_s = dt / iters;
    r.total_and_ops = ops_per_call;
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}
482
+
483
/* ============================================================
 * ACCURACY TEST: convert float->logunary->float roundtrip
 * ============================================================ */
typedef struct {
    float max_error;   /* worst-case absolute roundtrip error */
    float mean_error;  /* mean absolute roundtrip error */
    float cosine_sim;  /* cosine similarity, original vs recovered */
    float snr_db;      /* signal-to-noise ratio in dB */
} AccuracyResult;
493
+ AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) {
494
+ float *original = (float *)aligned_alloc(64, dim * sizeof(float));
495
+ float *recovered = (float *)aligned_alloc(64, dim * sizeof(float));
496
+
497
+ /* Random normal-ish distribution */
498
+ for (int i = 0; i < dim; i++) {
499
+ float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
500
+ float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
501
+ original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
502
+ }
503
+
504
+ LogUnaryTensor *t = lut_alloc(dim, n_planes, bias);
505
+ lut_from_float(t, original);
506
+ lut_to_float(t, recovered);
507
+
508
+ float max_err = 0, sum_err = 0;
509
+ float dot = 0, na = 0, nb = 0;
510
+ for (int i = 0; i < dim; i++) {
511
+ float err = fabsf(original[i] - recovered[i]);
512
+ if (err > max_err) max_err = err;
513
+ sum_err += err;
514
+ dot += original[i] * recovered[i];
515
+ na += original[i] * original[i];
516
+ nb += recovered[i] * recovered[i];
517
+ }
518
+
519
+ float noise_power = 0;
520
+ for (int i = 0; i < dim; i++) {
521
+ float e = original[i] - recovered[i];
522
+ noise_power += e * e;
523
+ }
524
+
525
+ AccuracyResult r;
526
+ r.max_error = max_err;
527
+ r.mean_error = sum_err / dim;
528
+ r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
529
+ r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f));
530
+
531
+ lut_free(t);
532
+ free(original); free(recovered);
533
+ return r;
534
+ }
packed_convert.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Packed unary converter: uint8 magnitudes + bitpacked signs + per-row scales."""
3
+ import os, json, sys, time
4
+ import numpy as np
5
+ from pathlib import Path
6
+
7
def load_safetensors(model_dir):
    """Load every *.safetensors shard under model_dir.

    Returns a dict mapping tensor name -> float32 numpy array.
    Requires the third-party `safetensors` package (imported lazily so
    the converter's other entry points work without it).
    """
    from safetensors.torch import load_file
    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f" Loading {f.name}...")
        for k, v in load_file(str(f)).items():
            tensors[k] = v.float().numpy()  # upcast to float32 before numpy export
    return tensors
15
+
16
def quantize_packed(w, n_levels=7):
    """Quantize a 2-D float weight matrix to packed-unary form.

    Returns (mags, sign_bits, scales, row_max_mag, mean_mag, zero_frac):
      mags        uint8  [out, in]      magnitude 0..n_levels per weight
      sign_bits   uint64 [out, chunks]  bit j set iff weight j is negative
      scales      float32 [out]         per-row dequantization scale
      row_max_mag uint8  [out]          largest magnitude in each row
      mean_mag    float                 mean magnitude over all weights
      zero_frac   float                 fraction of exactly-zero magnitudes
    """
    out_dim, in_dim = w.shape
    chunks = (in_dim + 63) // 64
    padded = chunks * 64

    # Per-row absmax -> scale so the largest |weight| maps to n_levels.
    row_max = np.abs(w).max(axis=1, keepdims=True)
    row_max = np.where(row_max == 0, 1.0, row_max)
    scales = (row_max.ravel() / n_levels).astype(np.float32)

    # Rounded magnitudes, clipped to the representable range.
    mags = np.clip(np.round(np.abs(w / scales[:, None])), 0, n_levels).astype(np.uint8)
    neg_mask = w < 0
    row_max_mag = mags.max(axis=1).astype(np.uint8)

    # Pad the sign mask out to a whole number of 64-bit chunks.
    if in_dim < padded:
        padded_mask = np.zeros((out_dim, padded), dtype=bool)
        padded_mask[:, :in_dim] = neg_mask
    else:
        padded_mask = neg_mask
    bit_pos = np.uint64(1) << np.arange(64, dtype=np.uint64)
    sign_bits = np.bitwise_or.reduce(
        padded_mask.reshape(out_dim, chunks, 64).astype(np.uint64) * bit_pos, axis=2)

    return mags, sign_bits, scales, row_max_mag, np.mean(mags), np.mean(mags == 0)
34
+
35
def convert(tensors, output_dir, n_levels=7):
    """Quantize the linear projection weights to packed-unary files and
    dump everything else as raw fp16, plus config.json and manifest.json.

    NOTE(review): the config dict is hardcoded for
    DeepSeek-R1-Distill-Qwen-1.5B regardless of which model was loaded —
    confirm before converting other checkpoints.
    """
    os.makedirs(output_dir, exist_ok=True)
    config = {"hidden_size":1536,"intermediate_size":8960,"num_attention_heads":12,
              "num_key_value_heads":2,"num_hidden_layers":28,"vocab_size":151936,
              "head_dim":128,"rope_theta":1000000.0,"rms_norm_eps":1e-6,
              "n_levels":n_levels,"quant_type":"packed_unary"}
    # Only the seven transformer projections get quantized.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight','k_proj.weight','v_proj.weight','o_proj.weight',
         'gate_proj.weight','up_proj.weight','down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)
    total_packed = total_orig = 0
    all_avg = []
    for key in linear_keys:
        w = tensors[key]; total_orig += w.nbytes
        t0 = time.time()
        mags, sb, sc, rmm, am, sp = quantize_packed(w, n_levels)
        dt = time.time() - t0
        # One file per component, named after the tensor with dots -> underscores.
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        mags.tofile(pfx+".mags"); sb.tofile(pfx+".signs")
        sc.tofile(pfx+".scales"); rmm.tofile(pfx+".rmm")
        ub = mags.nbytes + sb.nbytes + sc.nbytes + rmm.nbytes
        total_packed += ub; all_avg.append(am)
        print(f" {key}: {w.shape} -> {ub/1024:.0f}KB (avg_mag={am:.2f}, {dt:.1f}s)")
    # Embeddings, norms, biases etc. stay fp16.
    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        pfx = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(pfx+".fp16"); total_fp16 += w.nbytes
    manifest = {"packed":{k:list(tensors[k].shape) for k in linear_keys},
                "fp16":{k:list(tensors[k].shape) for k in other_keys}}
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)
    print(f"\n=== PACKED UNARY ===")
    print(f"Packed linear: {total_packed/1e6:.1f} MB | FP16 other: {total_fp16/1e6:.1f} MB")
    print(f"Total: {(total_packed+total_fp16)/1e6:.1f} MB | Avg mag: {np.mean(all_avg):.3f}")
    print(f"Expected speedup vs 7-plane: {7/np.mean(all_avg):.1f}x")
73
+
74
if __name__ == "__main__":
    # CLI: packed_convert.py [model_dir] [output_dir]
    model_dir = "deepseek-r1-1.5b-hf" if len(sys.argv) <= 1 else sys.argv[1]
    output_dir = "deepseek-r1-1.5b-packed" if len(sys.argv) <= 2 else sys.argv[2]
    tensors = load_safetensors(model_dir)
    convert(tensors, output_dir)
    print("Done!")
packed_engine.c ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PACKED UNARY TRANSFORMER ENGINE - AVX-512 + OpenMP
3
+ *
4
+ * Instead of 7 fixed bitplanes (scanning 80% zeros),
5
+ * store magnitude per weight directly. Kernel processes
6
+ * groups of 16 weights, only loops to local max magnitude.
7
+ *
8
+ * Weight j with magnitude 3: adds x[j] THREE times (pure unary).
9
+ * But only 3 passes for that group, not 7.
10
+ *
11
+ * Average magnitude = 1.374, so average ~1.4 passes per group
12
+ * instead of always 7. That's the 5x speedup.
13
+ *
14
+ * Format per output row:
15
+ * mags[in_dim] uint8 - magnitude 0-7 per weight
16
+ * signs[chunks] uint64 - bitpacked sign (1=negative)
17
+ * scale float - per-row scale
18
+ *
19
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
20
+ */
21
+
22
+ #include <immintrin.h>
23
+ #include <stdint.h>
24
+ #include <stdlib.h>
25
+ #include <string.h>
26
+ #include <math.h>
27
+ #include <stdio.h>
28
+ #include <time.h>
29
+ #include <omp.h>
30
+
31
+ #define HIDDEN 1536
32
+ #define INTER 8960
33
+ #define N_HEADS 12
34
+ #define N_KV_HEADS 2
35
+ #define HEAD_DIM 128
36
+ #define N_LAYERS 28
37
+ #define VOCAB 151936
38
+ #define RMS_EPS 1e-6f
39
+ #define ROPE_THETA 1000000.0f
40
+ #define MAX_SEQ 4096
41
+ #define GQA_RATIO (N_HEADS / N_KV_HEADS)
42
+
43
+ typedef struct {
44
+ uint8_t *mags; /* [out_dim * in_dim] magnitude per weight */
45
+ uint64_t *sign_bits; /* [out_dim * chunks] bitpacked signs */
46
+ float *scales; /* [out_dim] per-row scale */
47
+ float *bias; /* [out_dim] or NULL */
48
+ int out_dim, in_dim;
49
+ uint8_t *row_maxmag; /* [out_dim] max magnitude per row for early exit */
50
+ } PL; /* Packed Linear */
51
+
52
+ typedef struct { uint16_t *w; int od, id; } FL;
53
+
54
+ typedef struct {
55
+ PL qp, kp, vp, op, gp, up, dp;
56
+ float *in_norm, *pn_norm;
57
+ float *qb, *kb, *vb;
58
+ } Lay;
59
+
60
+ typedef struct {
61
+ uint16_t *emb;
62
+ Lay lay[N_LAYERS];
63
+ float *fnorm;
64
+ FL lmh;
65
+ float *kc, *vc;
66
+ float *h, *h2;
67
+ float *sq, *sk, *sv, *ao;
68
+ float *sg, *su, *sd;
69
+ float *lg, *as;
70
+ } M;
71
+
72
+ /* ============================================================
73
+ * PACKED UNARY MATVEC
74
+ *
75
+ * Process 16 weights at a time. For each group:
76
+ * 1. Load 16 magnitudes (uint8)
77
+ * 2. Find local max magnitude
78
+ * 3. For m = 1 to local_max:
79
+ * mask = (mag >= m)
80
+ * pos_mask = mask & ~sign
81
+ * neg_mask = mask & sign
82
+ * acc += masked x (pos)
83
+ * acc -= masked x (neg)
84
+ *
85
+ * Each pass = one unary "mark". Pure base-1.
86
+ * Groups where all mags <= 1: ONE pass.
87
+ * Groups where all mags == 0: ZERO passes. Skip entirely.
88
+ * ============================================================ */
89
+ static void pmv(const PL *L, const float *x, float *y) {
90
+ const int od = L->out_dim, id = L->in_dim;
91
+ const int chunks = (id + 63) / 64;
92
+ const int id16 = (id + 15) & ~15;
93
+
94
+ float *xp = (float*)aligned_alloc(64, id16 * sizeof(float));
95
+ memcpy(xp, x, id * sizeof(float));
96
+ if (id16 > id) memset(xp + id, 0, (id16 - id) * sizeof(float));
97
+
98
+ #pragma omp parallel for schedule(dynamic, 64)
99
+ for (int i = 0; i < od; i++) {
100
+ const uint8_t *row_mag = L->mags + (size_t)i * id;
101
+ const uint64_t *row_sign = L->sign_bits + (size_t)i * chunks;
102
+ const int rmax = L->row_maxmag[i];
103
+
104
+ __m512 acc = _mm512_setzero_ps();
105
+
106
+ for (int j = 0; j < id; j += 16) {
107
+ if (j >= id16) break;
108
+
109
+ /* Load 16 magnitudes */
110
+ __m128i mv = _mm_loadu_si128((__m128i*)(row_mag + j));
111
+
112
+ /* Quick check: if all 16 mags are zero, skip entirely */
113
+ if (_mm_testz_si128(mv, mv)) continue;
114
+
115
+ __m512 xv = _mm512_load_ps(xp + j);
116
+
117
+ /* Extract 16 sign bits from bitpacked array */
118
+ int chunk_idx = j / 64;
119
+ int bit_off = j % 64;
120
+ uint64_t sbits = row_sign[chunk_idx];
121
+ uint16_t signs = (uint16_t)((sbits >> bit_off) & 0xFFFF);
122
+
123
+ /* Find max magnitude in this group of 16 */
124
+ /* Use SSE horizontal max */
125
+ __m128i mx = mv;
126
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 8));
127
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 4));
128
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 2));
129
+ mx = _mm_max_epu8(mx, _mm_srli_si128(mx, 1));
130
+ int local_max = _mm_extract_epi8(mx, 0);
131
+
132
+ /* Threshold vector for comparisons */
133
+ for (int m = 1; m <= local_max; m++) {
134
+ /* mask = (mag >= m) */
135
+ __m128i thresh = _mm_set1_epi8((char)m);
136
+ /* Compare: result is 0xFF where mag >= m, 0 otherwise */
137
+ /* SSE doesn't have >= for uint8, use: NOT(max(thresh, mag) == thresh XOR mag == thresh) */
138
+ /* Simpler: mag >= m iff mag - m doesn't underflow, i.e. saturating sub == 0 is false */
139
+ /* Or: max(mag, thresh) == mag means mag >= thresh */
140
+ __m128i cmp = _mm_cmpeq_epi8(_mm_max_epu8(mv, thresh), mv);
141
+ uint16_t active = (uint16_t)_mm_movemask_epi8(cmp);
142
+
143
+ __mmask16 pos = (__mmask16)(active & ~signs);
144
+ __mmask16 neg = (__mmask16)(active & signs);
145
+
146
+ acc = _mm512_mask_add_ps(acc, pos, acc, xv);
147
+ acc = _mm512_mask_sub_ps(acc, neg, acc, xv);
148
+ }
149
+ }
150
+
151
+ y[i] = _mm512_reduce_add_ps(acc) * L->scales[i];
152
+ if (L->bias) y[i] += L->bias[i];
153
+ }
154
+ free(xp);
155
+ }
156
+
157
+ /* FP16 matvec for lm_head */
158
+ static void fmv(const FL *L, const float *x, float *y) {
159
+ #pragma omp parallel for schedule(dynamic, 256)
160
+ for (int i = 0; i < L->od; i++) {
161
+ __m512 acc = _mm512_setzero_ps();
162
+ const uint16_t *row = L->w + (size_t)i * L->id;
163
+ int j;
164
+ for (j = 0; j + 16 <= L->id; j += 16) {
165
+ __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
166
+ acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
167
+ }
168
+ float s = _mm512_reduce_add_ps(acc);
169
+ for (; j < L->id; j++) {
170
+ float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
171
+ s += wf * x[j];
172
+ }
173
+ y[i] = s;
174
+ }
175
+ }
176
+
177
+ /* RMSNorm */
178
+ static void rn(const float *x, const float *w, float *y, int d) {
179
+ __m512 sq = _mm512_setzero_ps();
180
+ int i;
181
+ for (i = 0; i+16 <= d; i += 16) {
182
+ __m512 v = _mm512_loadu_ps(x+i);
183
+ sq = _mm512_fmadd_ps(v, v, sq);
184
+ }
185
+ float ss = _mm512_reduce_add_ps(sq);
186
+ for (; i < d; i++) ss += x[i]*x[i];
187
+ float r = 1.0f / sqrtf(ss/d + RMS_EPS);
188
+ __m512 rv = _mm512_set1_ps(r);
189
+ for (i = 0; i+16 <= d; i += 16)
190
+ _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
191
+ _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
192
+ for (; i < d; i++) y[i] = x[i]*r*w[i];
193
+ }
194
+
195
+ static void silu(float *x, int n) {
196
+ for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
197
+ }
198
/* Elementwise product: c = a * b. */
static void emul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    while (k + 16 <= n) {
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
        k += 16;
    }
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
204
/* In-place vector accumulate: y += x. */
static void va(float *y, const float *x, int n) {
    int k = 0;
    while (k + 16 <= n) {
        __m512 sum = _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k));
        _mm512_storeu_ps(y + k, sum);
        k += 16;
    }
    while (k < n) { y[k] += x[k]; k++; }
}
210
+ static void rope(float *v, int pos, int d) {
211
+ for (int i = 0; i < d; i += 2) {
212
+ float f = 1.0f / powf(ROPE_THETA, (float)i/d);
213
+ float a = pos*f, co = cosf(a), si = sinf(a);
214
+ float v0 = v[i], v1 = v[i+1];
215
+ v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
216
+ }
217
+ }
218
+ static void sm(float *x, int n) {
219
+ float mx = x[0];
220
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
221
+ float s = 0;
222
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
223
+ float iv = 1.0f/s;
224
+ for (int i = 0; i < n; i++) x[i] *= iv;
225
+ }
226
/* Embedding lookup: decode token t's FP16 embedding row into o[HIDDEN]
 * as float32.  16 halves per vector step; scalar tail covers HIDDEN % 16. */
static void etok(const M *m, int t, float *o) {
    const uint16_t *r = m->emb + (size_t)t * HIDDEN;
    int i;
    for (i = 0; i+16 <= HIDDEN; i += 16)
        _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
    /* Tail: broadcast one half, convert, store the low lane. */
    for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
}
233
/* Address of the KV-cache vector for (layer l, position p, kv-head h);
 * cache layout is [layer][position][kv_head][HEAD_DIM] floats. */
static float* kvp(float *c, int l, int p, int h) {
    return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
}
236
+
237
/* One attention block at position `pos` for layer `l`.
 * Input: m->h2 holds the RMS-normed hidden state.  Output: m->h2 is
 * overwritten with the o_proj result (the caller adds the residual).
 *
 * Steps: packed-unary Q/K/V projections (+optional biases), RoPE on Q
 * and K, append K/V for this position to the cache, then per-head
 * scaled-dot-product attention over positions 0..pos with grouped-query
 * attention (GQA_RATIO query heads share one KV head). */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* Q/K/V projections from the normed hidden state. */
    pmv(&ly->qp, m->h2, m->sq);
    pmv(&ly->kp, m->h2, m->sk);
    pmv(&ly->vp, m->h2, m->sv);
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* Rotary embedding per head, then cache this position's K and V. */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM);    /* 1/sqrt(d_k) score scale */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        int kvh = h / GQA_RATIO;               /* KV head shared by this query head */
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* Scores q·k for every cached position (SIMD dot + scalar tail). */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* Weighted sum of cached values; near-zero weights are skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* Output projection back to HIDDEN, left in m->h2 for the residual add. */
    pmv(&ly->op, m->ao, m->h2);
}
280
+
281
+ static void do_mlp(M *m, int l) {
282
+ Lay *ly = &m->lay[l];
283
+ pmv(&ly->gp, m->h2, m->sg);
284
+ pmv(&ly->up, m->h2, m->su);
285
+ silu(m->sg, INTER);
286
+ emul(m->sg, m->su, m->sd, INTER);
287
+ pmv(&ly->dp, m->sd, m->h2);
288
+ }
289
+
290
/* Full forward pass for one token id `tid` at sequence position `pos`.
 * Pre-norm residual architecture: each layer adds its attention and MLP
 * outputs back into the residual stream m->h.  Returns a pointer to the
 * model-owned logits buffer (valid until the next call). */
float* forward_token(M *m, int tid, int pos) {
    etok(m, tid, m->h);
    for (int l = 0; l < N_LAYERS; l++) {
        rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
        do_attn(m, l, pos);
        va(m->h, m->h2, HIDDEN);            /* residual add: attention */
        rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
        do_mlp(m, l);
        va(m->h, m->h2, HIDDEN);            /* residual add: MLP */
    }
    rn(m->h, m->fnorm, m->h2, HIDDEN);
    fmv(&m->lmh, m->h2, m->lg);             /* project to vocabulary logits */
    return m->lg;
}
304
+
305
/* Temperature + top-p (nucleus) sampling over logits lg[0..V).
 * Mutates lg in place (temperature scale, then softmax).  The nucleus is
 * built by repeatedly selecting the current max probability until the
 * cumulative mass reaches `tp`, hard-capped at 50 candidates; one token
 * is then drawn from the renormalized nucleus with rand().
 * Returns the chosen token id. */
static int samp(float *lg, int V, float T, float tp) {
    /* Temperature: divide logits by T (multiply by 1/T) before softmax. */
    if (T > 0) { float it = 1.0f/T; for (int i = 0; i < V; i++) lg[i] *= it; }
    sm(lg, V);
    float *pr = (float*)malloc(V*4); int *ix = (int*)malloc(V*4);
    memcpy(pr, lg, V*4);
    for (int i = 0; i < V; i++) ix[i] = i;
    /* Partial selection sort: pull the largest remaining prob into slot nk
     * each round; stops at tp mass, 50 candidates, or vocab exhaustion. */
    float cum = 0; int nk = 0;
    while (cum < tp && nk < V && nk < 50) {
        int b = nk;
        for (int i = nk+1; i < V; i++) if (pr[i] > pr[b]) b = i;
        float t = pr[nk]; pr[nk] = pr[b]; pr[b] = t;
        int ti = ix[nk]; ix[nk] = ix[b]; ix[b] = ti;
        cum += pr[nk]; nk++;
    }
    /* Renormalize over the kept nucleus and draw proportionally. */
    float s = 0; for (int i = 0; i < nk; i++) s += pr[i];
    float r = (float)rand()/RAND_MAX * s, ac = 0;
    int ch = ix[0];
    for (int i = 0; i < nk; i++) { ac += pr[i]; if (ac >= r) { ch = ix[i]; break; } }
    free(pr); free(ix);
    return ch;
}
326
+
327
/* Autoregressive generation.  Feeds the `pl` prompt tokens, then samples
 * up to `mx` new tokens into `out`: greedy argmax when T <= 0, otherwise
 * temperature/top-p via samp().  The eos token, if produced, is stored
 * before stopping.  Returns the number of tokens written to `out`. */
int generate(M *m, const int *pr, int pl, int *out, int mx,
             float T, float tp, int eos) {
    srand(time(NULL));                  /* reseed sampling RNG per call */
    /* Prefill: after this loop m->lg holds the last prompt token's logits. */
    for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);
    int pos = pl, gen = 0;
    for (int t = 0; t < mx; t++) {
        int nx;
        if (T <= 0) {
            nx = 0;                     /* greedy: argmax over the vocabulary */
            for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
        } else {
            nx = samp(m->lg, VOCAB, T, tp);
        }
        out[t] = nx; gen++;
        if (nx == eos) break;
        forward_token(m, nx, pos); pos++;
    }
    return gen;
}
346
+
347
/* Allocate the model shell: KV cache, activation scratch and logits buffer.
 * Weight pointers are attached later via model_set_* / layer_set_*.
 * NOTE(review): allocation results are not checked for NULL. */
M* model_alloc(void) {
    M *m = (M*)calloc(1, sizeof(M));
    /* Floats in ONE of the two KV caches (all layers, positions, kv heads). */
    size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
    m->h = (float*)aligned_alloc(64,HIDDEN*4);       /* residual stream */
    m->h2 = (float*)aligned_alloc(64,HIDDEN*4);      /* normed/projected scratch */
    m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
    m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
    m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);   /* attention output */
    m->sg = (float*)aligned_alloc(64,INTER*4);       /* MLP gate scratch */
    m->su = (float*)aligned_alloc(64,INTER*4);       /* MLP up scratch */
    m->sd = (float*)aligned_alloc(64,INTER*4);       /* MLP down input */
    m->lg = (float*)aligned_alloc(64,VOCAB*4);       /* output logits */
    m->as = (float*)aligned_alloc(64,MAX_SEQ*4);     /* attention scores */
    m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);   /* final norm weights (copied in) */
    printf("Alloc: KV=%zuMB\n", kv*2*4/1024/1024);   /* k + v caches, 4 bytes/elem */
    return m;
}
366
+
367
/* ---- Loader-facing API: attach weight buffers owned by the caller. ---- */

/* Token embedding table (FP16); pointer is borrowed, not copied. */
void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
/* Final RMSNorm weights; copied into the model's own buffer. */
void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
/* LM head as an FP16 linear of shape [o x i]; pointer is borrowed. */
void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
    m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
}
/* Per-layer input / post-attention RMSNorm weight pointers. */
void layer_set_norms(M *m, int l, float *i, float *p) {
    m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
}
/* Optional Q/K/V bias pointers (pass NULL for bias-free checkpoints). */
void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
    m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
}
/* Fill one packed-linear descriptor; the linear's bias is always NULL here
 * (Q/K/V biases are applied separately via layer_set_bias). */
void set_pl(PL *p, uint8_t *mags, uint64_t *signs, float *scales,
            uint8_t *rmm, int od, int id) {
    p->mags = mags; p->sign_bits = signs; p->scales = scales;
    p->row_maxmag = rmm; p->out_dim = od; p->in_dim = id; p->bias = NULL;
}
/* Attach all seven packed linears of layer l in the fixed order
 * q,k,v,o,gate,up,down; each contributes the 6-tuple
 * (mags, signs, scales, row_maxmag, out_dim, in_dim). */
void layer_set_linears(M *m, int l,
    uint8_t*qm,uint64_t*qs,float*qc,uint8_t*qx,int qo,int qi,
    uint8_t*km,uint64_t*ks,float*kc,uint8_t*kx,int ko,int ki,
    uint8_t*vm,uint64_t*vs,float*vc,uint8_t*vx,int vo,int vi,
    uint8_t*om,uint64_t*os_,float*oc,uint8_t*ox,int oo,int oi,
    uint8_t*gm,uint64_t*gs,float*gc,uint8_t*gx,int go,int gi,
    uint8_t*um,uint64_t*us,float*uc,uint8_t*ux,int uo,int ui,
    uint8_t*dm,uint64_t*ds,float*dc,uint8_t*dx,int doo,int di) {
    set_pl(&m->lay[l].qp,qm,qs,qc,qx,qo,qi);
    set_pl(&m->lay[l].kp,km,ks,kc,kx,ko,ki);
    set_pl(&m->lay[l].vp,vm,vs,vc,vx,vo,vi);
    set_pl(&m->lay[l].op,om,os_,oc,ox,oo,oi);
    set_pl(&m->lay[l].gp,gm,gs,gc,gx,go,gi);
    set_pl(&m->lay[l].up,um,us,uc,ux,uo,ui);
    set_pl(&m->lay[l].dp,dm,ds,dc,dx,doo,di);
}
/* Zero the whole KV cache before a fresh generation. */
void model_reset_cache(M *m) {
    size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
    memset(m->kc,0,kv*4); memset(m->vc,0,kv*4);
}
/* Free engine-owned buffers; weight pointers stay owned by the loader. */
void model_free(M *m) {
    free(m->kc);free(m->vc);free(m->h);free(m->h2);
    free(m->sq);free(m->sk);free(m->sv);free(m->ao);
    free(m->sg);free(m->su);free(m->sd);
    free(m->lg);free(m->as);free(m->fnorm);free(m);
}
packed_loader.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Packed unary loader. Loads weights, passes pointers to C engine."""
3
+ import ctypes, os, sys, time, json
4
+ import numpy as np
5
+ from ctypes import c_int, c_float, c_void_p, POINTER, c_uint8, c_uint64
6
+
7
class PackedEngine:
    """ctypes front-end for the packed-unary C inference engine.

    Loads converted weight files from ``model_dir`` and registers raw
    buffer pointers with the shared library.  Two invariants matter:

    * every numpy array handed to C is appended to ``self.arrays`` so the
      buffer outlives the C-side pointer (no GC while inference runs);
    * every pointer crosses the ABI wrapped in ``c_void_p`` — without
      declared ``argtypes``, ctypes marshals a bare Python int as a
      32-bit C int, silently truncating 64-bit heap addresses.
    """

    def __init__(self, model_dir, engine_path="./packed_engine.so"):
        self.lib = ctypes.CDLL(engine_path)
        self.lib.model_alloc.restype = c_void_p
        self.lib.forward_token.restype = POINTER(c_float)
        self.model_dir = model_dir

        with open(os.path.join(model_dir, "manifest.json")) as f:
            self.manifest = json.load(f)
        with open(os.path.join(model_dir, "config.json")) as f:
            self.config = json.load(f)

        self.arrays = []  # prevent GC of buffers the C side points into
        # Keep the opaque handle as c_void_p so it round-trips untruncated.
        self.model = c_void_p(self.lib.model_alloc())
        self._load_weights()

    def _keep(self, arr):
        """Pin *arr* for the engine's lifetime; return its base address as c_void_p."""
        self.arrays.append(arr)
        return c_void_p(arr.ctypes.data)

    def _load_file(self, key, ext, dtype):
        """Read the raw binary dump for tensor *key* (dots become underscores)."""
        path = os.path.join(self.model_dir, key.replace(".", "_") + ext)
        return np.fromfile(path, dtype=dtype)

    def _load_weights(self):
        """Load every tensor in the manifest and hand pointers to the C engine."""
        t0 = time.time()
        fp16_keys = self.manifest["fp16"]
        packed_keys = self.manifest["packed"]

        # Embeddings stay FP16; the engine converts rows on lookup.
        emb = self._load_file("model.embed_tokens.weight", ".fp16", np.uint16)
        self.lib.model_set_embed(self.model, self._keep(emb))
        print(f" Embeddings: {emb.nbytes/1e6:.1f} MB")

        # LM head (FP16 matvec on the C side).
        lm = self._load_file("lm_head.weight", ".fp16", np.uint16)
        od, id_ = fp16_keys["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, self._keep(lm), od, id_)
        print(f" LM head: {lm.nbytes/1e6:.1f} MB")

        # Final norm: stored as fp16 on disk, engine expects float32.
        # (Previous revision loaded this file twice, once with the wrong
        # dtype reinterpretation; the dead first load is removed.)
        fn = self._load_file("model.norm.weight", ".fp16", np.float16).astype(np.float32)
        self.lib.model_set_final_norm(self.model, self._keep(fn))

        n_layers = self.config["num_hidden_layers"]
        for l in range(n_layers):
            pfx = f"model.layers.{l}"

            # Per-layer RMSNorm weights (fp16 on disk -> float32).
            in_f = self._load_file(f"{pfx}.input_layernorm.weight", ".fp16",
                                   np.float16).astype(np.float32)
            pn_f = self._load_file(f"{pfx}.post_attention_layernorm.weight", ".fp16",
                                   np.float16).astype(np.float32)
            self.lib.layer_set_norms(self.model, l, self._keep(in_f), self._keep(pn_f))

            # Optional Q/K/V biases (present for Qwen-style checkpoints).
            qb_key = f"{pfx}.self_attn.q_proj.bias"
            if qb_key in fp16_keys:
                qb = self._load_file(qb_key, ".fp16", np.float16).astype(np.float32)
                kb = self._load_file(f"{pfx}.self_attn.k_proj.bias", ".fp16",
                                     np.float16).astype(np.float32)
                vb = self._load_file(f"{pfx}.self_attn.v_proj.bias", ".fp16",
                                     np.float16).astype(np.float32)
                self.lib.layer_set_bias(self.model, l,
                                        self._keep(qb), self._keep(kb), self._keep(vb))
            else:
                self.lib.layer_set_bias(self.model, l, None, None, None)

            # 7 packed linears: q,k,v,o,gate,up,down — each contributes
            # (mags, signs, scales, row_maxmag, out_dim, in_dim) to one call.
            args = []
            for name in ['self_attn.q_proj','self_attn.k_proj','self_attn.v_proj',
                         'self_attn.o_proj','mlp.gate_proj','mlp.up_proj','mlp.down_proj']:
                key = f"{pfx}.{name}.weight"
                od, id_ = packed_keys[key]
                mags = self._load_file(key, ".mags", np.uint8)
                signs = self._load_file(key, ".signs", np.uint64)
                scales = self._load_file(key, ".scales", np.float32)
                rmm = self._load_file(key, ".rmm", np.uint8)
                args.extend([self._keep(mags), self._keep(signs),
                             self._keep(scales), self._keep(rmm), od, id_])

            self.lib.layer_set_linears(self.model, l, *args)

            if (l+1) % 7 == 0 or l == n_layers-1:
                print(f" Loaded {l+1}/{n_layers} layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self.arrays)
        print(f"\nModel loaded in {dt:.1f}s, {total/1e6:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=100, temperature=0.6, top_p=0.9, eos_id=151643):
        """Run C-side generation from *token_ids*.

        Returns ``(tokens, n_generated, seconds)``.  The KV cache is reset
        first, so each call is an independent generation.
        """
        prompt = (c_int * len(token_ids))(*token_ids)
        output = (c_int * max_new_tokens)()
        self.lib.model_reset_cache(self.model)
        t0 = time.time()
        n = self.lib.generate(self.model, prompt, len(token_ids),
                              output, max_new_tokens, c_float(temperature),
                              c_float(top_p), eos_id)
        dt = time.time() - t0
        tokens = [output[i] for i in range(n)]
        return tokens, n, dt
+
115
+
116
+ if __name__ == "__main__":
117
+ from transformers import AutoTokenizer
118
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-packed"
119
+ tok_dir = sys.argv[2] if len(sys.argv) > 2 else "deepseek-r1-1.5b-hf"
120
+
121
+ print("Loading tokenizer...")
122
+ tok = AutoTokenizer.from_pretrained(tok_dir, trust_remote_code=True)
123
+ print("Loading packed unary engine...")
124
+ engine = PackedEngine(model_dir, "./packed_engine.so")
125
+
126
+ prompts = ["What is 2+2?", "Explain gravity in one sentence.", "Write a haiku about snow."]
127
+ for prompt in prompts:
128
+ msgs = [{"role": "user", "content": prompt}]
129
+ ids = tok.apply_chat_template(msgs, add_generation_prompt=True)
130
+ tokens, n, dt = engine.generate(ids, max_new_tokens=100, temperature=0.6)
131
+ text = tok.decode(tokens, skip_special_tokens=False)
132
+ print(f"\n[{prompt}] ({n} tok, {dt:.1f}s, {n/dt:.1f} tok/s)")
133
+ print(text[:300])
134
+ print("---")
proper_unary ADDED
Binary file (26 kB). View file
 
proper_unary.c ADDED
@@ -0,0 +1,563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PROPER UNARY — ONE QUANTUM, NO SCALES
3
+ *
4
+ * Every single bit in the entire system has weight = 1 quantum.
5
+ * The quantum is set ONCE for the whole model.
6
+ * There are NO per-vector scales. NO per-row scales.
7
+ *
8
+ * The number 5.0 with quantum=0.1 is stored as 50 ones.
9
+ * The number 5.0 with quantum=0.01 is stored as 500 ones.
10
+ * More precision = more bits. That's the tradeoff.
11
+ *
12
+ * ADDITION = CONCATENATION. Always. No exceptions.
13
+ * Because every bit everywhere means the same thing.
14
+ *
15
+ * MATMUL: y[i] = sum_j W[i][j] * x[j]
16
+ * = sum over all (w_slot, x_slot) pairs:
17
+ * popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
18
+ * - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
19
+ * = quantum² * integer_count
20
+ *
21
+ * Output quantum = input_quantum² (magnitude grows)
22
+ * Or we pick output quantum = input_quantum and accept
23
+ * that the integer count includes the scaling.
24
+ *
25
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
26
+ */
27
+
28
+ #define _POSIX_C_SOURCE 199309L
29
+ #include <immintrin.h>
30
+ #include <omp.h>
31
+ #include <stdint.h>
32
+ #include <stdlib.h>
33
+ #include <string.h>
34
+ #include <math.h>
35
+ #include <stdio.h>
36
+ #include <time.h>
37
+
38
+ /* ============================================================
39
+ * PROPER UNARY VECTOR
40
+ * Every bit = 1 quantum. No local scale.
41
+ * ============================================================ */
42
+ typedef struct {
43
+ uint64_t *sign; /* [chunks] */
44
+ uint64_t *slots; /* [n_slots * chunks] */
45
+ int dim;
46
+ int chunks;
47
+ int n_slots;
48
+ int cap; /* max slots allocated */
49
+ } UVec;
50
+
51
+ /* Proper unary matrix — same quantum as vectors */
52
+ typedef struct {
53
+ uint64_t *sign; /* [rows * chunks] */
54
+ uint64_t *slots; /* [K * rows * chunks] */
55
+ int rows, cols, chunks, K;
56
+ } UMat;
57
+
58
+ /* Global system quantum */
59
+ typedef struct {
60
+ float quantum; /* every bit = this much */
61
+ /* quantum² is the matmul output unit */
62
+ } USystem;
63
+
64
+ /* ============================================================
65
+ * ALLOC
66
+ * ============================================================ */
67
+ UVec* uv_new(int dim, int cap) {
68
+ UVec *v = (UVec *)calloc(1, sizeof(UVec));
69
+ v->dim = dim;
70
+ v->chunks = (dim + 63) / 64;
71
+ v->n_slots = 0;
72
+ v->cap = cap;
73
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
74
+ v->slots = (uint64_t *)aligned_alloc(64, (size_t)cap * v->chunks * sizeof(uint64_t));
75
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
76
+ memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t));
77
+ return v;
78
+ }
79
+
80
+ UMat* um_new(int rows, int cols, int K) {
81
+ UMat *m = (UMat *)calloc(1, sizeof(UMat));
82
+ m->rows = rows; m->cols = cols; m->K = K;
83
+ m->chunks = (cols + 63) / 64;
84
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
85
+ m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
86
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
87
+ memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
88
+ return m;
89
+ }
90
+
91
/* Destructors: free bit planes then the struct itself; NULL-safe. */
void uv_free(UVec *v) { if(v){free(v->sign);free(v->slots);free(v);} }
void um_free(UMat *m) { if(m){free(m->sign);free(m->slots);free(m);} }
93
+
94
+ /* ============================================================
95
+ * QUANTIZE: float → proper unary
96
+ *
97
+ * Given global quantum q:
98
+ * magnitude = round(|value| / q)
99
+ * That many slots get bit set.
100
+ *
101
+ * NO per-vector absmax. NO local scale.
102
+ * Values that exceed K are clipped.
103
+ * ============================================================ */
104
/* Encode floats into proper unary under the single global `quantum`.
 * For element i: sign bit set when x[i] < 0; magnitude is
 * round(|x[i]| / quantum) clipped to K; that many slot planes get bit i
 * set.  No per-vector absmax, no local scale. */
void uv_from_float(UVec *v, const float *x, int K, float quantum) {
    int dim = v->dim, chunks = v->chunks;
    v->n_slots = K;

    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int i = 0; i < dim; i++) {
        int c = i / 64;                    /* 64-bit chunk holding bit i */
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) v->sign[c] |= bit;

        /* round-to-nearest in quantum units, saturate at K slots */
        int mag = (int)(fabsf(x[i]) * inv_q + 0.5f);
        if (mag > K) mag = K; /* clip */
        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}
124
+
125
+ void uv_to_float(const UVec *v, float *out, float quantum) {
126
+ int dim = v->dim, chunks = v->chunks;
127
+
128
+ for (int i = 0; i < dim; i++) {
129
+ int c = i / 64;
130
+ uint64_t bit = 1ULL << (i % 64);
131
+
132
+ int mag = 0;
133
+ for (int s = 0; s < v->n_slots; s++)
134
+ if (v->slots[(size_t)s * chunks + c] & bit)
135
+ mag++;
136
+
137
+ out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum;
138
+ }
139
+ }
140
+
141
/* Encode a row-major float matrix into proper unary with the same global
 * `quantum` as vectors.  Slot planes are laid out [slot][row][chunk] so
 * the matmul can walk one plane of one row contiguously.  Magnitudes are
 * rounded to the nearest quantum and clipped to K. */
void um_from_float(UMat *m, const float *data, float quantum) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        uint64_t *rs = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;                 /* chunk holding column j's bit */
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* round-to-nearest in quantum units, saturate at K */
            int mag = (int)(fabsf(row[j]) * inv_q + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
164
+
165
+ /* ============================================================
166
+ * CONCATENATION = ADDITION
167
+ *
168
+ * Since every bit everywhere = same quantum,
169
+ * appending slots IS adding magnitudes. Period.
170
+ *
171
+ * Sign handling: for elements where signs differ,
172
+ * cancel bits from existing slots.
173
+ * ============================================================ */
174
/* dst += src, expressed as slot concatenation.  Valid because every bit in
 * the system carries the same quantum: appending a source slot plane IS
 * adding one quantum to every element whose bit it carries.
 *
 * Sign handling per element: where dst and src signs agree the source bit
 * is appended into a fresh slot; where they differ, the source bit cancels
 * a matching bit out of dst's existing slots (searched newest-first), and
 * any remainder flips dst's sign bit and is appended.
 *
 * On slot-capacity overflow the remaining source slots are dropped with a
 * diagnostic print (NOTE(review): silent truncation of the sum). */
void uv_concat(UVec *dst, const UVec *src) {
    int chunks = dst->chunks;

    for (int s = 0; s < src->n_slots; s++) {
        if (dst->n_slots >= dst->cap) {
            printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
            return;
        }

        const uint64_t *src_slot = src->slots + (size_t)s * chunks;
        uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t sb = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: straight append */
            uint64_t add = sb & agree;

            /* Different sign: cancel from existing slots, newest first */
            uint64_t cancel = sb & disagree;
            for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
                uint64_t *ds = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *ds & cancel;
                *ds &= ~overlap;
                cancel &= ~overlap;
            }
            /* Leftover cancel = src magnitude exceeds dst, flip sign */
            if (cancel) {
                dst->sign[c] ^= cancel;
                add |= cancel;
            }

            new_slot[c] = add;
        }

        /* Only commit the new slot if it actually carries bits. */
        int any = 0;
        for (int c = 0; c < chunks && !any; c++)
            if (new_slot[c]) any = 1;
        if (any) dst->n_slots++;
    }
}
219
+ /* ============================================================
220
+ * MATMUL: y = M @ x
221
+ *
222
+ * Output unit = quantum² (one quantum from weight × one from activation)
223
+ * The integer accumulator directly gives the value in units of quantum².
224
+ *
225
+ * To keep everything in the same quantum system:
226
+ * y_float[i] = acc * quantum²
227
+ * Then requantize to unary with the SAME global quantum.
228
+ * y_mag[i] = acc * quantum² / quantum = acc * quantum
229
+ *
230
+ * ============================================================ */
231
/* y = M @ x entirely in the shared quantum system.
 *
 * Each (weight plane, activation plane) pair contributes
 * popcount(w AND x AND same_sign) - popcount(w AND x AND diff_sign)
 * to an integer accumulator per output row; the raw count is in units of
 * quantum^2.  The result is rescaled back into single-quantum units
 * (val = count * quantum^2 / quantum = count * quantum), clipped to K_out
 * slots, and re-encoded as unary.  Cost is O(wK * xK * chunks) per row. */
void uv_matmul(
    const UMat *M, const UVec *x,
    UVec *y, int K_out, float quantum
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    float q2 = quantum * quantum;   /* NOTE(review): computed but unused below */

    y->n_slots = K_out;
    memset(y->sign, 0, y->chunks * sizeof(uint64_t));
    memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));

    /* Integer dot products, one row per thread chunk. */
    int *acc = (int *)aligned_alloc(64, out_dim * sizeof(int));
    uint8_t *neg = (uint8_t *)calloc(out_dim, 1);

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long a = 0;

        for (int c = 0; c < chunks; c++) {
            /* Per-chunk sign agreement masks between row i and x. */
            uint64_t same = ~(w_sign_row[c] ^ x->sign[c]);
            uint64_t diff = w_sign_row[c] ^ x->sign[c];

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    a += __builtin_popcountll(active & same)
                       - __builtin_popcountll(active & diff);
                }
            }
        }

        /* a is in units of quantum² per quantum = a * quantum gives magnitude in quantums */
        float val = (float)a * quantum;
        int mag = (int)(fabsf(val) + 0.5f);   /* round, then saturate at K_out */
        if (mag > K_out) mag = K_out;
        acc[i] = mag;
        neg[i] = (val < 0.0f) ? 1 : 0;
    }

    /* Encode directly to unary — no float intermediate */
    for (int i = 0; i < out_dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);
        if (neg[i]) y->sign[c] |= bit;
        for (int s = 0; s < acc[i]; s++)
            y->slots[(size_t)s * y->chunks + c] |= bit;
    }

    free(acc); free(neg);
}
289
+
290
+ /* ============================================================
291
+ * RMSNORM — resets slot count, keeps same quantum
292
+ * ============================================================ */
293
+ void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
294
+ int dim = x->dim;
295
+ float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
296
+ uv_to_float(x, xf, quantum);
297
+
298
+ float ss = 0.0f;
299
+ for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
300
+ float rms = 1.0f / sqrtf(ss / dim + eps);
301
+ for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];
302
+
303
+ uv_from_float(out, xf, K_out, quantum);
304
+ free(xf);
305
+ }
306
+
307
+ /* ============================================================
308
+ * TESTS
309
+ * ============================================================ */
310
+
311
/* Verifies that uv_concat on two vectors sharing one global quantum acts
 * as exact addition: decoding the concatenated vector must match the
 * element-wise float sum A+B up to quantization error. */
void test_concat_correct() {
    printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

    float quantum = 0.25f; /* every bit = 0.25 */
    int dim = 8;

    /* A = [3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0]
     * In quantum=0.25: magnitudes = [12, 8, 20, 4, 0, 16, 8, 28]
     * Need K >= 28 slots to hold 7.0
     */
    float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
    float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
    float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};

    int K = 32;
    /* Capacity 128 slots: concat grows the slot count, so leave headroom. */
    UVec *a = uv_new(dim, 128);
    UVec *b = uv_new(dim, 128);

    uv_from_float(a, a_vals, K, quantum);
    uv_from_float(b, b_vals, K, quantum);

    /* Round-trip both inputs to show encoding error before the concat. */
    float a_rec[8], b_rec[8];
    uv_to_float(a, a_rec, quantum);
    uv_to_float(b, b_rec, quantum);

    printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum);
    printf("A original: "); for(int i=0;i<8;i++) printf("%6.2f ",a_vals[i]); printf("\n");
    printf("A unary: "); for(int i=0;i<8;i++) printf("%6.2f ",a_rec[i]); printf("\n");
    printf("B original: "); for(int i=0;i<8;i++) printf("%6.2f ",b_vals[i]); printf("\n");
    printf("B unary: "); for(int i=0;i<8;i++) printf("%6.2f ",b_rec[i]); printf("\n\n");

    printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots);
    uv_concat(a, b);
    printf("After concat: %d slots\n\n", a->n_slots);

    float result[8];
    uv_to_float(a, result, quantum);

    printf("Expected A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]); printf("\n");
    printf("Concat A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",result[i]); printf("\n");
    printf("Error: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]-result[i]); printf("\n");

    uv_free(a); uv_free(b);
}
355
+
356
/* Repeatedly concats the same delta vector onto an accumulator and
 * tracks drift against a float reference: the max error should stay at
 * quantization level across all 5 additions. */
void test_chain_concat() {
    printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

    float quantum = 0.1f;
    int dim = 4;
    int K = 64;

    float vals[] = {1.0, -2.0, 3.0, -0.5};
    /* 512-slot capacity: every concat can grow the accumulator. */
    UVec *acc = uv_new(dim, 512);
    uv_from_float(acc, vals, K, quantum);

    printf("Start: ");
    float tmp[4];
    uv_to_float(acc, tmp, quantum);
    for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
    printf(" (%d slots)\n", acc->n_slots);

    float expected[] = {1.0, -2.0, 3.0, -0.5};

    for (int step = 0; step < 5; step++) {
        float add_vals[] = {0.5, 0.3, -1.0, 0.7};
        UVec *delta = uv_new(dim, K);
        uv_from_float(delta, add_vals, K, quantum);

        uv_concat(acc, delta);

        /* Float reference accumulates the same delta. */
        for (int i = 0; i < 4; i++) expected[i] += add_vals[i];

        uv_to_float(acc, tmp, quantum);
        printf(" +[0.5,0.3,-1.0,0.7] = ");
        for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]);
        printf(" (%d slots) expect:", acc->n_slots);
        for(int i=0;i<4;i++) printf("%6.2f ",expected[i]);

        /* Check error */
        float max_err = 0;
        for(int i=0;i<4;i++) {
            float e = fabsf(expected[i] - tmp[i]);
            if (e > max_err) max_err = e;
        }
        printf(" err=%.2f\n", max_err);

        uv_free(delta);
    }

    uv_free(acc);
}
403
+
404
/* Benchmarks and validates uv_matmul against a float reference matvec:
 * builds a random 512x256 matrix and input, derives a global quantum
 * from the data absmax, then reports cosine similarity, SNR in dB, and
 * wall-clock time of the unary result vs. the float reference. */
void test_matmul() {
    printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

    int rows = 512, cols = 256;
    int wK = 32, xK = 32;

    srand(42); /* fixed seed: deterministic data */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));

    /* Small values so magnitudes fit in K slots */
    for (size_t i = 0; i < (size_t)rows * cols; i++)
        Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    for (int i = 0; i < cols; i++)
        xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    /* Float ground truth: y_ref = Mf @ xf */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    /* Find quantum that fits the data range */
    float data_max = 0;
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float a = fabsf(Mf[i]);
        if (a > data_max) data_max = a;
    }
    for (int i = 0; i < cols; i++) {
        float a = fabsf(xf[i]);
        if (a > data_max) data_max = a;
    }
    float quantum = data_max / wK; /* wK slots span the full absmax */

    printf("Data range: [-%.2f, %.2f]\n", data_max, data_max);
    printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK);
    printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK);

    UMat *M = um_new(rows, cols, wK);
    UVec *x = uv_new(cols, xK);

    um_from_float(M, Mf, quantum);
    uv_from_float(x, xf, xK, quantum);

    /* Output needs enough K for the matmul result range */
    float ymax = 0;
    for (int i = 0; i < rows; i++) {
        float a = fabsf(y_ref[i]);
        if (a > ymax) ymax = a;
    }
    int K_out = (int)(ymax / quantum + 1);
    if (K_out > 4096) K_out = 4096; /* cap the output slot budget */
    printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out);

    UVec *y = uv_new(rows, K_out);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    uv_matmul(M, x, y, K_out, quantum);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

    float *yf = (float *)malloc(rows * sizeof(float));
    uv_to_float(y, yf, quantum);

    /* Quality metrics: cosine similarity and signal-to-noise ratio. */
    float dot = 0, na = 0, nb = 0, noise = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * yf[i];
        na += y_ref[i] * y_ref[i];
        nb += yf[i] * yf[i];
        float e = y_ref[i] - yf[i]; noise += e * e;
    }
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf("\nCosine: %.6f\n", cosine);
    printf("SNR: %.1f dB\n", snr);
    printf("Time: %.1f ms\n", ms);

    printf("\nFirst 10 values:\n");
    printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
    for (int i = 0; i < 10; i++)
        printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]);

    um_free(M); uv_free(x); uv_free(y);
    free(Mf); free(xf); free(y_ref); free(yf);
}
489
+
490
/* Simulates a transformer residual stream: start from a random
 * embedding, concat 10 random sublayer deltas, and at each step compare
 * the growing unary accumulator against an exact float reference
 * (cosine similarity and max absolute error per step). */
void test_residual_chain() {
    printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

    float quantum = 0.05f;
    int dim = 1024;
    int K = 128; /* fits values up to 6.4 */

    srand(123); /* fixed seed: deterministic data */
    float *embed = (float *)malloc(dim * sizeof(float));
    for (int i = 0; i < dim; i++)
        embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

    /* Float reference: accumulate residuals */
    float *ref = (float *)malloc(dim * sizeof(float));
    memcpy(ref, embed, dim * sizeof(float));

    /* Unary: grow via concat */
    int total_cap = K + 10 * K; /* room for 10 concat operations */
    UVec *residual = uv_new(dim, total_cap);
    uv_from_float(residual, embed, K, quantum);

    printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim);
    printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

    for (int step = 0; step < 10; step++) {
        /* Random sublayer output, smaller than the embedding range. */
        float *delta = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++)
            delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

        /* Float reference */
        for (int i = 0; i < dim; i++) ref[i] += delta[i];

        /* Unary: concat */
        UVec *d = uv_new(dim, K);
        uv_from_float(d, delta, K, quantum);
        uv_concat(residual, d);

        /* Compare */
        float *rec = (float *)malloc(dim * sizeof(float));
        uv_to_float(residual, rec, quantum);

        float dot = 0, na = 0, nb = 0, max_err = 0;
        for (int i = 0; i < dim; i++) {
            dot += ref[i] * rec[i];
            na += ref[i] * ref[i];
            nb += rec[i] * rec[i];
            float e = fabsf(ref[i] - rec[i]);
            if (e > max_err) max_err = e;
        }
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

        printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err);

        uv_free(d); free(delta); free(rec);
    }

    uv_free(residual);
    free(embed); free(ref);
}
549
+
550
+ int main() {
551
+ printf("================================================\n");
552
+ printf(" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n");
553
+ printf(" Every bit = 1 quantum. Concat = Add.\n");
554
+ printf("================================================\n\n");
555
+
556
+ test_concat_correct();
557
+ test_chain_concat();
558
+ test_matmul();
559
+ test_residual_chain();
560
+
561
+ printf("\n=== DONE ===\n");
562
+ return 0;
563
+ }
pure_unary_engine.c ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PURE UNARY TRANSFORMER ENGINE
3
+ *
4
+ * ALL matrix multiplications use base-1 arithmetic:
5
+ * - Weights: unary encoded (sign + N magnitude planes)
6
+ * - Activations: unary encoded (sign + M magnitude planes)
7
+ * - Matmul = bitwise AND + popcount across plane pairs
8
+ * - Float only used for: RMSNorm, SiLU, Softmax, rescale, residual add
9
+ * - These are all O(dim) not O(dim²), so don't dominate
10
+ *
11
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
12
+ */
13
+
14
+ #include <immintrin.h>
15
+ #include <omp.h>
16
+ #include <stdint.h>
17
+ #include <stdlib.h>
18
+ #include <string.h>
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+ #include <time.h>
22
+
23
+ #define MAX_SEQ 4096
24
+ #define RMS_EPS 1e-6f
25
+
26
/* ============================================================
 * Unary vector: a quantized 1D activation or intermediate
 * ============================================================ */
typedef struct {
    uint64_t *sign; /* [chunks] */
    uint64_t *planes; /* [n_planes][chunks] */
    float scale; /* value of one plane bit (absmax / n_planes) */
    int dim; /* logical element count */
    int chunks; /* = (dim + 63) / 64 packed words per plane */
    int n_planes; /* thermometer magnitude levels */
} UnaryVec;

/* ============================================================
 * Config
 * ============================================================ */
typedef struct {
    int hidden; /* residual-stream width */
    int inter; /* MLP intermediate width */
    int n_heads; /* query heads */
    int n_kv_heads; /* key/value heads (GQA when < n_heads) */
    int head_dim; /* per-head dimension */
    int n_layers; /* transformer layer count */
    int vocab; /* vocabulary size */
    float rope_theta; /* RoPE base frequency */
    int tie_embeddings; /* nonzero: lm_head shares the embed table */
    int w_planes; /* weight quantization planes */
    int a_planes; /* activation quantization planes */
} Config;

/* Unary weight matrix */
typedef struct {
    uint64_t *sign_bits; /* [out_dim][chunks] per-row sign plane */
    uint64_t *mag_planes; /* [n_planes][out_dim][chunks] magnitudes */
    float *scales; /* [out_dim] per-row dequantization scale */
    int out_dim;
    int in_dim;
    int n_planes;
    int chunks; /* = (in_dim + 63) / 64 */
} UnaryWeight;

/* Transformer layer */
typedef struct {
    UnaryWeight q_proj, k_proj, v_proj, o_proj;
    UnaryWeight gate_proj, up_proj, down_proj;
    float *input_norm; /* pre-attention RMSNorm weights */
    float *post_norm; /* pre-MLP RMSNorm weights */
    float *q_norm, *k_norm; /* optional per-head QK-Norm weights (may be NULL) */
} Layer;

/* Full model */
typedef struct {
    Config cfg;
    uint16_t *embed; /* FP16 embedding table, [vocab][hidden] */
    Layer *layers;
    float *final_norm; /* final RMSNorm weights */

    /* KV cache (float - only O(seq × heads × dim) not O(dim²)) */
    float *k_cache;
    float *v_cache;

    /* Scratch - float buffers for non-matmul ops */
    float *hidden; /* residual stream */
    float *normed; /* after RMSNorm, before quantization */
    float *q_float;
    float *k_float;
    float *v_float;
    float *attn_out;
    float *gate_float;
    float *up_float;
    float *mlp_act; /* gate*up result before quantization */
    float *logits;
    float *attn_scores;

    /* Scratch - unary vectors for matmul inputs */
    UnaryVec uv_normed;
    UnaryVec uv_mlp_in;
    UnaryVec uv_mlp_act; /* for down_proj input */

    /* Output integer accumulators (avoid malloc per call) */
    int *acc_buf;
} Model;
107
+
108
+ /* ============================================================
109
+ * ACTIVATION QUANTIZATION: float -> unary
110
+ * Runs per-vector: one scale for entire vector
111
+ * O(dim) operation, not in the hot path
112
+ * ============================================================ */
113
/* Quantize a float vector to sign bits + thermometer magnitude planes.
 *
 * A single scale derived from the vector's absolute maximum covers all
 * elements; each magnitude is rounded to a plane count in [0, n_planes]
 * and plane p receives the element's bit iff p < count.  Outputs are
 * fully cleared first.  O(dim) — not in the matmul hot path. */
static void quantize_to_unary(
    const float *x, int dim, int n_planes,
    uint64_t *sign_out, uint64_t *planes_out, float *scale_out
) {
    const int chunks = (dim + 63) / 64;

    /* Per-vector absolute maximum; 1.0 fallback keeps the scale finite
     * for an all-zero input. */
    float peak = 0.0f;
    for (int e = 0; e < dim; e++) {
        const float a = fabsf(x[e]);
        if (a > peak) peak = a;
    }
    if (peak == 0.0f) peak = 1.0f;
    *scale_out = peak / n_planes;

    memset(sign_out, 0, chunks * sizeof(uint64_t));
    memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t));

    const float to_planes = n_planes / peak;
    for (int e = 0; e < dim; e++) {
        const int word = e / 64;
        const uint64_t one = 1ULL << (e % 64);

        /* Sign plane: bit set for negative elements. */
        if (x[e] < 0.0f)
            sign_out[word] |= one;

        /* Thermometer encode: round, clamp, set planes 0..count-1. */
        int count = (int)(fabsf(x[e]) * to_planes + 0.5f);
        if (count > n_planes) count = n_planes;
        for (int p = 0; p < count; p++)
            planes_out[(size_t)p * chunks + word] |= one;
    }
}
150
+
151
+ /* ============================================================
152
+ * PURE UNARY MATVEC: y = W @ x
153
+ *
154
+ * Both W and x are unary encoded.
155
+ * Inner loop is purely: AND + popcount
156
+ * Float multiply happens ONCE per output element (rescale)
157
+ * ============================================================ */
158
/* y = W @ x with both operands unary encoded.
 *
 * The inner loop is pure AND + popcount over every (weight plane,
 * activation plane) pair; sign agreement between the row's sign bits
 * and x's sign bits decides whether a pair adds or subtracts.  Exactly
 * one float multiply per output element rescales the integer count by
 * the row scale and the activation scale.  Rows are parallelized with
 * OpenMP (dynamic, 32-row chunks).
 *
 * Parameters:
 *   W                   unary weight matrix (row sign bits + mag planes)
 *   x_sign, x_planes    unary-encoded input (sign plane + magnitude planes)
 *   x_scale, x_n_planes input's dequant scale and plane count
 *   y_out               float output, length W->out_dim
 *   acc_buf             scratch for integer accumulators
 *                       NOTE(review): currently unused in this path —
 *                       the accumulator lives in a local; confirm intent.
 */
static void pure_unary_matvec(
    const UnaryWeight *W,
    const uint64_t *x_sign, const uint64_t *x_planes,
    float x_scale, int x_n_planes,
    float *y_out, /* float output for non-matmul ops */
    int *acc_buf /* scratch for integer accumulators */
) {
    int out_dim = W->out_dim;
    int chunks = W->chunks;
    int wp = W->n_planes;
    int xp = x_n_planes;

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks;

        /* Precompute same_sign mask for this row vs input */
        /* same_sign[c] = ~(w_sign[c] ^ x_sign[c]) */
        /* We compute this per-chunk inside the loop to avoid allocation */

        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x_sign[c];
            uint64_t same = ~(ws ^ xs); /* bits where signs agree */
            uint64_t diff = ws ^ xs;    /* bits where signs differ */

            for (int p = 0; p < wp; p++) {
                uint64_t w_mag = W->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_mag = x_planes[(size_t)q * chunks + c];
                    uint64_t active = w_mag & x_mag;

                    /* Count positive and negative contributions */
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;
                    acc += __builtin_popcountll(pos) - __builtin_popcountll(neg);
                }
            }
        }

        /* Single float rescale per output element */
        y_out[i] = (float)acc * W->scales[i] * x_scale;
    }
}
205
+
206
+ /* ============================================================
207
+ * FP16 embedding lookup (only used for embed/lm_head)
208
+ * ============================================================ */
209
/* Copy embedding row `token_id` from the FP16 table into `out` as
 * float32.  Main loop converts 16 halves at a time via AVX-512
 * (_mm512_cvtph_ps); the scalar tail broadcasts one half and converts
 * lane 0.  `hidden` is the row width in elements. */
static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) {
    const uint16_t *row = embed + (size_t)token_id * hidden;
    int i;
    for (i = 0; i + 16 <= hidden; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        __m512 fv = _mm512_cvtph_ps(h);
        _mm512_storeu_ps(out + i, fv);
    }
    /* Scalar tail for hidden not a multiple of 16. */
    for (; i < hidden; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        __m128 fv = _mm_cvtph_ps(hv);
        _mm_store_ss(out + i, fv);
    }
}
223
+
224
+ /* FP16 matvec for lm_head (vocab is huge, keep as FP16) */
225
/* y = w @ x where w is an FP16 [out_dim][in_dim] matrix and x is float.
 * Used only for the lm_head / vocab projection.  Each row accumulates
 * 16-wide FP16→FP32 FMA via AVX-512, reduces the vector register, then
 * finishes any remainder scalar-wise.  Rows parallelized with OpenMP. */
static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        /* Scalar tail: convert one half at a time through lane 0. */
        for (; j < in_dim; j++) {
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
247
+
248
+ /* ============================================================
249
+ * O(dim) operations - float is fine here, not the bottleneck
250
+ * ============================================================ */
251
+ static void rmsnorm(const float *x, const float *w, float *y, int dim) {
252
+ float ss = 0.0f;
253
+ for (int i = 0; i < dim; i++) ss += x[i] * x[i];
254
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
255
+ for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i];
256
+ }
257
+
258
/* Per-head RMSNorm used by QK-Norm: identical math to rmsnorm, applied
 * to one head's `dim`-long slice with that head's shared weights. */
static void rmsnorm_head(const float *x, const float *w, float *y, int dim) {
    /* RMSNorm for a single attention head */
    rmsnorm(x, w, y, dim);
}
262
+
263
+ static void silu_mul(const float *gate, const float *up, float *out, int n) {
264
+ for (int i = 0; i < n; i++)
265
+ out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i];
266
+ }
267
+
268
/* In-place accumulate: y += x, element-wise over n floats. */
static void vec_add(float *y, const float *x, int n) {
    for (int k = 0; k < n; k++) {
        y[k] = y[k] + x[k];
    }
}
271
+
272
+ static void apply_rope(float *vec, int pos, int dim, float theta) {
273
+ for (int i = 0; i < dim; i += 2) {
274
+ float freq = 1.0f / powf(theta, (float)i / dim);
275
+ float angle = pos * freq;
276
+ float c = cosf(angle), s = sinf(angle);
277
+ float v0 = vec[i], v1 = vec[i + 1];
278
+ vec[i] = v0 * c - v1 * s;
279
+ vec[i + 1] = v0 * s + v1 * c;
280
+ }
281
+ }
282
+
283
+ static void softmax(float *x, int n) {
284
+ float mx = x[0];
285
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
286
+ float sum = 0.0f;
287
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; }
288
+ float inv = 1.0f / sum;
289
+ for (int i = 0; i < n; i++) x[i] *= inv;
290
+ }
291
+
292
+ /* KV cache access */
293
/* Pointer to the head_dim-long K or V row for (layer, pos, kv_head).
 * Cache layout is [layer][pos][kv_head][head_dim] with pos bounded by
 * MAX_SEQ; the same helper indexes both k_cache and v_cache. */
static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
    return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
                    (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
}
297
+
298
+ /* ============================================================
299
+ * ALLOC unary vector scratch
300
+ * ============================================================ */
301
+ static void uv_alloc(UnaryVec *uv, int dim, int n_planes) {
302
+ int chunks = (dim + 63) / 64;
303
+ uv->dim = dim;
304
+ uv->chunks = chunks;
305
+ uv->n_planes = n_planes;
306
+ uv->sign = (uint64_t *)aligned_alloc(64, chunks * sizeof(uint64_t));
307
+ uv->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * chunks * sizeof(uint64_t));
308
+ uv->scale = 0.0f;
309
+ }
310
+
311
+ /* ============================================================
312
+ * ATTENTION (using pure unary for projections)
313
+ * ============================================================ */
314
/* One attention sublayer at sequence position `pos`.
 *
 * Input: m->normed (the RMSNorm'd residual).  Output: m->attn_out
 * (the O-projection result, length c->hidden).  Q/K/V and O use pure
 * unary matvecs; scores and the weighted V sum stay in float because
 * they are O(seq × head_dim), not O(dim²).  m->normed is clobbered as a
 * temporary for the O projection. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads; /* GQA group size */

    /* Quantize normed hidden to unary */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
                      m->uv_normed.sign, m->uv_normed.planes, &m->uv_normed.scale);

    /* Q, K, V projections - PURE UNARY */
    pure_unary_matvec(&layer->q_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->q_float, m->acc_buf);
    pure_unary_matvec(&layer->k_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->k_float, m->acc_buf);
    pure_unary_matvec(&layer->v_proj,
        m->uv_normed.sign, m->uv_normed.planes, m->uv_normed.scale, c->a_planes,
        m->v_float, m->acc_buf);

    /* QK-Norm (per head) — optional, skipped when the weights are NULL */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm_head(m->q_float + h * c->head_dim, layer->q_norm,
                         m->q_float + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm_head(m->k_float + h * c->head_dim, layer->k_norm,
                         m->k_float + h * c->head_dim, c->head_dim);
    }

    /* RoPE — applied after QK-Norm, before caching K */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k_float + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Store K, V to cache */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k_float + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v_float + h * c->head_dim, c->head_dim * sizeof(float));
    }

    /* Attention scores + weighted sum (O(seq × head_dim), not O(dim²)) */
    float scale = 1.0f / sqrtf((float)c->head_dim);
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        int kv_h = h / heads_per_kv; /* GQA: map query head to its KV head */
        float *q_head = m->q_float + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        /* Causal: only positions 0..pos are visible. */
        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            float dot = 0.0f;
            for (int d = 0; d < c->head_dim; d++)
                dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue; /* skip negligible weights */
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            for (int d = 0; d < c->head_dim; d++)
                out_head[d] += w * v_cached[d];
        }
    }

    /* O projection - quantize attn_out, then pure unary.
     * NOTE(review): uv_attn is heap-allocated per call — a candidate for
     * preallocated scratch like uv_normed. */
    int o_in = c->n_heads * c->head_dim;
    UnaryVec uv_attn;
    uv_alloc(&uv_attn, o_in, c->a_planes);
    quantize_to_unary(m->attn_out, o_in, c->a_planes,
                      uv_attn.sign, uv_attn.planes, &uv_attn.scale);

    /* Temp buffer for O projection output */
    float *o_out = m->normed; /* reuse normed buffer */
    pure_unary_matvec(&layer->o_proj,
        uv_attn.sign, uv_attn.planes, uv_attn.scale, c->a_planes,
        o_out, m->acc_buf);

    /* Copy o_out to where caller expects it (normed acts as temp) */
    memcpy(m->attn_out, o_out, c->hidden * sizeof(float));

    free(uv_attn.sign);
    free(uv_attn.planes);
}
407
+
408
+ /* ============================================================
409
+ * MLP (using pure unary for all projections)
410
+ * ============================================================ */
411
/* One gated-MLP sublayer: down(SiLU(gate(x)) * up(x)).
 *
 * Input: m->normed (the post-attention RMSNorm'd residual).
 * Output: m->normed is overwritten with the down-projection result,
 * which the caller adds to the residual stream.  All three projections
 * run in pure unary; only SiLU*up is float (O(inter)). */
static void mlp(Model *m, int layer_idx) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];

    /* Quantize normed input */
    quantize_to_unary(m->normed, c->hidden, c->a_planes,
                      m->uv_mlp_in.sign, m->uv_mlp_in.planes, &m->uv_mlp_in.scale);

    /* Gate and Up projections - PURE UNARY */
    pure_unary_matvec(&layer->gate_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->gate_float, m->acc_buf);
    pure_unary_matvec(&layer->up_proj,
        m->uv_mlp_in.sign, m->uv_mlp_in.planes, m->uv_mlp_in.scale, c->a_planes,
        m->up_float, m->acc_buf);

    /* SiLU(gate) * up - O(inter) float op */
    silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter);

    /* Quantize for down projection */
    quantize_to_unary(m->mlp_act, c->inter, c->a_planes,
                      m->uv_mlp_act.sign, m->uv_mlp_act.planes, &m->uv_mlp_act.scale);

    /* Down projection - PURE UNARY */
    pure_unary_matvec(&layer->down_proj,
        m->uv_mlp_act.sign, m->uv_mlp_act.planes, m->uv_mlp_act.scale, c->a_planes,
        m->normed, m->acc_buf); /* reuse normed as output */
}
439
+
440
+ /* ============================================================
441
+ * FORWARD ONE TOKEN
442
+ * ============================================================ */
443
/* Run one token through the full stack at sequence position `pos` and
 * return a pointer to m->logits (length c->vocab).
 *
 * NOTE(review): logits are only recomputed when cfg.tie_embeddings is
 * set (lm_head shares the embed table); with untied embeddings this
 * returns whatever m->logits last held — confirm whether a separate
 * lm_head path is planned. */
float* forward_token(Model *m, int token_id, int pos) {
    Config *c = &m->cfg;

    embed_token(m->embed, token_id, m->hidden, c->hidden);

    for (int l = 0; l < c->n_layers; l++) {
        /* Pre-attention norm */
        rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden);

        /* Attention (quantizes normed internally, outputs to attn_out) */
        attention(m, l, pos);
        vec_add(m->hidden, m->attn_out, c->hidden);

        /* Post-attention norm */
        rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden);

        /* MLP (quantizes normed internally, outputs to normed) */
        mlp(m, l);
        vec_add(m->hidden, m->normed, c->hidden);
    }

    /* Final norm */
    rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden);

    /* LM head - FP16 for now (vocab projection is O(vocab × hidden), not repeated per-layer) */
    if (c->tie_embeddings) {
        fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden);
    }

    return m->logits;
}
474
+
475
+ /* ============================================================
476
+ * SAMPLING
477
+ * ============================================================ */
478
/* Nucleus (top-p) sampling with a hard top-40 cap.
 *
 * Mutates `logits` in place (temperature scaling + softmax), then runs
 * a partial selection sort that peels off the most probable tokens
 * until their cumulative mass reaches top_p (or 40 tokens, whichever
 * comes first), renormalizes over the kept set, and draws one token
 * with rand().  Returns the chosen token id. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    int n_keep = 0;
    float cum = 0.0f;
    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: after each pass probs[0..n_keep-1] holds
     * the n_keep largest probabilities in descending order. */
    while (cum < top_p && n_keep < vocab) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp;
        int ti = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = ti;
        cum += probs[n_keep];
        n_keep++;
        if (n_keep >= 40) break; /* top-k safety cap */
    }

    /* Draw from the kept set, renormalized to its own mass. */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0]; /* fallback: most probable token */
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i];
        if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
515
+
516
/* Prefill the prompt, then autoregressively sample up to
 * max_new_tokens tokens into out_tokens.  temperature <= 0 selects
 * greedy argmax; otherwise sample_top_p is used.  Generation stops
 * after emitting eos_token.  Returns the number of tokens written
 * (the eos token, if hit, is included in the count).
 *
 * NOTE(review): srand(time(NULL)) reseeds on every call; the sampling
 * loop also reads m->logits before the first generated forward, so a
 * zero-length prompt would sample from stale logits — confirm callers
 * always pass prompt_len >= 1. */
int generate(
    Model *m,
    const int *prompt_ids, int prompt_len,
    int *out_tokens, int max_new_tokens,
    float temperature, float top_p, int eos_token
) {
    srand(time(NULL));

    /* Prefill: run every prompt token to fill the KV cache. */
    for (int i = 0; i < prompt_len; i++)
        forward_token(m, prompt_ids[i], i);

    int pos = prompt_len;
    int generated = 0;

    for (int t = 0; t < max_new_tokens; t++) {
        int next;
        if (temperature <= 0) {
            /* Greedy: argmax over the vocabulary. */
            next = 0;
            for (int i = 1; i < m->cfg.vocab; i++)
                if (m->logits[i] > m->logits[next]) next = i;
        } else {
            next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p);
        }

        out_tokens[t] = next;
        generated++;
        if (next == eos_token) break;

        /* Feed the sampled token back to produce the next logits. */
        forward_token(m, next, pos);
        pos++;
    }
    return generated;
}
549
+
550
+ /* ============================================================
551
+ * ALLOCATION
552
+ * ============================================================ */
553
/* Allocate a Model plus all per-token scratch buffers and the KV cache.
 *
 * w_planes / a_planes: number of unary bit-planes used for weights and
 * activations respectively (stored in the config; the matvec kernels read
 * them from there).  All other parameters mirror the HF config fields.
 *
 * Scratch buffers are 64-byte aligned for SIMD use.  NOTE(review):
 * aligned_alloc requires the size to be a multiple of the alignment (C11);
 * that holds for the dimensions this engine is used with, but is not
 * checked here.  Allocation failures are also not checked.
 * Returns the model; weights are attached later via the setter functions.
 */
Model* model_alloc(
    int w_planes, int a_planes,
    int hidden, int inter, int n_heads, int n_kv_heads,
    int head_dim, int n_layers, int vocab,
    float rope_theta, int tie_embeddings
) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    Config *c = &m->cfg;
    c->hidden = hidden; c->inter = inter;
    c->n_heads = n_heads; c->n_kv_heads = n_kv_heads;
    c->head_dim = head_dim; c->n_layers = n_layers;
    c->vocab = vocab; c->rope_theta = rope_theta;
    c->tie_embeddings = tie_embeddings;
    c->w_planes = w_planes; c->a_planes = a_planes;

    m->layers = (Layer *)calloc(n_layers, sizeof(Layer));

    /* KV cache is float, one slab per layer: MAX_SEQ x n_kv_heads x head_dim. */
    size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* Per-token scratch.  normed/acc_buf are sized to the larger of the two
     * dimensions they serve so they can be shared across stages. */
    m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->normed = (float *)aligned_alloc(64, (inter > hidden ? inter : hidden) * sizeof(float));
    m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
    m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->up_float = (float *)aligned_alloc(64, inter * sizeof(float));
    m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float));
    m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));
    m->acc_buf = (int *)aligned_alloc(64, (inter > vocab ? inter : vocab) * sizeof(int));

    /* Unary vector scratch */
    uv_alloc(&m->uv_normed, hidden, a_planes);
    uv_alloc(&m->uv_mlp_in, hidden, a_planes);
    uv_alloc(&m->uv_mlp_act, inter, a_planes);

    size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);
    printf("PURE UNARY ENGINE\n");
    printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n",
           hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
    printf(" Weight planes: %d, Activation planes: %d\n", w_planes, a_planes);
    printf(" Plane pairs per matvec element: %d\n", w_planes * a_planes);
    printf(" KV cache: %zu MB\n", kv_mb);
    printf(" Float ops: RMSNorm, SiLU, Softmax, RoPE, residual (all O(dim))\n");
    printf(" Integer ops: ALL matmuls (O(dim²) — the actual bottleneck)\n");

    return m;
}
605
+
606
+ /* Weight setters (same interface as v2) */
607
/* Borrow (not copy) the FP16 embedding table; presumably vocab x hidden,
 * row-major — confirm against the converter.  Caller keeps it alive. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
608
/* Copy the final RMSNorm weight vector (cfg.hidden floats) into the model. */
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
609
+
610
+ void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
611
+ m->layers[l].input_norm = in_norm;
612
+ m->layers[l].post_norm = post_norm;
613
+ }
614
+
615
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
616
+ m->layers[l].q_norm = q_norm;
617
+ m->layers[l].k_norm = k_norm;
618
+ }
619
+
620
+ static void init_unary_weight(
621
+ UnaryWeight *uw,
622
+ uint64_t *sign, uint64_t *planes, float *scales,
623
+ int out_dim, int in_dim, int n_planes
624
+ ) {
625
+ uw->sign_bits = sign;
626
+ uw->mag_planes = planes;
627
+ uw->scales = scales;
628
+ uw->out_dim = out_dim;
629
+ uw->in_dim = in_dim;
630
+ uw->n_planes = n_planes;
631
+ uw->chunks = (in_dim + 63) / 64;
632
+ }
633
+
634
+ void layer_set_linears(
635
+ Model *m, int l,
636
+ uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in,
637
+ uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in,
638
+ uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in,
639
+ uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in,
640
+ uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in,
641
+ uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in,
642
+ uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in,
643
+ int n_planes
644
+ ) {
645
+ init_unary_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes);
646
+ init_unary_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes);
647
+ init_unary_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes);
648
+ init_unary_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes);
649
+ init_unary_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes);
650
+ init_unary_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes);
651
+ init_unary_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes);
652
+ }
653
+
654
+ void model_reset_cache(Model *m) {
655
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
656
+ memset(m->k_cache, 0, kv_size * sizeof(float));
657
+ memset(m->v_cache, 0, kv_size * sizeof(float));
658
+ }
run_convert.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Convert DeepSeek-R1-1.5B safetensors weights to the ternary on-disk format.
# Linear projection weights are quantized to {-1, 0, +1} (pos/neg bitmasks +
# per-row scales); everything else is stored as raw FP16.
import os, json, numpy as np, time, sys
from pathlib import Path
from safetensors import safe_open
import torch  # required by safe_open(framework="pt") — presumably; confirm
sys.path.insert(0, "/root/ternary_engine")
from convert import quantize_weight_matrix

model_dir = "/root/ternary_engine/deepseek-r1-1.5b-hf"
output_dir = "/root/ternary_engine/deepseek-r1-1.5b-ternary"
alpha = 0.7  # quantization threshold factor passed to quantize_weight_matrix

os.makedirs(output_dir, exist_ok=True)

# Load every tensor of the checkpoint into memory as FP32 numpy arrays.
tensors = {}
for f in sorted(Path(model_dir).glob("*.safetensors")):
    print("Loading " + f.name)
    with safe_open(str(f), framework="pt") as st:
        for key in st.keys():
            tensors[key] = st.get_tensor(key).float().numpy()

print("Loaded " + str(len(tensors)) + " tensors")

# Hard-coded architecture description written alongside the weights.
config = {
    "hidden_size": 1536, "intermediate_size": 8960,
    "num_attention_heads": 12, "num_key_value_heads": 2,
    "num_hidden_layers": 28, "vocab_size": 151936,
    "head_dim": 128, "rope_theta": 1000000.0,
    "rms_norm_eps": 1e-6, "alpha": alpha,
}

ternary_manifest = {}
fp16_manifest = {}

# Only these 2-D projection weights are ternarized; all other tensors
# (embeddings, norms, biases) stay FP16.
linear_suffixes = ['q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight']

total_tb = 0  # bytes written in ternary form
total_ob = 0  # original FP32 bytes of the ternarized tensors

for key, w in tensors.items():
    prefix = os.path.join(output_dir, key.replace(".", "_"))
    is_linear = any(key.endswith(s) for s in linear_suffixes)

    if is_linear and len(w.shape) == 2:
        out_dim, in_dim = w.shape
        total_ob += w.nbytes

        t0 = time.time()
        pos, neg, scales, sparsity = quantize_weight_matrix(w, alpha)
        dt = time.time() - t0

        # Three files per tensor: +1 bitmask, -1 bitmask, per-row scales.
        pos.tofile(prefix + ".pos")
        neg.tofile(prefix + ".neg")
        scales.tofile(prefix + ".scales")

        tb = pos.nbytes + neg.nbytes + scales.nbytes
        total_tb += tb
        ratio = w.nbytes / tb
        ternary_manifest[key] = list(w.shape)
        print("  T %s: %s -> %dKB (%.1fx, %.0f%% sparse, %.1fs)" % (
            key, str(w.shape), tb // 1024, ratio, sparsity * 100, dt))
    else:
        # Fallback path: store as raw FP16 bytes.
        w16 = w.astype(np.float16)
        w16.tofile(prefix + ".fp16")
        fp16_manifest[key] = list(w.shape)
        print("  F %s: %s -> %dKB" % (key, str(w.shape), w16.nbytes // 1024))

# config + manifest let the loader reconstruct shapes without the originals.
with open(os.path.join(output_dir, "config.json"), "w") as f:
    json.dump(config, f, indent=2)
with open(os.path.join(output_dir, "manifest.json"), "w") as f:
    json.dump({"ternary": ternary_manifest, "fp16": fp16_manifest}, f, indent=2)

print("")
print("Ternary: %.1fMB (from %.1fMB FP32)" % (total_tb / 1024 / 1024, total_ob / 1024 / 1024))
print("DONE")
run_log_unary.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Log-unary model loader. (c) 2026 OpenTransformers Ltd"""
3
+ import ctypes, numpy as np, os, sys, json, time
4
+
5
def load_and_run(model_dir, prompt, max_tokens=32, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a log-unary converted model via log_unary_engine.so and generate.

    model_dir must contain config.json, manifest.json, the per-tensor
    .sign/.planes/.scales/.fp16 files, and the HF tokenizer files.
    a_planes is a runtime choice (activation quantization depth); w_planes
    comes from the conversion manifest.  Prints the generated text.
    """
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie = 1 if config.get("tie_word_embeddings", False) else 0

    # 2^planes - 1 magnitude levels per sign -> 2*max+1 representable values.
    w_max = (1 << w_planes) - 1
    a_max = (1 << a_planes) - 1
    print(f"Config: {n_layers}L hidden={hidden} inter={inter} heads={n_heads}/{n_kv_heads}")
    print(f"Weight: {w_planes} log-planes ({2*w_max+1} levels)")
    print(f"Activation: {a_planes} log-planes ({2*a_max+1} levels)")
    print(f"Plane pairs: {w_planes * a_planes}")

    # The .so is expected next to the model directory's parent.
    engine = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "log_unary_engine.so")
    lib = ctypes.CDLL(engine)

    # ctypes signatures must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [ctypes.c_int]*2 + [ctypes.c_int]*7 + [ctypes.c_float, ctypes.c_int]
    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.POINTER(ctypes.c_int), ctypes.c_int,
                             ctypes.c_float, ctypes.c_float, ctypes.c_int]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections, each (sign, planes, scales, out_dim, in_dim), then n_planes.
    lib.layer_set_linears.argtypes = [ctypes.c_void_p, ctypes.c_int] + \
        [u64p, u64p, f32p, ctypes.c_int, ctypes.c_int] * 7 + [ctypes.c_int]

    print("Allocating...")
    model = lib.model_alloc(w_planes, a_planes, hidden, inter, n_heads, n_kv_heads,
                            head_dim, n_layers, vocab, rope_theta, tie)
    # The C side borrows pointers into these numpy arrays; keep them alive.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        _refs.append(d); return d.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 file widened to FP32 for norm weights.
        d = np.fromfile(os.path.join(model_dir, name.replace(".","_")+".fp16"), dtype=np.uint16)
        f = d.view(np.float16).astype(np.float32); _refs.append(f); return f.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fn = name.replace(".","_")
        s = np.fromfile(os.path.join(model_dir, f"{fn}.sign"), dtype=np.uint64)
        p = np.fromfile(os.path.join(model_dir, f"{fn}.planes"), dtype=np.uint64)
        sc = np.fromfile(os.path.join(model_dir, f"{fn}.scales"), dtype=np.float32)
        _refs.extend([s,p,sc])
        return s.ctypes.data_as(u64p), p.ctypes.data_as(u64p), sc.ctypes.data_as(f32p)

    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    um = manifest["unary"]  # tensor name -> [out_dim, in_dim]
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l, load_f32(f"{p}.input_layernorm.weight"),
                            load_f32(f"{p}.post_attention_layernorm.weight"))
        # QK-norm weights only exist for Qwen3-style checkpoints.
        qn = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn):
            lib.layer_set_qk_norm(model, l, load_f32(f"{p}.self_attn.q_norm.weight"),
                                  load_f32(f"{p}.self_attn.k_norm.weight"))

        # Argument order must match layer_set_linears' C prototype.
        projs = ["self_attn.q_proj","self_attn.k_proj","self_attn.v_proj","self_attn.o_proj",
                 "mlp.gate_proj","mlp.up_proj","mlp.down_proj"]
        args = [model, l]
        for pj in projs:
            key = f"{p}.{pj}.weight"
            s,pl,sc = load_unary(key)
            args.extend([s, pl, sc, um[key][0], um[key][1]])
        args.append(w_planes)
        lib.layer_set_linears(*args)

        if (l+1) % 12 == 0 or l == n_layers-1:
            print(f" Layer {l+1}/{n_layers}")

    print("Tokenizing...")
    from transformers import AutoTokenizer
    tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    ids = tok.encode(prompt)
    print(f"Prompt: {len(ids)} tokens")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos = config.get("eos_token_id", 151645)
    pa = (ctypes.c_int * len(ids))(*ids)
    oa = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (w={w_planes}log a={a_planes}log pairs={w_planes*a_planes})...")
    t0 = time.time()
    n = lib.generate(model, pa, len(ids), oa, max_tokens,
                     ctypes.c_float(temperature), ctypes.c_float(top_p), eos)
    dt = time.time() - t0

    text = tok.decode([oa[i] for i in range(n)], skip_special_tokens=True)
    print(f"\n=== LOG-UNARY ({n} tok in {dt:.1f}s = {n/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode: {n/dt:.2f} tok/s")
116
+ print(f"\nDecode: {n/dt:.2f} tok/s")
117
+
118
if __name__ == "__main__":
    # CLI (positional, all optional): model_dir, prompt, max_tokens, a_planes.
    d = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-log-unary"
    p = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
    mt = int(sys.argv[3]) if len(sys.argv) > 3 else 32
    ap = int(sys.argv[4]) if len(sys.argv) > 4 else 4
    load_and_run(d, p, mt, a_planes=ap)
run_pure_unary.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pure unary model loader - ALL matmuls are AND+popcount
4
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
5
+ """
6
+ import ctypes, numpy as np, os, sys, json, time
7
+
8
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9, a_planes=4):
    """Load a pure-unary converted model via pure_unary_engine.so and generate.

    Reads config.json / manifest.json plus per-tensor .sign/.planes/.scales
    and .fp16 files from model_dir, wires everything into the C engine with
    ctypes, then tokenizes `prompt` and decodes up to `max_tokens` tokens.
    Returns the decoded text (also printed, with timing).
    """
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    w_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}")
    print(f"Weight planes: {w_planes}, Activation planes: {a_planes}")
    print(f"Plane pairs per element: {w_planes * a_planes}")
    print(f"Tied embeddings: {'yes' if tie_embeddings else 'no'}")

    # The .so is expected next to the model directory's parent.
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "pure_unary_engine.so")
    lib = ctypes.CDLL(engine_path)

    # ctypes signatures must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int, ctypes.c_int,  # w_planes, a_planes
        ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_int, ctypes.c_int, ctypes.c_int,
        ctypes.c_float, ctypes.c_int,
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.c_float, ctypes.c_float, ctypes.c_int
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    # 7 projections, each (sign, planes, scales, out_dim, in_dim), then n_planes.
    lib.layer_set_linears.argtypes = [
        ctypes.c_void_p, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        ctypes.c_int,
    ]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    print("Allocating model...")
    model = lib.model_alloc(
        w_planes, a_planes,
        hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta, tie_embeddings
    )

    # The C side borrows pointers into these numpy arrays; keep them alive.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        fname = name.replace(".", "_") + ".fp16"
        data = np.fromfile(os.path.join(model_dir, fname), dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32(name):
        # FP16 file widened to FP32 for norm weights.
        fname = name.replace(".", "_") + ".fp16"
        data = np.fromfile(os.path.join(model_dir, fname), dtype=np.uint16)
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    print("Loading embeddings...")
    lib.model_set_embed(model, load_fp16("model.embed_tokens.weight"))

    print("Loading final norm...")
    lib.model_set_final_norm(model, load_f32("model.norm.weight"))

    print(f"Loading {n_layers} layers...")
    for l in range(n_layers):
        p = f"model.layers.{l}"
        lib.layer_set_norms(model, l,
            load_f32(f"{p}.input_layernorm.weight"),
            load_f32(f"{p}.post_attention_layernorm.weight"))

        # QK-Norm (Qwen3) — optional; detected by file presence.
        qn_path = os.path.join(model_dir, f"{p.replace('.','_')}_self_attn_q_norm_weight.fp16")
        if os.path.exists(qn_path):
            lib.layer_set_qk_norm(model, l,
                load_f32(f"{p}.self_attn.q_norm.weight"),
                load_f32(f"{p}.self_attn.k_norm.weight"))

        q_s, q_p, q_sc = load_unary(f"{p}.self_attn.q_proj.weight")
        k_s, k_p, k_sc = load_unary(f"{p}.self_attn.k_proj.weight")
        v_s, v_p, v_sc = load_unary(f"{p}.self_attn.v_proj.weight")
        o_s, o_p, o_sc = load_unary(f"{p}.self_attn.o_proj.weight")
        g_s, g_p, g_sc = load_unary(f"{p}.mlp.gate_proj.weight")
        u_s, u_p, u_sc = load_unary(f"{p}.mlp.up_proj.weight")
        d_s, d_p, d_sc = load_unary(f"{p}.mlp.down_proj.weight")

        # Shapes ([out_dim, in_dim]) come from the conversion manifest.
        um = manifest["unary"]
        lib.layer_set_linears(model, l,
            q_s, q_p, q_sc, um[f"{p}.self_attn.q_proj.weight"][0], um[f"{p}.self_attn.q_proj.weight"][1],
            k_s, k_p, k_sc, um[f"{p}.self_attn.k_proj.weight"][0], um[f"{p}.self_attn.k_proj.weight"][1],
            v_s, v_p, v_sc, um[f"{p}.self_attn.v_proj.weight"][0], um[f"{p}.self_attn.v_proj.weight"][1],
            o_s, o_p, o_sc, um[f"{p}.self_attn.o_proj.weight"][0], um[f"{p}.self_attn.o_proj.weight"][1],
            g_s, g_p, g_sc, um[f"{p}.mlp.gate_proj.weight"][0], um[f"{p}.mlp.gate_proj.weight"][1],
            u_s, u_p, u_sc, um[f"{p}.mlp.up_proj.weight"][0], um[f"{p}.mlp.up_proj.weight"][1],
            d_s, d_p, d_sc, um[f"{p}.mlp.down_proj.weight"][0], um[f"{p}.mlp.down_proj.weight"][1],
            w_planes)

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens -> {repr(prompt[:60])}")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos_token = config.get("eos_token_id", 151645)
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p}, a_planes={a_planes})...")
    t0 = time.time()
    n_gen = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p), eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_gen)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    print(f"\n=== PURE UNARY Output ({n_gen} tokens in {dt:.1f}s = {n_gen/dt:.2f} tok/s) ===")
    print(text)
    print(f"\nDecode speed: {n_gen/dt:.2f} tok/s")
    return text
+
171
+ if __name__ == "__main__":
172
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-unary"
173
+ prompt = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
174
+ max_tokens = int(sys.argv[3]) if len(sys.argv) > 3 else 32
175
+ a_planes = int(sys.argv[4]) if len(sys.argv) > 4 else 4
176
+ load_and_run(model_dir, prompt, max_tokens=max_tokens, a_planes=a_planes)
run_qwen3_4b.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Unary model loader for Qwen3-4B-Thinking.
4
+ Loads converted weights and runs inference via unary_engine_v2.so
5
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
6
+ """
7
+ import ctypes, numpy as np, os, sys, json, time
8
+
9
def load_and_run(model_dir, prompt, max_tokens=128, temperature=0.0, top_p=0.9):
    """Load a unary-converted Qwen3 model via unary_engine_v2.so and generate.

    Reads config.json / manifest.json plus per-tensor binary files from
    model_dir, wires everything into the v2 C engine with ctypes, tokenizes
    `prompt` and decodes up to `max_tokens` tokens.  Returns the decoded
    text (also printed, with timing).

    NOTE(review): model_set_lm_head and layer_set_bias have argtypes
    declared below but are never called in this function — the engine
    presumably falls back to tied embeddings / no bias; confirm.
    """
    # Load config
    config = json.load(open(os.path.join(model_dir, "config.json")))
    manifest = json.load(open(os.path.join(model_dir, "manifest.json")))
    n_planes = manifest["n_planes"]
    n_layers = config["num_hidden_layers"]
    hidden = config["hidden_size"]
    inter = config["intermediate_size"]
    n_heads = config["num_attention_heads"]
    n_kv_heads = config["num_key_value_heads"]
    head_dim = config.get("head_dim", hidden // n_heads)
    vocab = config["vocab_size"]
    rope_theta = config.get("rope_theta", 10000.0)
    has_attn_bias = 1 if config.get("attention_bias", False) else 0
    tie_embeddings = 1 if config.get("tie_word_embeddings", False) else 0

    print(f"Config: {n_layers}L, hidden={hidden}, inter={inter}, heads={n_heads}/{n_kv_heads}, vocab={vocab}")
    print(f"QK-Norm: yes, Tied embeddings: {'yes' if tie_embeddings else 'no'}, n_planes={n_planes}")

    # Load C engine (expected next to the model directory's parent)
    engine_path = os.path.join(os.path.dirname(os.path.abspath(model_dir)), "unary_engine_v2.so")
    lib = ctypes.CDLL(engine_path)

    # Configure function signatures — must mirror the C prototypes exactly.
    lib.model_alloc.restype = ctypes.c_void_p
    lib.model_alloc.argtypes = [
        ctypes.c_int,   # n_planes
        ctypes.c_int,   # hidden
        ctypes.c_int,   # inter
        ctypes.c_int,   # n_heads
        ctypes.c_int,   # n_kv_heads
        ctypes.c_int,   # head_dim
        ctypes.c_int,   # n_layers
        ctypes.c_int,   # vocab
        ctypes.c_float, # rope_theta
        ctypes.c_int,   # has_attn_bias
        ctypes.c_int,   # tie_embeddings
    ]

    lib.forward_token.restype = ctypes.POINTER(ctypes.c_float)
    lib.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

    lib.generate.restype = ctypes.c_int
    lib.generate.argtypes = [
        ctypes.c_void_p,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.POINTER(ctypes.c_int), ctypes.c_int,
        ctypes.c_float, ctypes.c_float, ctypes.c_int
    ]

    u16p = ctypes.POINTER(ctypes.c_uint16)
    f32p = ctypes.POINTER(ctypes.c_float)
    u64p = ctypes.POINTER(ctypes.c_uint64)

    lib.model_set_embed.argtypes = [ctypes.c_void_p, u16p]
    lib.model_set_final_norm.argtypes = [ctypes.c_void_p, f32p]
    lib.model_set_lm_head.argtypes = [ctypes.c_void_p, u16p, ctypes.c_int, ctypes.c_int]
    lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p, f32p]
    lib.layer_set_qk_norm.argtypes = [ctypes.c_void_p, ctypes.c_int, f32p, f32p]
    lib.layer_set_linears.argtypes = [
        ctypes.c_void_p, ctypes.c_int,
        # q: sign, planes, scales, out, in
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # k
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # v
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # o
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # gate
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # up
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        # down
        u64p, u64p, f32p, ctypes.c_int, ctypes.c_int,
        ctypes.c_int,  # n_planes
    ]
    lib.model_reset_cache.argtypes = [ctypes.c_void_p]

    # Allocate model
    print("Allocating model...")
    model = lib.model_alloc(
        n_planes, hidden, inter, n_heads, n_kv_heads,
        head_dim, n_layers, vocab, rope_theta,
        has_attn_bias, tie_embeddings
    )

    # Keep references to prevent GC — the C side borrows raw pointers.
    _refs = []

    def load_fp16(name):
        # Raw FP16 payload handed to C as uint16 words.
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        _refs.append(data)
        return data.ctypes.data_as(u16p)

    def load_f32_from_fp16(name):
        fname = name.replace(".", "_") + ".fp16"
        path = os.path.join(model_dir, fname)
        data = np.fromfile(path, dtype=np.uint16)
        # Convert FP16 -> FP32
        f32 = data.view(np.float16).astype(np.float32)
        _refs.append(f32)
        return f32.ctypes.data_as(f32p)

    def load_unary(name):
        # One quantized matrix = sign bits + magnitude planes + row scales.
        fname = name.replace(".", "_")
        sign = np.fromfile(os.path.join(model_dir, f"{fname}.sign"), dtype=np.uint64)
        planes = np.fromfile(os.path.join(model_dir, f"{fname}.planes"), dtype=np.uint64)
        scales = np.fromfile(os.path.join(model_dir, f"{fname}.scales"), dtype=np.float32)
        _refs.extend([sign, planes, scales])
        return (sign.ctypes.data_as(u64p), planes.ctypes.data_as(u64p),
                scales.ctypes.data_as(f32p))

    # Load embeddings
    print("Loading embeddings...")
    embed_ptr = load_fp16("model.embed_tokens.weight")
    lib.model_set_embed(model, embed_ptr)

    # Load final norm
    print("Loading final norm...")
    fnorm_ptr = load_f32_from_fp16("model.norm.weight")
    lib.model_set_final_norm(model, fnorm_ptr)

    # Load layers
    print(f"Loading {n_layers} layers...")
    for l in range(n_layers):
        prefix = f"model.layers.{l}"

        # Norms
        in_norm = load_f32_from_fp16(f"{prefix}.input_layernorm.weight")
        post_norm = load_f32_from_fp16(f"{prefix}.post_attention_layernorm.weight")
        lib.layer_set_norms(model, l, in_norm, post_norm)

        # QK-Norm (unconditional here — this loader targets Qwen3 checkpoints)
        q_norm = load_f32_from_fp16(f"{prefix}.self_attn.q_norm.weight")
        k_norm = load_f32_from_fp16(f"{prefix}.self_attn.k_norm.weight")
        lib.layer_set_qk_norm(model, l, q_norm, k_norm)

        # Linear layers
        q_s, q_p, q_sc = load_unary(f"{prefix}.self_attn.q_proj.weight")
        k_s, k_p, k_sc = load_unary(f"{prefix}.self_attn.k_proj.weight")
        v_s, v_p, v_sc = load_unary(f"{prefix}.self_attn.v_proj.weight")
        o_s, o_p, o_sc = load_unary(f"{prefix}.self_attn.o_proj.weight")
        g_s, g_p, g_sc = load_unary(f"{prefix}.mlp.gate_proj.weight")
        u_s, u_p, u_sc = load_unary(f"{prefix}.mlp.up_proj.weight")
        d_s, d_p, d_sc = load_unary(f"{prefix}.mlp.down_proj.weight")

        # Dims from manifest ([out_dim, in_dim] per tensor)
        q_shape = manifest["unary"][f"{prefix}.self_attn.q_proj.weight"]
        k_shape = manifest["unary"][f"{prefix}.self_attn.k_proj.weight"]
        v_shape = manifest["unary"][f"{prefix}.self_attn.v_proj.weight"]
        o_shape = manifest["unary"][f"{prefix}.self_attn.o_proj.weight"]
        g_shape = manifest["unary"][f"{prefix}.mlp.gate_proj.weight"]
        u_shape = manifest["unary"][f"{prefix}.mlp.up_proj.weight"]
        d_shape = manifest["unary"][f"{prefix}.mlp.down_proj.weight"]

        lib.layer_set_linears(
            model, l,
            q_s, q_p, q_sc, q_shape[0], q_shape[1],
            k_s, k_p, k_sc, k_shape[0], k_shape[1],
            v_s, v_p, v_sc, v_shape[0], v_shape[1],
            o_s, o_p, o_sc, o_shape[0], o_shape[1],
            g_s, g_p, g_sc, g_shape[0], g_shape[1],
            u_s, u_p, u_sc, u_shape[0], u_shape[1],
            d_s, d_p, d_sc, d_shape[0], d_shape[1],
            n_planes
        )

        if (l + 1) % 6 == 0 or l == n_layers - 1:
            print(f" Loaded layer {l+1}/{n_layers}")

    # Tokenize
    print("Tokenizing prompt...")
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
    input_ids = tokenizer.encode(prompt)
    print(f"Prompt: {len(input_ids)} tokens")

    # 151645 is the fallback EOS id when config omits it — verify per model.
    eos_token = config.get("eos_token_id", 151645)

    # Generate
    prompt_arr = (ctypes.c_int * len(input_ids))(*input_ids)
    out_arr = (ctypes.c_int * max_tokens)()

    print(f"\nGenerating (temp={temperature}, top_p={top_p})...")
    t0 = time.time()
    n_generated = lib.generate(
        model, prompt_arr, len(input_ids),
        out_arr, max_tokens,
        ctypes.c_float(temperature), ctypes.c_float(top_p),
        eos_token
    )
    dt = time.time() - t0

    out_ids = [out_arr[i] for i in range(n_generated)]
    text = tokenizer.decode(out_ids, skip_special_tokens=True)

    total_tokens = len(input_ids) + n_generated
    print(f"\n=== Output ({n_generated} tokens in {dt:.1f}s = {n_generated/dt:.1f} tok/s) ===")
    print(text)
    print(f"\nPrefill: {len(input_ids)} tokens, Decode: {n_generated} tokens")
    print(f"Total time: {dt:.1f}s, Speed: {total_tokens/dt:.1f} tok/s total, {n_generated/dt:.1f} tok/s decode")

    return text
+
217
+ if __name__ == "__main__":
218
+ model_dir = sys.argv[1] if len(sys.argv) > 1 else "qwen3-4b-thinking-unary"
219
+ prompt = sys.argv[2] if len(sys.argv) > 2 else "What is 2+2? Think step by step."
220
+ max_tokens = int(sys.argv[3]) if len(sys.argv) > 3 else 64
221
+ load_and_run(model_dir, prompt, max_tokens=max_tokens)
server.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
OpenAI-compatible API server for Ternary Transformer Engine.
Drop-in replacement for llama-server.

(c) 2026 OpenTransformers Ltd / Scott Bisset
"""

import json
import time
import threading
from http.server import HTTPServer, BaseHTTPRequestHandler
from inference import TernaryQwen, Tokenizer, load_kernel
import os

# Configuration via environment variables; defaults target the
# DeepSeek-R1-Distill-Qwen-1.5B ternary build.
MODEL_DIR = os.environ.get("TERNARY_MODEL_DIR", "deepseek-r1-1.5b-ternary")
TOKENIZER_DIR = os.environ.get("TOKENIZER_DIR", "deepseek-r1-1.5b-hf")
HOST = os.environ.get("HOST", "127.0.0.1")
PORT = int(os.environ.get("PORT", "8080"))

# Import-time initialization: load the native kernel, quantized weights and
# tokenizer exactly once; all requests share these objects.
print("Loading ternary kernel...")
kernel = load_kernel(os.path.join(os.path.dirname(__file__), "ternary_kernel.so"))

print(f"Loading model from {MODEL_DIR}...")
model = TernaryQwen(MODEL_DIR, kernel)

print(f"Loading tokenizer from {TOKENIZER_DIR}...")
tokenizer = Tokenizer(TOKENIZER_DIR)

# Serializes generation across requests.
# NOTE(review): assumes the model/KV-cache is not safe for concurrent
# generate() calls — confirm against the inference module.
lock = threading.Lock()
print("Ready!")
+
33
class Handler(BaseHTTPRequestHandler):
    """Minimal OpenAI-compatible HTTP handler.

    Routes:
      POST /v1/chat/completions -- non-streaming chat completion
      GET  /health              -- liveness probe
    """

    def _send_json(self, status, payload):
        # Serialize *payload* and send it with Content-Type/Content-Length.
        data = json.dumps(payload).encode()
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)

    def do_POST(self):
        if self.path != "/v1/chat/completions":
            self.send_response(404)
            self.end_headers()
            return

        # Parse the request body defensively: a malformed Content-Length or
        # invalid JSON must produce a 400, not an unhandled traceback.
        try:
            length = int(self.headers.get("Content-Length", 0))
            body = json.loads(self.rfile.read(length))
        except ValueError:  # covers bad int() and json.JSONDecodeError
            self._send_json(400, {"error": {"message": "invalid JSON body"}})
            return

        messages = body.get("messages", [])
        max_tokens = body.get("max_tokens", 256)
        temperature = body.get("temperature", 0.6)
        top_p = body.get("top_p", 0.95)

        # Build prompt
        prompt = tokenizer.apply_chat_template(messages)
        input_ids = tokenizer.encode(prompt)

        # Generate (the lock serializes access to the shared engine)
        try:
            with lock:
                gen_ids, stats = model.generate(
                    input_ids,
                    max_new_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p
                )
        except Exception as e:
            # Surface engine failures as a 500 JSON error instead of a
            # silently dropped connection.
            self._send_json(500, {"error": {"message": str(e)}})
            return

        text = tokenizer.decode(gen_ids)

        response = {
            "id": f"chatcmpl-ternary-{int(time.time())}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": "DeepSeek-R1-Distill-Qwen-1.5B-TERNARY",
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": text},
                "finish_reason": "stop"
            }],
            "usage": {
                "prompt_tokens": len(input_ids),
                "completion_tokens": stats["tokens_generated"],
                "total_tokens": len(input_ids) + stats["tokens_generated"]
            },
            "timings": {
                "prompt_n": stats["prefill_tokens"],
                "prompt_ms": stats["prefill_ms"],
                "predicted_n": stats["tokens_generated"],
                "predicted_ms": stats["decode_ms"],
                "predicted_per_second": stats["tok_per_sec"],
            }
        }
        self._send_json(200, response)

    def do_GET(self):
        if self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.end_headers()
            self.wfile.write(b'{"status":"ok","engine":"ternary-avx512"}')
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Suppress BaseHTTPRequestHandler's per-request stderr logging.
        pass
103
+
104
if __name__ == "__main__":
    # Single-threaded HTTPServer: requests are handled strictly one at a time.
    server = HTTPServer((HOST, PORT), Handler)
    print(f"Ternary engine serving on {HOST}:{PORT}")
    server.serve_forever()
ternary_kernel.c ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Ternary Neural Network Kernel - AVX-512 optimized
3
+ *
4
+ * Weights are stored as two bitplanes per row:
5
+ * pos_mask: bit=1 where weight = +1
6
+ * neg_mask: bit=1 where weight = -1
7
+ * (both 0 = weight is 0)
8
+ *
9
+ * Matmul becomes: y[i] = sum(x[j] where pos) - sum(x[j] where neg)
10
+ * No multiplication at all — just masked add/subtract.
11
+ *
12
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
13
+ */
14
+
15
+ #include <immintrin.h>
16
+ #include <stdint.h>
17
+ #include <stdlib.h>
18
+ #include <string.h>
19
+ #include <math.h>
20
+ #include <stdio.h>
21
+
22
+ /* ============================================================
23
+ * Core ternary matmul: y = W_ternary @ x
24
+ *
25
+ * W stored as bitplanes: pos_bits[out_dim][ceil(in_dim/64)] uint64
26
+ * neg_bits[out_dim][ceil(in_dim/64)] uint64
27
+ * x: float32[in_dim]
28
+ * y: float32[out_dim]
29
+ * bias: float32[out_dim] or NULL
30
+ * scale: float32 per-row scale factor (to recover magnitude)
31
+ * ============================================================ */
32
/* Core ternary matvec: y = W_ternary @ x.
 *
 * W is stored as two bitplanes per row (pos = +1, neg = -1, neither = 0);
 * the dot product is a masked add/subtract of x, no multiplies.
 *
 * Improvement over the previous version: no per-call heap allocation.
 * The old code aligned_alloc'd and memcpy'd a zero-padded copy of x on
 * EVERY call (and never checked the allocation). AVX-512 masked loads
 * suppress faults on inactive lanes, so we can read x in place and zero
 * the lanes past in_dim with the load mask — the accumulated values are
 * identical (padded lanes contributed 0.0f before; they contribute 0.0f
 * now).
 */
void ternary_matvec_avx512(
    const uint64_t *pos_bits, /* [out_dim * chunks] bit=1 where w = +1 */
    const uint64_t *neg_bits, /* [out_dim * chunks] bit=1 where w = -1 */
    const float *scales,      /* [out_dim] per-row scale */
    const float *x,           /* [in_dim] input activations */
    float *y,                 /* [out_dim] output */
    int out_dim,
    int in_dim
) {
    int chunks = (in_dim + 63) / 64; /* 64 weights per uint64 */

    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();

        const uint64_t *row_pos = pos_bits + (size_t)i * chunks;
        const uint64_t *row_neg = neg_bits + (size_t)i * chunks;

        /* 64 weights per chunk = 4 groups of 16 floats */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = row_pos[c];
            uint64_t nb = row_neg[c];
            int base = c * 64;

            for (int g = 0; g < 4; g++) {
                int offset = base + g * 16;
                int rem = in_dim - offset;
                if (rem <= 0) break;

                /* Valid-lane mask: maskz load zeroes (and fault-suppresses)
                 * lanes at or past in_dim. */
                __mmask16 vmask = (rem >= 16) ? (__mmask16)0xFFFF
                                              : (__mmask16)((1u << rem) - 1);
                __m512 xv = _mm512_maskz_loadu_ps(vmask, x + offset);

                __mmask16 pmask = (__mmask16)((pb >> (g * 16)) & 0xFFFF);
                __mmask16 nmask = (__mmask16)((nb >> (g * 16)) & 0xFFFF);

                /* Add where weight = +1, subtract where weight = -1 */
                acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
            }
        }

        /* Horizontal sum, then one float multiply to recover magnitude */
        y[i] = _mm512_reduce_add_ps(acc) * scales[i];
    }
}
85
+
86
+ /* ============================================================
87
+ * Batched version: Y = W_ternary @ X (multiple input vectors)
88
+ * X: [batch, in_dim], Y: [batch, out_dim]
89
+ * ============================================================ */
90
/* Batched wrapper: run the ternary mat-vec kernel once per input row.
 * X is [batch, in_dim] row-major; Y is [batch, out_dim] row-major. */
void ternary_matmul_avx512(
    const uint64_t *pos_bits,
    const uint64_t *neg_bits,
    const float *scales,
    const float *X,
    float *Y,
    int batch,
    int out_dim,
    int in_dim
) {
    const float *x_row = X;
    float *y_row = Y;
    for (int b = 0; b < batch; b++, x_row += in_dim, y_row += out_dim) {
        ternary_matvec_avx512(pos_bits, neg_bits, scales,
                              x_row, y_row, out_dim, in_dim);
    }
}
109
+
110
+ /* ============================================================
111
+ * RMSNorm: y = x * (1/rms(x)) * weight
112
+ * ============================================================ */
113
/* RMSNorm: y[i] = x[i] * (1 / sqrt(mean(x^2) + eps)) * weight[i].
 * x, weight, y: [dim] float buffers; unaligned loads are used, so no
 * alignment requirement. The 16-wide FMA pass plus scalar tail fixes the
 * floating-point summation order — part of the numeric behavior. */
void rmsnorm_avx512(
    const float *x,
    const float *weight,
    float *y,
    int dim,
    float eps
) {
    /* Compute sum of squares */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    /* Handle remainder */
    for (; i < dim; i++) ss += x[i] * x[i];

    /* eps keeps the denominator positive for an all-zero input */
    float rms = 1.0f / sqrtf(ss / dim + eps);

    /* Apply norm and weight */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        __m512 out = _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv);
        _mm512_storeu_ps(y + i, out);
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
143
+
144
+ /* ============================================================
145
+ * SiLU activation: x * sigmoid(x)
146
+ * ============================================================ */
147
+ static inline float silu_scalar(float x) {
148
+ return x / (1.0f + expf(-x));
149
+ }
150
+
151
+ void silu_avx512(float *x, int n) {
152
+ /* Scalar fallback — vectorized exp is complex */
153
+ for (int i = 0; i < n; i++) {
154
+ x[i] = silu_scalar(x[i]);
155
+ }
156
+ }
157
+
158
+ /* ============================================================
159
+ * Element-wise multiply: y = a * b
160
+ * ============================================================ */
161
/* Element-wise product: y[i] = a[i] * b[i] for i in [0, n). */
void elemwise_mul_avx512(const float *a, const float *b, float *y, int n) {
    int i = 0;
    /* 16-wide vector body */
    for (; i + 16 <= n; i += 16) {
        __m512 prod = _mm512_mul_ps(_mm512_loadu_ps(a + i),
                                    _mm512_loadu_ps(b + i));
        _mm512_storeu_ps(y + i, prod);
    }
    /* scalar tail */
    for (; i < n; i++)
        y[i] = a[i] * b[i];
}
170
+
171
+ /* ============================================================
172
+ * Softmax
173
+ * ============================================================ */
174
+ void softmax(float *x, int n) {
175
+ float max_val = x[0];
176
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
177
+ float sum = 0;
178
+ for (int i = 0; i < n; i++) {
179
+ x[i] = expf(x[i] - max_val);
180
+ sum += x[i];
181
+ }
182
+ float inv_sum = 1.0f / sum;
183
+ for (int i = 0; i < n; i++) x[i] *= inv_sum;
184
+ }
185
+
186
+ /* ============================================================
187
+ * RoPE (Rotary Position Embedding)
188
+ * ============================================================ */
189
+ void apply_rope(
190
+ float *q, /* [n_heads, head_dim] */
191
+ float *k, /* [n_kv_heads, head_dim] */
192
+ int n_heads,
193
+ int n_kv_heads,
194
+ int head_dim,
195
+ int pos,
196
+ float rope_theta
197
+ ) {
198
+ for (int h = 0; h < n_heads + n_kv_heads; h++) {
199
+ float *vec = (h < n_heads) ? q + h * head_dim : k + (h - n_heads) * head_dim;
200
+ for (int i = 0; i < head_dim; i += 2) {
201
+ float freq = 1.0f / powf(rope_theta, (float)i / head_dim);
202
+ float angle = pos * freq;
203
+ float cos_a = cosf(angle);
204
+ float sin_a = sinf(angle);
205
+ float v0 = vec[i];
206
+ float v1 = vec[i + 1];
207
+ vec[i] = v0 * cos_a - v1 * sin_a;
208
+ vec[i + 1] = v0 * sin_a + v1 * cos_a;
209
+ }
210
+ }
211
+ }
212
+
213
+ /* ============================================================
214
+ * Quantization: convert float weights to ternary
215
+ * Uses per-row threshold: threshold = alpha * mean(|w|)
216
+ * Returns: pos_bits, neg_bits, scales
217
+ * ============================================================ */
218
/* Quantize a float matrix to ternary {-1, 0, +1} bitplanes with a
 * per-row scale.
 *
 * For each row:
 *   threshold = alpha * mean(|w|)
 *   w >= +threshold -> +1 (bit set in pos_bits)
 *   w <= -threshold -> -1 (bit set in neg_bits)
 *   otherwise       ->  0 (neither bit)
 *   scale = mean |w| over the surviving entries (1.0 if none survive).
 */
void quantize_to_ternary(
    const float *weights, /* [out_dim, in_dim] */
    uint64_t *pos_bits,   /* [out_dim * chunks] output */
    uint64_t *neg_bits,   /* [out_dim * chunks] output */
    float *scales,        /* [out_dim] output */
    int out_dim,
    int in_dim,
    float alpha           /* threshold multiplier, typically 0.7-1.0 */
) {
    int chunks = (in_dim + 63) / 64;

    for (int r = 0; r < out_dim; r++) {
        const float *w = weights + (size_t)r * in_dim;

        /* Row threshold from the mean absolute weight. */
        float abs_acc = 0;
        for (int j = 0; j < in_dim; j++)
            abs_acc += fabsf(w[j]);
        float mean_abs = abs_acc / in_dim;
        float threshold = alpha * mean_abs;

        /* Scale = mean |w| of the entries that clear the threshold. */
        float kept_sum = 0;
        int kept = 0;
        for (int j = 0; j < in_dim; j++) {
            float a = fabsf(w[j]);
            if (a >= threshold) {
                kept_sum += a;
                kept++;
            }
        }
        scales[r] = (kept > 0) ? (kept_sum / kept) : 1.0f;

        /* Pack the ternary pattern, 64 columns per word. */
        for (int c = 0; c < chunks; c++) {
            uint64_t pb = 0, nb = 0;
            int lim = in_dim - c * 64;
            if (lim > 64) lim = 64;
            for (int b = 0; b < lim; b++) {
                float v = w[c * 64 + b];
                if (v >= threshold)
                    pb |= (1ULL << b);
                else if (v <= -threshold)
                    nb |= (1ULL << b);
            }
            pos_bits[(size_t)r * chunks + c] = pb;
            neg_bits[(size_t)r * chunks + c] = nb;
        }
    }
}
test_logunary ADDED
Binary file (26.3 kB). View file
 
test_logunary.c ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Log-Unary Tensor Tests
3
+ * Benchmarks accuracy and speed of native base-1 log-encoded tensors
4
+ */
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+ #include <math.h>
8
+ #include <time.h>
9
+
10
+ /* Forward declarations from library */
11
+ typedef struct LogUnaryTensor LogUnaryTensor;
12
+ typedef struct LogUnaryMatrix LogUnaryMatrix;
13
+ typedef struct { double total_and_ops, total_popcount_ops, wall_time_s, elements_per_sec, gops; } BenchResult;
14
+ typedef struct { float max_error, mean_error, cosine_sim, snr_db; } AccuracyResult;
15
+
16
+ extern LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias);
17
+ extern LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias);
18
+ extern void lut_free(LogUnaryTensor *t);
19
+ extern void lum_free(LogUnaryMatrix *m);
20
+ extern void lut_from_float(LogUnaryTensor *t, const float *x);
21
+ extern void lut_to_float(const LogUnaryTensor *t, float *out);
22
+ extern void lum_from_float(LogUnaryMatrix *m, const float *data);
23
+ extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
24
+ extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
25
+ extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);
26
+ extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
27
+ extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);
28
+
29
+ /* Test matvec correctness against float reference */
30
/* Compare lum_matvec against a float32 reference GEMV on Gaussian data.
 * Prints cosine similarity, SNR and max abs error — diagnostic output
 * only, no pass/fail assertion. */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    /* Random float matrix and vector */
    float *M_float = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *x_float = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_lut = (float *)malloc(rows * sizeof(float));

    /* Fixed seed for reproducibility; Box-Muller turns uniform rand()
     * draws into standard-normal samples (+1 avoids log(0)). */
    srand(42);
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        M_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        x_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Float reference matmul */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += M_float[(size_t)i * cols + j] * x_float[j];

    /* Log-unary matmul: encode, multiply, decode back to float */
    LogUnaryMatrix *M = lum_alloc(rows, cols, planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, planes, bias);

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare: cosine similarity plus max absolute error */
    float dot = 0, na = 0, nb = 0, max_err = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];
        float err = fabsf(y_ref[i] - y_lut[i]);
        if (err > max_err) max_err = err;
    }
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

    /* Signal-to-noise ratio of the quantized result vs the reference */
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_lut[i];
        noise += e * e;
    }
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf(" Cosine similarity: %.6f\n", cosine);
    printf(" SNR: %.1f dB\n", snr);
    printf(" Max abs error: %.4f (ref max: %.4f)\n", max_err, sqrtf(na / rows));

    /* Show first few values */
    printf(" First 5 values:\n");
    for (int i = 0; i < 5 && i < rows; i++)
        printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);

    lum_free(M); lut_free(x); lut_free(y);
    free(M_float); free(x_float); free(y_ref); free(y_lut);
}
96
+
97
/* Test driver: roundtrip accuracy sweep, matvec correctness at three
 * sizes, then speed benchmarks over Qwen3-4B-shaped projections.
 * NOTE: time(NULL) seed only affects lut_accuracy_test inputs; the
 * matvec correctness helper re-seeds with 42 internally. */
int main() {
    srand(time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;
        AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    /* rows x cols with weight planes (wp), activation planes (xp), bias */
    struct { int rows; int cols; int wp; int xp; int bias; const char *label; } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    int n_configs = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n_configs; c++) {
        int iters = 3;
        BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
               configs[c].rows, configs[c].cols,
               configs[c].wp, configs[c].xp, configs[c].bias,
               r.wall_time_s * 1000,
               r.elements_per_sec / 1e6,
               r.gops,
               configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
test_popcount.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Test the full-unary popcount engine."""
import ctypes, numpy as np, os, time, sys
# Thread count for the engine's OpenMP kernels; set before the shared
# library below is loaded and spins up its thread pool.
os.environ["OMP_NUM_THREADS"] = "16"

# CLI: test_popcount.py [model_dir] [n_planes]
MODEL_DIR = sys.argv[1] if len(sys.argv) > 1 else "deepseek-r1-1.5b-unary4"
HF_DIR = "deepseek-r1-1.5b-hf"  # tokenizer comes from the HF checkpoint dir
N_PLANES = int(sys.argv[2]) if len(sys.argv) > 2 else 4

# Declare the C ABI so ctypes marshals arguments correctly (pointers are
# passed as raw addresses via ndarray.ctypes.data — see keep() below).
lib = ctypes.CDLL("./unary_full.so")
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]
# layer_set_linears signature: (model, layer) + 7 projections of
# (sign_ptr, planes_ptr, scales_ptr, out_dim, in_dim) + n_planes.
args = [ctypes.c_void_p, ctypes.c_int]
for _ in range(7):
    args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
args.append(ctypes.c_int)
lib.layer_set_linears.argtypes = args
# generate(model, prompt_ids, n_prompt, out_ids, max_new, temp, top_p, eos)
lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int,
    ctypes.c_void_p, ctypes.c_int,
    ctypes.c_float, ctypes.c_float, ctypes.c_int
]
lib.model_reset_cache.argtypes = [ctypes.c_void_p]
lib.model_free.argtypes = [ctypes.c_void_p]
31
+
32
+ _refs = []
33
+ def keep(a):
34
+ _refs.append(a)
35
+ return a.ctypes.data
36
+
37
+ print(f"Loading model from {MODEL_DIR} (w_planes={N_PLANES})...")
38
+ m = lib.model_alloc(N_PLANES)
39
+
40
+ # Embed + final norm + lm_head
41
+ e = np.fromfile(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"), dtype=np.uint16)
42
+ lib.model_set_embed(m, keep(e))
43
+ fn = np.fromfile(os.path.join(MODEL_DIR, "model_norm_weight.fp16"), dtype=np.float16).astype(np.float32)
44
+ lib.model_set_final_norm(m, keep(fn))
45
+ lm = np.fromfile(os.path.join(MODEL_DIR, "lm_head_weight.fp16"), dtype=np.uint16)
46
+ lib.model_set_lm_head(m, keep(lm), 151936, 1536)
47
+
48
+ PROJS = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
49
+ "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]
50
+ DIMS = {
51
+ "self_attn_q_proj": (1536, 1536), "self_attn_k_proj": (256, 1536),
52
+ "self_attn_v_proj": (256, 1536), "self_attn_o_proj": (1536, 1536),
53
+ "mlp_gate_proj": (8960, 1536), "mlp_up_proj": (8960, 1536),
54
+ "mlp_down_proj": (1536, 8960),
55
+ }
56
+
57
+ for l in range(28):
58
+ in_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
59
+ po_n = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"), dtype=np.float16).astype(np.float32)
60
+ lib.layer_set_norms(m, l, keep(in_n), keep(po_n))
61
+
62
+ qb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
63
+ kb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
64
+ vb = np.fromfile(os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"), dtype=np.float16).astype(np.float32)
65
+ lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))
66
+
67
+ pa = []
68
+ for p in PROJS:
69
+ base = os.path.join(MODEL_DIR, f"model_layers_{l}_{p}_weight")
70
+ s = np.fromfile(base + ".sign", dtype=np.uint64)
71
+ pl = np.fromfile(base + ".planes", dtype=np.uint64)
72
+ sc = np.fromfile(base + ".scales", dtype=np.float32)
73
+ od, id_ = DIMS[p]
74
+ pa.extend([keep(s), keep(pl), keep(sc), od, id_])
75
+ lib.layer_set_linears(m, l, *pa, N_PLANES)
76
+ if (l + 1) % 7 == 0:
77
+ print(f" Layer {l+1}/28")
78
+
79
+ print("Model loaded!")
80
+
81
+ from transformers import AutoTokenizer
82
+ tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
83
+
84
+ msg = [{"role": "user", "content": "What is 2+2?"}]
85
+ ids = tok.apply_chat_template(msg, add_generation_prompt=True)
86
+ arr = np.array(ids, dtype=np.int32)
87
+ out = np.zeros(30, dtype=np.int32)
88
+
89
+ lib.model_reset_cache(m)
90
+ print(f"Prompt: {len(ids)} tokens, generating 30...")
91
+ t0 = time.time()
92
+ n = lib.generate(m, arr.ctypes.data, len(ids), out.ctypes.data, 30,
93
+ ctypes.c_float(0.6), ctypes.c_float(0.9), tok.eos_token_id)
94
+ dt = time.time() - t0
95
+ text = tok.decode(out[:n].tolist(), skip_special_tokens=False)
96
+ print(f"\n=== {n} tokens, {dt:.1f}s, {n/dt:.1f} tok/s ===")
97
+ print(text)
98
+ print("===")
99
+ lib.model_free(m)
true_unary ADDED
Binary file (29.9 kB). View file
 
true_unary.c ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
3
+ *
4
+ * Representation:
5
+ * A value of magnitude M is stored as M consecutive 1-bits.
6
+ * The number IS the count of ones.
7
+ * Every bit has weight exactly 1.
8
+ *
9
+ * For a vector element quantized to integer range [-K, K]:
10
+ * sign: 1 bit (0=positive, 1=negative)
11
+ * magnitude: K bit positions, first |value| are 1, rest are 0
12
+ *
13
+ * Storage layout for a vector of dim D with max magnitude K:
14
+ * sign: uint64[(D+63)/64] — one sign bit per element
15
+ * unary: uint64[K * (D+63)/64] — K bitplanes across D elements
16
+ * Plane p has bit j set iff |element_j| > p
17
+ * (thermometer = true unary in bitplane form)
18
+ *
19
+ * Multiplication: w * x = popcount of ones(w) matched with ones(x)
20
+ * Since every bit = 1, the dot product is JUST COUNTING.
21
+ * No weights, no shifts, no corrections.
22
+ * sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
23
+ * = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
24
+ *
25
+ * YES this uses more memory. A 2560-dim vector with K=32 uses:
26
+ * 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
27
+ * But the MATH IS EXACT (to quantization level).
28
+ *
29
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
30
+ */
31
+
32
+ #define _POSIX_C_SOURCE 199309L
33
+ #include <immintrin.h>
34
+ #include <omp.h>
35
+ #include <stdint.h>
36
+ #include <stdlib.h>
37
+ #include <string.h>
38
+ #include <math.h>
39
+ #include <stdio.h>
40
+ #include <time.h>
41
+
42
+ /* ============================================================
43
+ * TRUE UNARY VECTOR
44
+ * ============================================================ */
45
typedef struct {
    uint64_t *sign; /* [chunks] — 1 bit per element; bit set => negative */
    uint64_t *unary; /* [K * chunks] — K bitplanes, each bit = weight 1;
                        plane p has element j's bit set iff |v_j| > p
                        (thermometer code, see tuv_from_float) */
    float scale; /* float scale: real_value = sign * count * scale */
    int dim; /* number of logical elements */
    int chunks; /* (dim+63)/64 — 64-bit words per plane */
    int K; /* max magnitude = number of unary bitplanes */
} TrueUnaryVec;
53
+
54
/* TRUE UNARY MATRIX — row-major; same encoding as TrueUnaryVec but with a
 * per-row scale. Plane-major layout keeps each (plane, row) bit string
 * contiguous. */
typedef struct {
    uint64_t *sign; /* [rows * chunks] — one sign bit per weight */
    uint64_t *unary; /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */
    float *scales; /* [rows] — per-row scale factors (row absmax / K) */
    int rows;
    int cols;
    int chunks; /* (cols+63)/64 */
    int K; /* max magnitude per element */
} TrueUnaryMat;
64
+
65
+ /* ============================================================
66
+ * ALLOCATION
67
+ * ============================================================ */
68
+ TrueUnaryVec* tuv_alloc(int dim, int K) {
69
+ TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
70
+ v->dim = dim;
71
+ v->K = K;
72
+ v->chunks = (dim + 63) / 64;
73
+ v->scale = 1.0f;
74
+ v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
75
+ v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t));
76
+ memset(v->sign, 0, v->chunks * sizeof(uint64_t));
77
+ memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t));
78
+ return v;
79
+ }
80
+
81
+ TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
82
+ TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
83
+ m->rows = rows;
84
+ m->cols = cols;
85
+ m->K = K;
86
+ m->chunks = (cols + 63) / 64;
87
+ m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
88
+ m->unary = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
89
+ m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
90
+ memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
91
+ memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
92
+ for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
93
+ return m;
94
+ }
95
+
96
+ void tuv_free(TrueUnaryVec *v) {
97
+ if (v) { free(v->sign); free(v->unary); free(v); }
98
+ }
99
+ void tum_free(TrueUnaryMat *m) {
100
+ if (m) { free(m->sign); free(m->unary); free(m->scales); free(m); }
101
+ }
102
+
103
+ /* ============================================================
104
+ * FLOAT → TRUE UNARY
105
+ *
106
+ * Quantize: integer_val = round(float_val / scale * K)
107
+ * Then store |integer_val| as that many 1-bits.
108
+ *
109
+ * For vector: single global scale = absmax / K
110
+ * For matrix: per-row scale = row_absmax / K
111
+ * ============================================================ */
112
+ void tuv_from_float(TrueUnaryVec *v, const float *x) {
113
+ int dim = v->dim, K = v->K, chunks = v->chunks;
114
+
115
+ memset(v->sign, 0, chunks * sizeof(uint64_t));
116
+ memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));
117
+
118
+ float amax = 0.0f;
119
+ for (int i = 0; i < dim; i++) {
120
+ float a = fabsf(x[i]);
121
+ if (a > amax) amax = a;
122
+ }
123
+ if (amax == 0.0f) { v->scale = 1.0f; return; }
124
+ v->scale = amax / K;
125
+
126
+ float inv = K / amax;
127
+ for (int i = 0; i < dim; i++) {
128
+ int c = i / 64;
129
+ uint64_t bit = 1ULL << (i % 64);
130
+
131
+ if (x[i] < 0.0f) v->sign[c] |= bit;
132
+
133
+ int mag = (int)(fabsf(x[i]) * inv + 0.5f);
134
+ if (mag > K) mag = K;
135
+
136
+ /* TRUE UNARY: set planes 0 through mag-1 */
137
+ for (int p = 0; p < mag; p++)
138
+ v->unary[(size_t)p * chunks + c] |= bit;
139
+ }
140
+ }
141
+
142
+ void tuv_to_float(const TrueUnaryVec *v, float *out) {
143
+ int dim = v->dim, K = v->K, chunks = v->chunks;
144
+
145
+ for (int i = 0; i < dim; i++) {
146
+ int c = i / 64;
147
+ uint64_t bit = 1ULL << (i % 64);
148
+
149
+ /* Count set planes = magnitude in base-1 */
150
+ int mag = 0;
151
+ for (int p = 0; p < K; p++) {
152
+ if (v->unary[(size_t)p * chunks + c] & bit)
153
+ mag++;
154
+ }
155
+
156
+ float val = (float)mag * v->scale;
157
+ out[i] = (v->sign[c] & bit) ? -val : val;
158
+ }
159
+ }
160
+
161
+ void tum_from_float(TrueUnaryMat *m, const float *data) {
162
+ int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;
163
+
164
+ memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
165
+ memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));
166
+
167
+ for (int r = 0; r < rows; r++) {
168
+ const float *row = data + (size_t)r * cols;
169
+
170
+ float amax = 0.0f;
171
+ for (int j = 0; j < cols; j++) {
172
+ float a = fabsf(row[j]);
173
+ if (a > amax) amax = a;
174
+ }
175
+ if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
176
+ m->scales[r] = amax / K;
177
+ float inv = K / amax;
178
+
179
+ uint64_t *row_sign = m->sign + (size_t)r * chunks;
180
+
181
+ for (int j = 0; j < cols; j++) {
182
+ int c = j / 64;
183
+ uint64_t bit = 1ULL << (j % 64);
184
+
185
+ if (row[j] < 0.0f) row_sign[c] |= bit;
186
+
187
+ int mag = (int)(fabsf(row[j]) * inv + 0.5f);
188
+ if (mag > K) mag = K;
189
+
190
+ for (int p = 0; p < mag; p++)
191
+ m->unary[((size_t)p * rows + r) * chunks + c] |= bit;
192
+ }
193
+ }
194
+ }
195
+
196
+ /* ============================================================
197
+ * TRUE UNARY MATVEC: y = M @ x
198
+ *
199
+ * THE CORE OPERATION.
200
+ *
201
+ * For each output element y[i]:
202
+ * For each pair of planes (p from weight, q from activation):
203
+ * active = w_plane_p[i] AND x_plane_q
204
+ * same = active AND ~(w_sign[i] XOR x_sign)
205
+ * diff = active AND (w_sign[i] XOR x_sign)
206
+ * acc += popcount(same) - popcount(diff)
207
+ *
208
+ * EVERY PLANE PAIR HAS WEIGHT = 1.
209
+ * No shifts. No scaling between planes. No corrections.
210
+ * The count IS the answer.
211
+ *
212
+ * y[i] = acc * w_scale[i] * x_scale
213
+ * (single float multiply at the very end)
214
+ *
215
+ * ============================================================ */
216
+ void tum_matvec(
217
+ const TrueUnaryMat *M,
218
+ const TrueUnaryVec *x,
219
+ float *y_out /* float output, requantize externally if needed */
220
+ ) {
221
+ int out_dim = M->rows;
222
+ int chunks = M->chunks;
223
+ int wK = M->K;
224
+ int xK = x->K;
225
+
226
+ #pragma omp parallel for schedule(dynamic, 32)
227
+ for (int i = 0; i < out_dim; i++) {
228
+ const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
229
+ long long acc = 0;
230
+
231
+ for (int c = 0; c < chunks; c++) {
232
+ uint64_t ws = w_sign_row[c];
233
+ uint64_t xs = x->sign[c];
234
+ uint64_t same = ~(ws ^ xs);
235
+ uint64_t diff = ws ^ xs;
236
+
237
+ /*
238
+ * PURE BASE-1: every plane pair contributes weight 1.
239
+ * acc += popcount(w_plane AND x_plane AND same_sign)
240
+ * - popcount(w_plane AND x_plane AND diff_sign)
241
+ */
242
+ for (int p = 0; p < wK; p++) {
243
+ uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c];
244
+
245
+ for (int q = 0; q < xK; q++) {
246
+ uint64_t xq = x->unary[(size_t)q * chunks + c];
247
+ uint64_t active = wp & xq;
248
+ acc += __builtin_popcountll(active & same)
249
+ - __builtin_popcountll(active & diff);
250
+ }
251
+ }
252
+ }
253
+
254
+ /* Single float rescale per output element */
255
+ y_out[i] = (float)acc * M->scales[i] * x->scale;
256
+ }
257
+ }
258
+
259
+ /* ============================================================
260
+ * OPTIMIZED MATVEC: collapse x planes first
261
+ *
262
+ * Instead of iterating wK * xK plane pairs per chunk,
263
+ * precompute per-chunk activation sums:
264
+ * x_mag_same[c] = sum_q popcount(x_plane_q[c] AND same_sign[c])
265
+ * x_mag_diff[c] = sum_q popcount(x_plane_q[c] AND diff_sign[c])
266
+ *
267
+ * Then for each weight plane p:
268
+ * This doesn't directly simplify because we need AND with wp first.
269
+ *
270
+ * ALTERNATIVE: precompute per-element x magnitudes in unary,
271
+ * then the dot product is just: sum_j w_mag_j * x_mag_j * sign_j
272
+ *
273
+ * For now: provide both the naive and a vertically-accumulated variant.
274
+ *
275
+ * VERTICAL ACCUMULATE: sum all weight planes into a per-element
276
+ * count, then multiply by x count. Reduces from O(wK*xK*chunks)
277
+ * to O((wK+xK)*chunks + dim).
278
+ * ============================================================ */
279
+ void tum_matvec_fast(
280
+ const TrueUnaryMat *M,
281
+ const TrueUnaryVec *x,
282
+ float *y_out
283
+ ) {
284
+ int out_dim = M->rows;
285
+ int cols = M->cols;
286
+ int chunks = M->chunks;
287
+ int xK = x->K;
288
+
289
+ /* Step 1: compute x magnitudes (per-element popcount across planes)
290
+ * x_mag[j] = number of x planes where bit j is set
291
+ * This is O(xK * chunks) = O(xK * dim / 64)
292
+ */
293
+ int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
294
+ memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
295
+
296
+ for (int q = 0; q < xK; q++) {
297
+ const uint64_t *xplane = x->unary + (size_t)q * chunks;
298
+ for (int c = 0; c < chunks; c++) {
299
+ uint64_t bits = xplane[c];
300
+ while (bits) {
301
+ int bit = __builtin_ctzll(bits);
302
+ int j = c * 64 + bit;
303
+ if (j < cols) x_mag[j]++;
304
+ bits &= bits - 1;
305
+ }
306
+ }
307
+ }
308
+
309
+ /* Apply sign to x_mag: positive if same sign as...
310
+ * Actually we need signed x_mag relative to each weight row's sign.
311
+ * So we keep x_mag unsigned and handle sign per output element.
312
+ */
313
+
314
+ /* Step 2: for each output row, compute:
315
+ * y[i] = sum_j (w_mag[i][j] * x_mag[j]) * sign_agreement
316
+ *
317
+ * w_mag[i][j] = number of weight planes where bit j is set
318
+ * sign_agreement = +1 if w_sign[j] == x_sign[j], else -1
319
+ *
320
+ * We compute w_mag by vertical popcount across weight planes.
321
+ * This is O(wK * chunks) per row.
322
+ */
323
+
324
+ #pragma omp parallel
325
+ {
326
+ int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
327
+
328
+ #pragma omp for schedule(dynamic, 32)
329
+ for (int i = 0; i < out_dim; i++) {
330
+ memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));
331
+
332
+ /* Vertical popcount: count set planes per element */
333
+ for (int p = 0; p < M->K; p++) {
334
+ const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks;
335
+ for (int c = 0; c < chunks; c++) {
336
+ uint64_t bits = wplane[c];
337
+ while (bits) {
338
+ int bit = __builtin_ctzll(bits);
339
+ int j = c * 64 + bit;
340
+ if (j < cols) w_mag[j]++;
341
+ bits &= bits - 1;
342
+ }
343
+ }
344
+ }
345
+
346
+ /* Dot product with sign */
347
+ const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
348
+ long long acc = 0;
349
+
350
+ for (int j = 0; j < cols; j++) {
351
+ int c = j / 64;
352
+ uint64_t bit = 1ULL << (j % 64);
353
+ int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit);
354
+ int product = (int)w_mag[j] * (int)x_mag[j];
355
+ acc += same_sign ? product : -product;
356
+ }
357
+
358
+ y_out[i] = (float)acc * M->scales[i] * x->scale;
359
+ }
360
+
361
+ free(w_mag);
362
+ }
363
+
364
+ free(x_mag);
365
+ }
366
+
367
+ /* ============================================================
368
+ * BENCHMARK + ACCURACY
369
+ * ============================================================ */
370
/* Accuracy + timing results for one (rows, cols, wK, xK) configuration. */
typedef struct {
    float cosine;       /* cosine similarity vs float32 reference */
    float snr_db;       /* signal-to-noise ratio of the unary result, dB */
    float max_rel_err;  /* worst per-element relative error */
    double ms_naive;    /* avg ms per call, tum_matvec */
    double ms_fast;     /* avg ms per call, tum_matvec_fast */
    double gops_naive;  /* effective GOPS, naive variant */
    double gops_fast;   /* effective GOPS, fast variant */
} TestResult;
379
+
380
/* Build a random rows x cols problem, quantize it with wK weight planes and
 * xK activation planes, then measure accuracy and speed of both matvec
 * variants against a float32 reference.
 * NOTE: the fixed seed plus the exact rand() call order define the test
 * data, so the generation statements must not be reordered. */
TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
    TestResult r = {0};
    srand(42);   /* fixed seed: every configuration sees the same data */

    /* Random float matrix and vector (normal distribution) */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_naive = (float *)malloc(rows * sizeof(float));
    float *y_fast = (float *)malloc(rows * sizeof(float));

    /* Box-Muller transform; rand()+1 keeps u1 > 0 so logf stays finite. */
    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Float reference */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    /* Convert to true unary */
    TrueUnaryMat *M = tum_alloc(rows, cols, wK);
    TrueUnaryVec *x = tuv_alloc(cols, xK);
    tum_from_float(M, Mf);
    tuv_from_float(x, xf);

    /* Naive matvec */
    struct timespec t0, t1;
    tum_matvec(M, x, y_naive); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec(M, x, y_naive);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Fast matvec */
    tum_matvec_fast(M, x, y_fast); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec_fast(M, x, y_fast);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Accuracy vs float reference */
    float dot = 0, na = 0, nb = 0, max_re = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_naive[i];
        na += y_ref[i] * y_ref[i];
        nb += y_naive[i] * y_naive[i];
        float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
        if (re > max_re) max_re = re;
    }
    r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_naive[i]; noise += e * e;
    }
    r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
    r.max_rel_err = max_re;

    /* Verify naive == fast (they should agree up to float rounding) */
    float fast_err = 0;
    for (int i = 0; i < rows; i++) {
        float e = fabsf(y_naive[i] - y_fast[i]);
        if (e > fast_err) fast_err = e;
    }
    if (fast_err > 0.01f)
        printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err);

    /* GOPS = 2*rows*cols ops per call; iters cancels in this formula. */
    double ops = 2.0 * rows * cols;
    r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6);
    r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6);

    tum_free(M); tuv_free(x);
    free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
    return r;
}
464
+
465
+ /* ============================================================
466
+ * MAIN: sweep K values, show accuracy + speed tradeoff
467
+ * ============================================================ */
468
/* Sweep (wK, xK) plane-count configurations on one fixed matrix size and
 * report the accuracy/speed/memory tradeoff, then print sample values for
 * one configuration. */
int main() {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");

    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560) */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);

    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");

    /* Weight-plane / activation-plane pairs to evaluate. */
    struct { int wK; int xK; } configs[] = {
        {8, 4},
        {8, 8},
        {16, 8},
        {16, 16},
        {32, 8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        /* Large plane counts are slow: time those only once. */
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;

        TestResult r = tum_test(rows, cols, wK, xK, iters);

        /* Memory for this layer's weights (sign + planes + scales) */
        size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;

        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }

    /* Show first 10 values for the wK=32 xK=16 case */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);   /* re-seed so this sample is reproducible on its own */
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc(sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc(sr * sizeof(float));

        /* Box-Muller Gaussian samples, same recipe as tum_test. */
        for (size_t i = 0; i < (size_t)sr * sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];

        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);

        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);

        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
unary_convert.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert model weights to UNARY (base-1) thermometer encoding.
4
+
5
+ True unary: magnitude N = N consecutive 1-bits across N bitplanes.
6
+ Each bitplane contributes equally (value=1), NOT binary powers.
7
+
8
+ Weight 0.3 with scale -> magnitude 5 -> planes 0,1,2,3,4 have bit set
9
+ Weight -0.1 with scale -> magnitude 2, sign=neg -> planes 0,1 set + sign bit
10
+
11
+ More precision than ternary (N+1 levels vs 3), still no multiplication.
12
+
13
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
14
+ """
15
+
16
+ import os
17
+ import json
18
+ import numpy as np
19
+ from pathlib import Path
20
+ import time
21
+
22
+
23
def load_safetensors(model_dir):
    """Read every *.safetensors shard under model_dir into float32 numpy arrays."""
    import torch
    from safetensors.torch import load_file

    tensors = {}
    for path in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {path.name}...")
        for name, tensor in load_file(str(path)).items():
            tensors[name] = tensor.float().numpy()
    return tensors
35
+
36
+
37
def quantize_matrix_unary(weight, n_planes=7):
    """Thermometer-quantize a 2-D weight matrix.

    Each row is scaled so its absmax maps to n_planes; a magnitude m is
    stored as m consecutive set bitplanes (plane p is set iff m >= p+1),
    giving n_planes + 1 magnitude levels per sign.

    Returns (sign_bits, mag_planes, scales, sparsity):
        sign_bits  -- [out_dim, chunks] uint64, bit set = negative weight
        mag_planes -- [n_planes, out_dim, chunks] uint64 thermometer planes
        scales     -- [out_dim] float32 per-row dequant scale
        sparsity   -- fraction of weights quantized to magnitude 0
    """
    mat = weight.astype(np.float32)
    n_rows, n_cols = mat.shape
    n_chunks = (n_cols + 63) // 64
    pad = n_chunks * 64 - n_cols

    # Per-row scale: absmax / n_planes (zero rows get a dummy scale of 1).
    absmax = np.max(np.abs(mat), axis=1, keepdims=True)
    absmax = np.where(absmax == 0, 1.0, absmax)
    scales = (absmax.flatten() / n_planes).astype(np.float32)

    # Integer magnitudes in [0, n_planes], sign kept separately.
    mags = np.clip(np.round(np.abs(mat / scales[:, None])).astype(np.int32),
                   0, n_planes)
    neg = mat < 0
    sparsity = np.mean(mags == 0)

    # Pad columns to a multiple of 64 so packing is word-aligned.
    if pad:
        mags = np.concatenate(
            [mags, np.zeros((n_rows, pad), dtype=np.int32)], axis=1)
        neg = np.concatenate(
            [neg, np.zeros((n_rows, pad), dtype=bool)], axis=1)

    lane_bits = np.uint64(1) << np.arange(64, dtype=np.uint64)

    def _pack(mask):
        # Collapse each group of 64 booleans into a single uint64 word.
        grouped = mask.reshape(n_rows, n_chunks, 64).astype(np.uint64)
        return np.bitwise_or.reduce(grouped * lane_bits, axis=2)

    sign_bits = _pack(neg)
    # Thermometer: plane p carries every element with magnitude >= p + 1.
    mag_planes = np.stack([_pack(mags >= p + 1) for p in range(n_planes)])

    return sign_bits, mag_planes, scales, sparsity
86
+
87
+
88
def save_unary_model(tensors, output_dir, n_planes=7):
    """Quantize a whole checkpoint to unary format and write it to output_dir.

    Linear projection weights become .sign/.planes/.scales files (via
    quantize_matrix_unary); every other tensor is dumped as raw fp16.
    A config.json and manifest.json describe the layout.
    """
    os.makedirs(output_dir, exist_ok=True)

    config = {
        "hidden_size": 1536,
        "intermediate_size": 8960,
        "num_attention_heads": 12,
        "num_key_value_heads": 2,
        "num_hidden_layers": 28,
        "vocab_size": 151936,
        "head_dim": 128,
        "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6,
        "n_planes": n_planes,
        "quant_type": "unary",
    }

    # Only the seven linear projections are quantized; everything else
    # (embeddings, norms, biases, lm_head) stays fp16.
    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    unary_keys = [k for k in tensors if any(p in k for p in projections)]
    keep_keys = [k for k in tensors if not any(p in k for p in projections)]

    print(f"\nUnary layers: {len(unary_keys)} (n_planes={n_planes}, levels={n_planes+1})")
    print(f"FP16 layers: {len(keep_keys)}")

    with open(os.path.join(output_dir, "config.json"), "w") as fh:
        json.dump(config, fh, indent=2)

    total_unary_bytes = 0
    total_original_bytes = 0

    for key in unary_keys:
        weight = tensors[key]
        out_dim, in_dim = weight.shape
        total_original_bytes += weight.nbytes

        start = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_matrix_unary(weight, n_planes)
        elapsed = time.time() - start

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")
        scales.tofile(stem + ".scales")

        unary_bytes = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        total_unary_bytes += unary_bytes
        ratio = weight.nbytes / unary_bytes
        # Effective storage cost per original weight.
        bpw = (unary_bytes * 8) / (out_dim * in_dim)

        print(f" {key}: {weight.shape} -> unary ({unary_bytes/1024:.0f}KB, "
              f"{ratio:.1f}x compress, {bpw:.2f} bpw, {sparsity:.1%} sparse, {elapsed:.1f}s)")

    total_fp16_bytes = 0
    for key in keep_keys:
        weight = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        weight.tofile(stem + ".fp16")
        total_fp16_bytes += weight.nbytes
        print(f" {key}: {weight.shape} -> fp16 ({weight.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in unary_keys},
        "fp16": {k: list(tensors[k].shape) for k in keep_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2)

    total_bytes = total_unary_bytes + total_fp16_bytes
    avg_bpw = (total_unary_bytes * 8) / sum(np.prod(tensors[k].shape) for k in unary_keys)

    print(f"\n=== Summary ===")
    print(f"Original FP32 linear weights: {total_original_bytes/1024/1024:.1f} MB")
    print(f"Unary linear weights: {total_unary_bytes/1024/1024:.1f} MB")
    print(f"FP16 other weights: {total_fp16_bytes/1024/1024:.1f} MB")
    print(f"Total model size: {total_bytes/1024/1024:.1f} MB")
    print(f"Average bits per weight (linear): {avg_bpw:.2f}")
    print(f"Compression vs FP32: {(total_original_bytes + total_fp16_bytes)/total_bytes:.1f}x")
    print(f"Precision levels: {n_planes + 1} (vs ternary=3, INT4=16)")
176
+
177
+
178
if __name__ == "__main__":
    import sys

    # CLI: [model_dir] [output_dir] [n_planes]
    args = sys.argv
    model_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = args[2] if len(args) > 2 else "deepseek-r1-1.5b-unary"
    n_planes = int(args[3]) if len(args) > 3 else 7

    print(f"Loading model from {model_dir}...")
    tensors = load_safetensors(model_dir)

    print(f"Converting to unary (n_planes={n_planes})...")
    save_unary_model(tensors, output_dir, n_planes)
    print("Done!")
unary_convert_v2.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Pure Unary Converter - interleaved plane layout [out_dim][chunks][n_planes]
4
+ for cache-friendly access in the kernel.
5
+
6
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
7
+ """
8
+
9
+ import os, json, sys, time
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+
14
def load_safetensors(model_dir):
    """Collect all tensors from model_dir's safetensors shards as float32 arrays."""
    import torch
    from safetensors.torch import load_file

    loaded = {}
    for shard in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {shard.name}...")
        loaded.update(
            (key, value.float().numpy())
            for key, value in load_file(str(shard)).items()
        )
    return loaded
23
+
24
+
25
def quantize_unary_interleaved(weight, n_planes):
    """Thermometer-quantize a matrix into the interleaved plane layout.

    Same encoding as the plane-major converter, but magnitude planes are
    packed as [out_dim][chunks][n_planes] so one output row's words are
    contiguous for the kernel.

    Returns (sign_bits, mag_planes, scales, sparsity).
    """
    mat = weight.astype(np.float32)
    n_rows, n_cols = mat.shape
    n_chunks = (n_cols + 63) // 64
    pad = n_chunks * 64 - n_cols

    # Per-row scale = absmax / n_planes; zero rows get a dummy scale.
    absmax = np.max(np.abs(mat), axis=1, keepdims=True)
    absmax = np.where(absmax == 0, 1.0, absmax)
    scales = (absmax.flatten() / n_planes).astype(np.float32)

    mags = np.clip(np.round(np.abs(mat / scales[:, None])).astype(np.int32),
                   0, n_planes)
    neg = mat < 0
    sparsity = np.mean(mags == 0)

    if pad:
        mags = np.concatenate(
            [mags, np.zeros((n_rows, pad), dtype=np.int32)], axis=1)
        neg = np.concatenate(
            [neg, np.zeros((n_rows, pad), dtype=bool)], axis=1)

    lane_bits = np.uint64(1) << np.arange(64, dtype=np.uint64)

    def _pack(mask):
        # 64 booleans -> one uint64 word, per row and chunk.
        grouped = mask.reshape(n_rows, n_chunks, 64).astype(np.uint64)
        return np.bitwise_or.reduce(grouped * lane_bits, axis=2)

    sign_bits = _pack(neg)  # [out_dim][chunks]

    # Magnitude planes INTERLEAVED: [out_dim][chunks][n_planes].
    mag_planes = np.zeros((n_rows, n_chunks, n_planes), dtype=np.uint64)
    for plane in range(n_planes):
        mag_planes[:, :, plane] = _pack(mags >= plane + 1)

    return sign_bits, mag_planes, scales, sparsity
60
+
61
+
62
def convert(model_dir, output_dir, n_planes):
    """Quantize every linear projection of the checkpoint to interleaved
    unary and write the result (plus fp16 leftovers and metadata) to
    output_dir."""
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0,
        "rms_norm_eps": 1e-6, "n_planes": n_planes,
        "quant_type": "unary_interleaved",
    }

    projections = ('q_proj.weight', 'k_proj.weight', 'v_proj.weight',
                   'o_proj.weight', 'gate_proj.weight', 'up_proj.weight',
                   'down_proj.weight')
    linear_keys = [k for k in tensors if any(p in k for p in projections)]
    other_keys = [k for k in tensors if k not in linear_keys]

    print(f"\nUnary: {len(linear_keys)} layers, {n_planes} planes ({2*n_planes+1} levels)")
    print(f"FP16: {len(other_keys)} layers\n")

    with open(os.path.join(output_dir, "config.json"), "w") as fh:
        json.dump(config, fh, indent=2)

    total_unary = total_orig = total_fp16 = 0

    for key in linear_keys:
        weight = tensors[key]
        total_orig += weight.nbytes
        start = time.time()
        sign_bits, mag_planes, scales, sparsity = quantize_unary_interleaved(weight, n_planes)
        elapsed = time.time() - start

        stem = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(stem + ".sign")
        mag_planes.tofile(stem + ".planes")  # [out_dim][chunks][n_planes] contiguous
        scales.tofile(stem + ".scales")

        nbytes = sign_bits.nbytes + mag_planes.nbytes + scales.nbytes
        total_unary += nbytes
        bpw = (nbytes * 8) / (weight.shape[0] * weight.shape[1])
        print(f" {key}: {weight.shape} -> {nbytes/1024:.0f}KB ({bpw:.1f}bpw, {sparsity:.0%} sparse, {elapsed:.1f}s)")

    for key in other_keys:
        weight = tensors[key].astype(np.float16)
        stem = os.path.join(output_dir, key.replace(".", "_"))
        weight.tofile(stem + ".fp16")
        total_fp16 += weight.nbytes
        print(f" {key}: {weight.shape} -> fp16 ({weight.nbytes/1024:.0f}KB)")

    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as fh:
        json.dump(manifest, fh, indent=2)

    total = total_unary + total_fp16
    avg_bpw = (total_unary * 8) / sum(np.prod(tensors[k].shape) for k in linear_keys)
    print(f"\n=== Summary ===")
    print(f"Unary weights: {total_unary/1024/1024:.1f} MB ({avg_bpw:.1f} avg bpw)")
    print(f"FP16 weights: {total_fp16/1024/1024:.1f} MB")
    print(f"Total: {total/1024/1024:.1f} MB")
    print(f"Planes: {n_planes}, Levels: {2*n_planes+1}")
    print(f"Layout: interleaved [out_dim][chunks][n_planes]")
    print("Done!")
128
+
129
+
130
if __name__ == "__main__":
    # CLI: [model_dir] [output_dir] [n_planes]
    args = sys.argv
    model_dir = args[1] if len(args) > 1 else "deepseek-r1-1.5b-hf"
    output_dir = args[2] if len(args) > 2 else "deepseek-r1-1.5b-unary31"
    n_planes = int(args[3]) if len(args) > 3 else 31
    convert(model_dir, output_dir, n_planes)
unary_engine.c ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * PURE UNARY (BASE-1) TRANSFORMER ENGINE
3
+ * AVX-512 + OpenMP. Full Qwen2 forward pass in C.
4
+ *
5
+ * Thermometer encoding: magnitude M = M planes set.
6
+ * Each plane contributes EXACTLY 1. No powers. No binary.
7
+ * 7 planes = 8 levels {0,1,2,3,4,5,6,7} * sign.
8
+ *
9
+ * Model format on disk (from unary_convert.py):
10
+ * .sign = [out_dim * chunks] uint64 (1=negative)
11
+ * .planes = [n_planes * out_dim * chunks] uint64 (thermometer)
12
+ * .scales = [out_dim] float32 (per-row)
13
+ *
14
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
15
+ */
16
+
17
+ #include <immintrin.h>
18
+ #include <stdint.h>
19
+ #include <stdlib.h>
20
+ #include <string.h>
21
+ #include <math.h>
22
+ #include <stdio.h>
23
+ #include <time.h>
24
+ #include <omp.h>
25
+
26
/* Architecture constants, hard-coded for one Qwen2-family checkpoint
 * (matches the converter's config: hidden 1536, 28 layers, GQA 12/2). */
#define HIDDEN 1536                       /* hidden (embedding) width */
#define INTER 8960                        /* MLP intermediate width */
#define N_HEADS 12                        /* query heads */
#define N_KV_HEADS 2                      /* key/value heads (GQA) */
#define HEAD_DIM 128
#define N_LAYERS 28
#define VOCAB 151936
#define RMS_EPS 1e-6f                     /* RMSNorm epsilon */
#define ROPE_THETA 1000000.0f             /* RoPE base frequency */
#define MAX_SEQ 4096                      /* max cached sequence length */
#define GQA_RATIO (N_HEADS / N_KV_HEADS)  /* query heads per KV head */

/* Unary-quantized linear layer: thermometer bitplanes + per-row scale. */
typedef struct {
    uint64_t *sign_bits; /* [out_dim * chunks], bit set = negative weight */
    uint64_t *mag_planes; /* [n_planes * out_dim * chunks], thermometer */
    float *scales; /* [out_dim] per-row dequant scale */
    float *bias; /* [out_dim] or NULL */
    int out_dim, in_dim, n_planes;
} UL; /* Unary Linear */

/* Dense FP16 linear layer (used for lm_head, see fmv below). */
typedef struct { uint16_t *w; int od, id; } FL; /* FP16 Linear */

/* One transformer layer's weights. */
typedef struct {
    UL qp, kp, vp, op, gp, up, dp;  /* q/k/v/o + gate/up/down projections */
    float *in_norm, *pn_norm;       /* input / post-attn RMSNorm weights */
    float *qb, *kb, *vb;            /* QKV biases */
} Lay;

/* Whole model: weights plus pre-allocated runtime scratch buffers. */
typedef struct {
    uint16_t *emb;        /* token embedding table (fp16) */
    Lay lay[N_LAYERS];
    float *fnorm;         /* final RMSNorm weight */
    FL lmh;               /* lm_head */
    float *kc, *vc; /* KV cache */
    float *h, *h2; /* hidden states */
    float *sq, *sk, *sv; /* QKV scratch */
    float *ao; /* attn output */
    float *sg, *su, *sd; /* MLP scratch */
    float *lg; /* logits */
    float *as; /* attn scores */
    int np;               /* plane count of the loaded checkpoint — confirm vs loader */
} M;
69
+ /* ============================================================
70
+ * PURE UNARY MATVEC
71
+ *
72
+ * y[i] = scales[i] * SUM over planes p:
73
+ * SUM over j where plane_p bit j is set:
74
+ * sign[j]==0 ? +x[j] : -x[j]
75
+ *
76
+ * Each plane contributes 1. Seven planes, seven passes.
77
+ * Embarrassingly parallel over output rows.
78
+ * ============================================================ */
79
/* Unary matvec: y = W @ x for one UL layer.
 * Per output row, each thermometer plane adds sum(x[j]) over its
 * positive-weight set bits minus sum(x[j]) over its negative ones; plane
 * totals are summed with equal weight (pure base-1) and rescaled by the
 * per-row float scale.  AVX-512 uses the plane bitmask directly as a lane
 * mask, 16 floats at a time; rows are processed in parallel. */
static void umv(const UL *L, const float *x, float *y) {
    const int od = L->out_dim, id = L->in_dim, np = L->n_planes;
    const int ch = (id + 63) / 64;    /* 64-bit words per row */
    const int idp = (id + 15) & ~15;  /* input length padded to 16 floats */

    /* Copy x into an aligned, zero-padded buffer so aligned 512-bit loads
     * never read uninitialized data past the logical end. */
    float *xp = (float*)aligned_alloc(64, idp * sizeof(float));
    memcpy(xp, x, id * sizeof(float));
    if (idp > id) memset(xp + id, 0, (idp - id) * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < od; i++) {
        const uint64_t *rs = L->sign_bits + (size_t)i * ch;  /* row sign bits */
        float tot = 0.0f;

        for (int p = 0; p < np; p++) {
            const uint64_t *pr = L->mag_planes + ((size_t)p * od + i) * ch;
            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < ch; c++) {
                uint64_t mb = pr[c], sb = rs[c];
                uint64_t pos = mb & ~sb;  /* set bits with positive weight */
                uint64_t neg = mb & sb;   /* set bits with negative weight */

                /* Four 16-lane groups cover one 64-bit word. */
                for (int g = 0; g < 4; g++) {
                    int off = c * 64 + g * 16;
                    if (off >= idp) break;  /* beyond padded input: no data */
                    __m512 xv = _mm512_load_ps(xp + off);
                    __mmask16 pm = (__mmask16)((pos >> (g*16)) & 0xFFFF);
                    __mmask16 nm = (__mmask16)((neg >> (g*16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pm, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nm, acc, xv);
                }
            }
            /* PURE UNARY: each plane worth exactly 1 */
            tot += _mm512_reduce_add_ps(acc);
        }
        y[i] = tot * L->scales[i];
        if (L->bias) y[i] += L->bias[i];
    }
    free(xp);
}
120
+
121
/* FP16 matvec (lm_head only): y = W @ x, converting fp16 weights to fp32
 * on the fly, 16 per iteration, with a scalar tail for id % 16. */
static void fmv(const FL *L, const float *x, float *y) {
    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < L->od; i++) {
        __m512 acc = _mm512_setzero_ps();
        const uint16_t *row = L->w + (size_t)i * L->id;
        int j;
        for (j = 0; j + 16 <= L->id; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(row + j));
            acc = _mm512_fmadd_ps(_mm512_cvtph_ps(h), _mm512_loadu_ps(x + j), acc);
        }
        float s = _mm512_reduce_add_ps(acc);
        /* Scalar tail: convert one fp16 weight at a time. */
        for (; j < L->id; j++) {
            float wf; _mm_store_ss(&wf, _mm_cvtph_ps(_mm_set1_epi16(row[j])));
            s += wf * x[j];
        }
        y[i] = s;
    }
}
140
+
141
+ /* RMSNorm */
142
+ static void rn(const float *x, const float *w, float *y, int d) {
143
+ __m512 sq = _mm512_setzero_ps();
144
+ int i;
145
+ for (i = 0; i+16 <= d; i += 16) {
146
+ __m512 v = _mm512_loadu_ps(x+i);
147
+ sq = _mm512_fmadd_ps(v, v, sq);
148
+ }
149
+ float ss = _mm512_reduce_add_ps(sq);
150
+ for (; i < d; i++) ss += x[i]*x[i];
151
+ float r = 1.0f / sqrtf(ss/d + RMS_EPS);
152
+ __m512 rv = _mm512_set1_ps(r);
153
+ for (i = 0; i+16 <= d; i += 16)
154
+ _mm512_storeu_ps(y+i, _mm512_mul_ps(_mm512_mul_ps(
155
+ _mm512_loadu_ps(x+i), rv), _mm512_loadu_ps(w+i)));
156
+ for (; i < d; i++) y[i] = x[i]*r*w[i];
157
+ }
158
+
159
+ static void silu(float *x, int n) {
160
+ for (int i = 0; i < n; i++) x[i] /= (1.0f + expf(-x[i]));
161
+ }
162
+
163
/* Elementwise product c = a * b (16-wide main loop, scalar tail). */
static void emul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
169
+
170
/* In-place vector accumulate: y += x. */
static void va(float *y, const float *x, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(y + k,
            _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k)));
    while (k < n) { y[k] += x[k]; k++; }
}
176
+
177
+ static void rope(float *v, int pos, int d) {
178
+ for (int i = 0; i < d; i += 2) {
179
+ float f = 1.0f / powf(ROPE_THETA, (float)i/d);
180
+ float a = pos*f, co = cosf(a), si = sinf(a);
181
+ float v0 = v[i], v1 = v[i+1];
182
+ v[i] = v0*co - v1*si; v[i+1] = v0*si + v1*co;
183
+ }
184
+ }
185
+
186
+ static void sm(float *x, int n) {
187
+ float mx = x[0];
188
+ for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i];
189
+ float s = 0;
190
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i]-mx); s += x[i]; }
191
+ float iv = 1.0f/s;
192
+ for (int i = 0; i < n; i++) x[i] *= iv;
193
+ }
194
+
195
+ static void etok(const M *m, int t, float *o) {
196
+ const uint16_t *r = m->emb + (size_t)t * HIDDEN;
197
+ int i;
198
+ for (i = 0; i+16 <= HIDDEN; i += 16)
199
+ _mm512_storeu_ps(o+i, _mm512_cvtph_ps(_mm256_loadu_si256((__m256i*)(r+i))));
200
+ for (; i < HIDDEN; i++) _mm_store_ss(o+i, _mm_cvtph_ps(_mm_set1_epi16(r[i])));
201
+ }
202
+
203
+ static float* kvp(float *c, int l, int p, int h) {
204
+ return c + ((size_t)l*MAX_SEQ*N_KV_HEADS + (size_t)p*N_KV_HEADS + h)*HEAD_DIM;
205
+ }
206
+
207
/* Single-position causal self-attention for layer l at position pos.
 * Reads the pre-normed hidden state from m->h2 and writes the o_proj
 * output back into m->h2; the residual add happens in forward_token.
 * Appends this position's K/V to the cache and attends over 0..pos. */
static void do_attn(M *m, int l, int pos) {
    Lay *ly = &m->lay[l];
    /* Q/K/V projections via the unary bitplane matvec. */
    umv(&ly->qp, m->h2, m->sq);
    umv(&ly->kp, m->h2, m->sk);
    umv(&ly->vp, m->h2, m->sv);
    /* Optional additive attention biases (pointers may be NULL). */
    if (ly->qb) va(m->sq, ly->qb, N_HEADS*HEAD_DIM);
    if (ly->kb) va(m->sk, ly->kb, N_KV_HEADS*HEAD_DIM);
    if (ly->vb) va(m->sv, ly->vb, N_KV_HEADS*HEAD_DIM);
    /* Rotary embedding on every Q head and every KV head. */
    for (int h = 0; h < N_HEADS; h++) rope(m->sq + h*HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++) rope(m->sk + h*HEAD_DIM, pos, HEAD_DIM);
    /* Store this position's K/V into the cache. */
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kvp(m->kc,l,pos,h), m->sk+h*HEAD_DIM, HEAD_DIM*4);
        memcpy(kvp(m->vc,l,pos,h), m->sv+h*HEAD_DIM, HEAD_DIM*4);
    }
    float sc = 1.0f/sqrtf((float)HEAD_DIM);   /* 1/sqrt(d) score scaling */
    memset(m->ao, 0, N_HEADS*HEAD_DIM*4);
    for (int h = 0; h < N_HEADS; h++) {
        /* GQA: GQA_RATIO query heads share one KV head. */
        int kvh = h / GQA_RATIO;
        float *qh = m->sq + h*HEAD_DIM, *oh = m->ao + h*HEAD_DIM;
        /* Scores: q . k_t for every cached position t <= pos. */
        for (int t = 0; t <= pos; t++) {
            float *kk = kvp(m->kc,l,t,kvh);
            __m512 a = _mm512_setzero_ps();
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                a = _mm512_fmadd_ps(_mm512_loadu_ps(qh+d), _mm512_loadu_ps(kk+d), a);
            float dot = _mm512_reduce_add_ps(a);
            for (; d < HEAD_DIM; d++) dot += qh[d]*kk[d];
            m->as[t] = dot * sc;
        }
        sm(m->as, pos+1);
        /* Weighted sum of cached V vectors; near-zero weights skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->as[t];
            if (w < 1e-8f) continue;
            float *vv = kvp(m->vc,l,t,kvh);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d+16 <= HEAD_DIM; d += 16)
                _mm512_storeu_ps(oh+d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vv+d), _mm512_loadu_ps(oh+d)));
            for (; d < HEAD_DIM; d++) oh[d] += w*vv[d];
        }
    }
    /* Output projection back to the model width. */
    umv(&ly->op, m->ao, m->h2);
}
250
+
251
+ static void do_mlp(M *m, int l) {
252
+ Lay *ly = &m->lay[l];
253
+ umv(&ly->gp, m->h2, m->sg);
254
+ umv(&ly->up, m->h2, m->su);
255
+ silu(m->sg, INTER);
256
+ emul(m->sg, m->su, m->sd, INTER);
257
+ umv(&ly->dp, m->sd, m->h2);
258
+ }
259
+
260
+ float* forward_token(M *m, int tid, int pos) {
261
+ etok(m, tid, m->h);
262
+ for (int l = 0; l < N_LAYERS; l++) {
263
+ rn(m->h, m->lay[l].in_norm, m->h2, HIDDEN);
264
+ do_attn(m, l, pos);
265
+ va(m->h, m->h2, HIDDEN);
266
+ rn(m->h, m->lay[l].pn_norm, m->h2, HIDDEN);
267
+ do_mlp(m, l);
268
+ va(m->h, m->h2, HIDDEN);
269
+ }
270
+ rn(m->h, m->fnorm, m->h2, HIDDEN);
271
+ fmv(&m->lmh, m->h2, m->lg);
272
+ return m->lg;
273
+ }
274
+
275
/* Temperature + top-p (nucleus) sampling, candidate set capped at 50.
 * Destroys lg: it is scaled by 1/T and softmaxed in place.
 * Returns the sampled token id. */
static int samp(float *lg, int V, float T, float tp) {
    if (T > 0) {
        float inv_t = 1.0f / T;
        for (int k = 0; k < V; k++) lg[k] *= inv_t;
    }
    sm(lg, V);

    float *p = (float*)malloc(V*4);
    int *idx = (int*)malloc(V*4);
    memcpy(p, lg, V*4);
    for (int k = 0; k < V; k++) idx[k] = k;

    /* Partial selection sort: pull the largest remaining prob into
     * slot nk until the kept mass reaches tp (or the 50-token cap). */
    int nk = 0;
    float cum = 0.0f;
    while (cum < tp && nk < V && nk < 50) {
        int best = nk;
        for (int k = nk + 1; k < V; k++)
            if (p[k] > p[best]) best = k;
        float tf = p[nk]; p[nk] = p[best]; p[best] = tf;
        int ti = idx[nk]; idx[nk] = idx[best]; idx[best] = ti;
        cum += p[nk];
        nk++;
    }

    /* Draw proportionally from the truncated mass. */
    float mass = 0.0f;
    for (int k = 0; k < nk; k++) mass += p[k];
    float r = (float)rand()/RAND_MAX * mass;
    int pick = idx[0];
    float acc = 0.0f;
    for (int k = 0; k < nk; k++) {
        acc += p[k];
        if (acc >= r) { pick = idx[k]; break; }
    }
    free(p); free(idx);
    return pick;
}
296
+
297
+ int generate(M *m, const int *pr, int pl, int *out, int mx,
298
+ float T, float tp, int eos) {
299
+ srand(time(NULL));
300
+ for (int i = 0; i < pl; i++) forward_token(m, pr[i], i);
301
+ int pos = pl, gen = 0;
302
+ for (int t = 0; t < mx; t++) {
303
+ int nx;
304
+ if (T <= 0) {
305
+ nx = 0;
306
+ for (int i = 1; i < VOCAB; i++) if (m->lg[i] > m->lg[nx]) nx = i;
307
+ } else {
308
+ nx = samp(m->lg, VOCAB, T, tp);
309
+ }
310
+ out[t] = nx; gen++;
311
+ if (nx == eos) break;
312
+ forward_token(m, nx, pos); pos++;
313
+ }
314
+ return gen;
315
+ }
316
+
317
+ /* ============================================================
318
+ * ALLOCATION + WEIGHT SETTERS (called from Python)
319
+ * ============================================================ */
320
+ M* model_alloc(int np) {
321
+ M *m = (M*)calloc(1, sizeof(M));
322
+ m->np = np;
323
+ size_t kv = (size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
324
+ m->kc = (float*)calloc(kv,4); m->vc = (float*)calloc(kv,4);
325
+ m->h = (float*)aligned_alloc(64,HIDDEN*4);
326
+ m->h2 = (float*)aligned_alloc(64,HIDDEN*4);
327
+ m->sq = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
328
+ m->sk = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
329
+ m->sv = (float*)aligned_alloc(64,N_KV_HEADS*HEAD_DIM*4);
330
+ m->ao = (float*)aligned_alloc(64,N_HEADS*HEAD_DIM*4);
331
+ m->sg = (float*)aligned_alloc(64,INTER*4);
332
+ m->su = (float*)aligned_alloc(64,INTER*4);
333
+ m->sd = (float*)aligned_alloc(64,INTER*4);
334
+ m->lg = (float*)aligned_alloc(64,VOCAB*4);
335
+ m->as = (float*)aligned_alloc(64,MAX_SEQ*4);
336
+ m->fnorm = (float*)aligned_alloc(64,HIDDEN*4);
337
+ printf("Alloc: KV=%zuMB np=%d\n", kv*2*4/1024/1024, np);
338
+ return m;
339
+ }
340
+
341
+ void model_set_embed(M *m, uint16_t *d) { m->emb = d; }
342
+ void model_set_final_norm(M *m, float *d) { memcpy(m->fnorm, d, HIDDEN*4); }
343
+ void model_set_lm_head(M *m, uint16_t *d, int o, int i) {
344
+ m->lmh.w = d; m->lmh.od = o; m->lmh.id = i;
345
+ }
346
+ void layer_set_norms(M *m, int l, float *i, float *p) {
347
+ m->lay[l].in_norm = i; m->lay[l].pn_norm = p;
348
+ }
349
+ void layer_set_bias(M *m, int l, float *q, float *k, float *v) {
350
+ m->lay[l].qb = q; m->lay[l].kb = k; m->lay[l].vb = v;
351
+ }
352
+ static void set_ul(UL *u, uint64_t *s, uint64_t *p, float *sc, int o, int i, int np) {
353
+ u->sign_bits=s; u->mag_planes=p; u->scales=sc;
354
+ u->out_dim=o; u->in_dim=i; u->n_planes=np; u->bias=NULL;
355
+ }
356
+ void layer_set_linears(M *m, int l,
357
+ uint64_t*qs,uint64_t*qp,float*qc,int qo,int qi,
358
+ uint64_t*ks,uint64_t*kp,float*kc,int ko,int ki,
359
+ uint64_t*vs,uint64_t*vp,float*vc,int vo,int vi,
360
+ uint64_t*os,uint64_t*op,float*oc,int oo,int oi,
361
+ uint64_t*gs,uint64_t*gp,float*gc,int go,int gi,
362
+ uint64_t*us,uint64_t*up,float*uc,int uo,int ui,
363
+ uint64_t*ds,uint64_t*dp,float*dc,int doo,int di, int np) {
364
+ set_ul(&m->lay[l].qp,qs,qp,qc,qo,qi,np);
365
+ set_ul(&m->lay[l].kp,ks,kp,kc,ko,ki,np);
366
+ set_ul(&m->lay[l].vp,vs,vp,vc,vo,vi,np);
367
+ set_ul(&m->lay[l].op,os,op,oc,oo,oi,np);
368
+ set_ul(&m->lay[l].gp,gs,gp,gc,go,gi,np);
369
+ set_ul(&m->lay[l].up,us,up,uc,uo,ui,np);
370
+ set_ul(&m->lay[l].dp,ds,dp,dc,doo,di,np);
371
+ }
372
+ void model_reset_cache(M *m) {
373
+ size_t kv=(size_t)N_LAYERS*MAX_SEQ*N_KV_HEADS*HEAD_DIM;
374
+ memset(m->kc,0,kv*4); memset(m->vc,0,kv*4);
375
+ }
376
+ void model_free(M *m) {
377
+ free(m->kc);free(m->vc);free(m->h);free(m->h2);
378
+ free(m->sq);free(m->sk);free(m->sv);free(m->ao);
379
+ free(m->sg);free(m->su);free(m->sd);
380
+ free(m->lg);free(m->as);free(m->fnorm);free(m);
381
+ }
unary_engine_v2.c ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * UNARY TRANSFORMER ENGINE v2 - Configurable dimensions
3
+ *
4
+ * Full Qwen2/Qwen3 forward pass in C with AVX-512 + OpenMP.
5
+ * Supports any model size via runtime config.
6
+ *
7
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ */
9
+
10
+ #include <immintrin.h>
11
+ #include <omp.h>
12
+ #include <stdint.h>
13
+ #include <stdlib.h>
14
+ #include <string.h>
15
+ #include <math.h>
16
+ #include <stdio.h>
17
+ #include <time.h>
18
+
19
#define MAX_SEQ 4096      /* maximum cached sequence length (sizes the KV cache) */
#define RMS_EPS 1e-6f     /* epsilon inside RMSNorm's rsqrt */

/* ============================================================
 * Config - set at init time
 * ============================================================ */
typedef struct {
    int hidden;          /* model width (embedding dimension) */
    int inter;           /* MLP intermediate width */
    int n_heads;         /* number of query heads */
    int n_kv_heads;      /* number of key/value heads (GQA when < n_heads) */
    int head_dim;        /* per-head dimension */
    int n_layers;        /* decoder layer count */
    int vocab;           /* vocabulary size (logits length) */
    float rope_theta;    /* RoPE frequency base passed to apply_rope */
    int has_attn_bias; /* 1 for Qwen2 (1.5B), 0 for Qwen3 (4B) */
    int tie_embeddings; /* 1 if lm_head shares embed weights */
} Config;

/* ============================================================
 * Unary linear layer
 * ============================================================ */
typedef struct {
    uint64_t *sign_bits;   /* per-row sign bitmap, [out_dim][in_dim/64] chunks */
    uint64_t *mag_planes;  /* magnitude bitplanes, plane-major: [(p*out_dim+i)*chunks] */
    float *scales;         /* one output scale per row */
    float *bias;           /* optional per-row bias, NULL if absent */
    int out_dim;
    int in_dim;
    int n_planes;          /* number of magnitude bitplanes */
} UnaryLinear;

/* FP16 linear (for lm_head when not tied) */
typedef struct {
    uint16_t *weight;      /* row-major [out_dim][in_dim] IEEE half floats */
    int out_dim;
    int in_dim;
} FP16Linear;

/* ============================================================
 * Transformer layer
 * ============================================================ */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;   /* attention projections */
    UnaryLinear gate_proj, up_proj, down_proj;    /* SwiGLU MLP projections */
    float *input_norm;     /* pre-attention RMSNorm weights */
    float *post_norm;      /* pre-MLP RMSNorm weights */
    float *q_bias, *k_bias, *v_bias;  /* Qwen2-style attention biases, may be NULL */
    float *q_norm, *k_norm; /* QK-Norm (Qwen3) */
} Layer;

/* ============================================================
 * Full model
 * ============================================================ */
typedef struct {
    Config cfg;

    uint16_t *embed; /* FP16 embeddings */
    Layer *layers; /* Dynamic array */
    float *final_norm;     /* RMSNorm weights applied before the LM head */
    FP16Linear lm_head; /* Only used if !tie_embeddings */

    /* KV cache: [n_layers][MAX_SEQ][n_kv_heads][head_dim] floats */
    float *k_cache;
    float *v_cache;

    /* Scratch buffers (one token's worth each) */
    float *hidden;      /* residual stream */
    float *hidden2;     /* normed input / sub-block output */
    float *q;           /* query heads */
    float *k;           /* key heads */
    float *v;           /* value heads */
    float *attn_out;    /* concatenated attention head outputs */
    float *gate;        /* MLP gate activations */
    float *up;          /* MLP up activations */
    float *down_in;     /* silu(gate) * up */
    float *logits;      /* vocab-sized output logits */
    float *attn_scores; /* per-position attention scores */

    int n_planes;       /* magnitude bitplane count used by the linears */
} Model;
100
+
101
+ /* ============================================================
102
+ * AVX-512 Unary matvec: y = W @ x
103
+ * ============================================================ */
104
+ static void unary_matvec(
105
+ const UnaryLinear *layer, const float *x, float *y
106
+ ) {
107
+ int out_dim = layer->out_dim;
108
+ int in_dim = layer->in_dim;
109
+ int n_planes = layer->n_planes;
110
+ int chunks = (in_dim + 63) / 64;
111
+ int in_padded = (in_dim + 15) & ~15;
112
+
113
+ #pragma omp parallel for schedule(dynamic, 64)
114
+ for (int i = 0; i < out_dim; i++) {
115
+ const uint64_t *row_sign = layer->sign_bits + (size_t)i * chunks;
116
+ float total = 0.0f;
117
+
118
+ /* Aligned local copy of input for this thread */
119
+ float x_local[in_padded] __attribute__((aligned(64)));
120
+ memcpy(x_local, x, in_dim * sizeof(float));
121
+ if (in_padded > in_dim)
122
+ memset(x_local + in_dim, 0, (in_padded - in_dim) * sizeof(float));
123
+
124
+ for (int p = 0; p < n_planes; p++) {
125
+ const uint64_t *plane_row = layer->mag_planes +
126
+ ((size_t)p * out_dim + i) * chunks;
127
+ __m512 acc = _mm512_setzero_ps();
128
+
129
+ for (int c = 0; c < chunks; c++) {
130
+ uint64_t mbits = plane_row[c];
131
+ uint64_t sbits = row_sign[c];
132
+ uint64_t pos_bits = mbits & ~sbits;
133
+ uint64_t neg_bits = mbits & sbits;
134
+
135
+ for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
136
+ int offset = c * 64 + g * 16;
137
+ __m512 xv = _mm512_load_ps(x_local + offset);
138
+ __mmask16 pmask = (__mmask16)((pos_bits >> (g * 16)) & 0xFFFF);
139
+ __mmask16 nmask = (__mmask16)((neg_bits >> (g * 16)) & 0xFFFF);
140
+ acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
141
+ acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
142
+ }
143
+ }
144
+ total += _mm512_reduce_add_ps(acc);
145
+ }
146
+ y[i] = total * layer->scales[i];
147
+ if (layer->bias) y[i] += layer->bias[i];
148
+ }
149
+ }
150
+
151
+ /* FP16 matvec for lm_head */
152
+ static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
153
+ int out_dim = layer->out_dim;
154
+ int in_dim = layer->in_dim;
155
+ const uint16_t *w = layer->weight;
156
+
157
+ #pragma omp parallel for schedule(dynamic, 256)
158
+ for (int i = 0; i < out_dim; i++) {
159
+ __m512 acc = _mm512_setzero_ps();
160
+ int j;
161
+ for (j = 0; j + 16 <= in_dim; j += 16) {
162
+ __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
163
+ __m512 wv = _mm512_cvtph_ps(h);
164
+ __m512 xv = _mm512_loadu_ps(x + j);
165
+ acc = _mm512_fmadd_ps(wv, xv, acc);
166
+ }
167
+ float sum = _mm512_reduce_add_ps(acc);
168
+ for (; j < in_dim; j++) {
169
+ __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
170
+ __m128 fv = _mm_cvtph_ps(hv);
171
+ float wf;
172
+ _mm_store_ss(&wf, fv);
173
+ sum += wf * x[j];
174
+ }
175
+ y[i] = sum;
176
+ }
177
+ }
178
+
179
+ /* ============================================================
180
+ * Basic ops - all AVX-512 vectorized
181
+ * ============================================================ */
182
+
183
+ static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
184
+ __m512 sum_sq = _mm512_setzero_ps();
185
+ int i;
186
+ for (i = 0; i + 16 <= dim; i += 16) {
187
+ __m512 xv = _mm512_loadu_ps(x + i);
188
+ sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
189
+ }
190
+ float ss = _mm512_reduce_add_ps(sum_sq);
191
+ for (; i < dim; i++) ss += x[i] * x[i];
192
+ float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
193
+
194
+ for (i = 0; i + 16 <= dim; i += 16) {
195
+ __m512 xv = _mm512_loadu_ps(x + i);
196
+ __m512 wv = _mm512_loadu_ps(weight + i);
197
+ __m512 rv = _mm512_set1_ps(rms);
198
+ _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
199
+ }
200
+ for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
201
+ }
202
+
203
+ static void silu_inplace(float *x, int n) {
204
+ int i;
205
+ for (i = 0; i + 16 <= n; i += 16) {
206
+ __m512 xv = _mm512_loadu_ps(x + i);
207
+ __m512 neg = _mm512_sub_ps(_mm512_setzero_ps(), xv);
208
+ /* exp(-x) approximation not great with AVX, use scalar */
209
+ float tmp[16];
210
+ _mm512_storeu_ps(tmp, xv);
211
+ for (int j = 0; j < 16; j++)
212
+ tmp[j] = tmp[j] / (1.0f + expf(-tmp[j]));
213
+ _mm512_storeu_ps(x + i, _mm512_loadu_ps(tmp));
214
+ }
215
+ for (; i < n; i++)
216
+ x[i] = x[i] / (1.0f + expf(-x[i]));
217
+ }
218
+
219
/* Elementwise product c = a * b: 16-wide vector loop plus scalar tail. */
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(c + k,
            _mm512_mul_ps(_mm512_loadu_ps(a + k), _mm512_loadu_ps(b + k)));
    while (k < n) { c[k] = a[k] * b[k]; k++; }
}
228
+
229
/* In-place accumulate y += x: 16-wide vector loop plus scalar tail. */
static void vec_add(float *y, const float *x, int n) {
    int k = 0;
    for (; n - k >= 16; k += 16)
        _mm512_storeu_ps(y + k,
            _mm512_add_ps(_mm512_loadu_ps(y + k), _mm512_loadu_ps(x + k)));
    while (k < n) { y[k] += x[k]; k++; }
}
238
+
239
+ static void apply_rope(float *vec, int pos, int dim, float theta) {
240
+ for (int i = 0; i < dim; i += 2) {
241
+ float freq = 1.0f / powf(theta, (float)i / dim);
242
+ float angle = pos * freq;
243
+ float cos_a = cosf(angle);
244
+ float sin_a = sinf(angle);
245
+ float v0 = vec[i];
246
+ float v1 = vec[i + 1];
247
+ vec[i] = v0 * cos_a - v1 * sin_a;
248
+ vec[i + 1] = v0 * sin_a + v1 * cos_a;
249
+ }
250
+ }
251
+
252
+ static void softmax(float *x, int n) {
253
+ float max_val = x[0];
254
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
255
+ float sum = 0.0f;
256
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
257
+ float inv = 1.0f / sum;
258
+ for (int i = 0; i < n; i++) x[i] *= inv;
259
+ }
260
+
261
+ /* ============================================================
262
+ * Embedding lookup (FP16 -> FP32)
263
+ * ============================================================ */
264
+ static void embed_token(const Model *m, int token_id, float *out) {
265
+ int hidden = m->cfg.hidden;
266
+ const uint16_t *row = m->embed + (size_t)token_id * hidden;
267
+ int i;
268
+ for (i = 0; i + 16 <= hidden; i += 16) {
269
+ __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
270
+ __m512 fv = _mm512_cvtph_ps(h);
271
+ _mm512_storeu_ps(out + i, fv);
272
+ }
273
+ for (; i < hidden; i++) {
274
+ __m128i hv = _mm_set1_epi16(row[i]);
275
+ __m128 fv = _mm_cvtph_ps(hv);
276
+ _mm_store_ss(out + i, fv);
277
+ }
278
+ }
279
+
280
+ /* KV cache helpers */
281
+ static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) {
282
+ return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads +
283
+ (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim;
284
+ }
285
+
286
+ /* ============================================================
287
+ * ATTENTION
288
+ * ============================================================ */
289
/* Single-position causal self-attention for one layer.
 * Reads the pre-normed hidden state from m->hidden2 and writes the
 * o_proj output back into m->hidden2 (caller adds the residual).
 * Appends this position's K/V to the cache and attends over 0..pos. */
static void attention(Model *m, int layer_idx, int pos) {
    Config *c = &m->cfg;
    Layer *layer = &m->layers[layer_idx];
    int heads_per_kv = c->n_heads / c->n_kv_heads;   /* GQA group size */

    /* Q/K/V projections via the unary bitplane matvec. */
    unary_matvec(&layer->q_proj, m->hidden2, m->q);
    unary_matvec(&layer->k_proj, m->hidden2, m->k);
    unary_matvec(&layer->v_proj, m->hidden2, m->v);

    /* Additive attention biases (Qwen2-style); pointers may be NULL. */
    if (c->has_attn_bias) {
        if (layer->q_bias) vec_add(m->q, layer->q_bias, c->n_heads * c->head_dim);
        if (layer->k_bias) vec_add(m->k, layer->k_bias, c->n_kv_heads * c->head_dim);
        if (layer->v_bias) vec_add(m->v, layer->v_bias, c->n_kv_heads * c->head_dim);
    }

    /* QK-Norm (Qwen3): RMSNorm each head's Q and K before RoPE */
    if (layer->q_norm) {
        for (int h = 0; h < c->n_heads; h++)
            rmsnorm(m->q + h * c->head_dim, layer->q_norm, m->q + h * c->head_dim, c->head_dim);
    }
    if (layer->k_norm) {
        for (int h = 0; h < c->n_kv_heads; h++)
            rmsnorm(m->k + h * c->head_dim, layer->k_norm, m->k + h * c->head_dim, c->head_dim);
    }

    /* Rotary embedding per head. */
    for (int h = 0; h < c->n_heads; h++)
        apply_rope(m->q + h * c->head_dim, pos, c->head_dim, c->rope_theta);
    for (int h = 0; h < c->n_kv_heads; h++)
        apply_rope(m->k + h * c->head_dim, pos, c->head_dim, c->rope_theta);

    /* Append this position's K/V to the cache. */
    for (int h = 0; h < c->n_kv_heads; h++) {
        memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h),
               m->k + h * c->head_dim, c->head_dim * sizeof(float));
        memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h),
               m->v + h * c->head_dim, c->head_dim * sizeof(float));
    }

    float scale = 1.0f / sqrtf((float)c->head_dim);   /* 1/sqrt(d) scaling */
    memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float));

    for (int h = 0; h < c->n_heads; h++) {
        /* GQA: heads_per_kv query heads share one KV head. */
        int kv_h = h / heads_per_kv;
        float *q_head = m->q + h * c->head_dim;
        float *out_head = m->attn_out + h * c->head_dim;

        /* Scores: q . k_t for every cached position t <= pos. */
        for (int t = 0; t <= pos; t++) {
            float *k_cached = kv_ptr(m->k_cache, c, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 qv = _mm512_loadu_ps(q_head + d);
                __m512 kv = _mm512_loadu_ps(k_cached + d);
                acc = _mm512_fmadd_ps(qv, kv, acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < c->head_dim; d++) dot += q_head[d] * k_cached[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        /* Weighted sum of cached V vectors; near-zero weights skipped. */
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;
            float *v_cached = kv_ptr(m->v_cache, c, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= c->head_dim; d += 16) {
                __m512 ov = _mm512_loadu_ps(out_head + d);
                __m512 vv = _mm512_loadu_ps(v_cached + d);
                _mm512_storeu_ps(out_head + d, _mm512_fmadd_ps(wv, vv, ov));
            }
            for (; d < c->head_dim; d++) out_head[d] += w * v_cached[d];
        }
    }

    /* Output projection back to the model width. */
    unary_matvec(&layer->o_proj, m->attn_out, m->hidden2);
}
367
+
368
+ /* ============================================================
369
+ * MLP - SwiGLU
370
+ * ============================================================ */
371
+ static void mlp(Model *m, int layer_idx) {
372
+ Layer *layer = &m->layers[layer_idx];
373
+ int inter = m->cfg.inter;
374
+
375
+ unary_matvec(&layer->gate_proj, m->hidden2, m->gate);
376
+ unary_matvec(&layer->up_proj, m->hidden2, m->up);
377
+
378
+ silu_inplace(m->gate, inter);
379
+ elemwise_mul(m->gate, m->up, m->down_in, inter);
380
+
381
+ unary_matvec(&layer->down_proj, m->down_in, m->hidden2);
382
+ }
383
+
384
+ /* ============================================================
385
+ * FORWARD ONE TOKEN
386
+ * ============================================================ */
387
+ float* forward_token(Model *m, int token_id, int pos) {
388
+ Config *c = &m->cfg;
389
+
390
+ embed_token(m, token_id, m->hidden);
391
+
392
+ for (int l = 0; l < c->n_layers; l++) {
393
+ rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, c->hidden);
394
+ attention(m, l, pos);
395
+ vec_add(m->hidden, m->hidden2, c->hidden);
396
+ rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, c->hidden);
397
+ mlp(m, l);
398
+ vec_add(m->hidden, m->hidden2, c->hidden);
399
+ }
400
+
401
+ rmsnorm(m->hidden, m->final_norm, m->hidden2, c->hidden);
402
+
403
+ /* LM head - either tied embeddings or separate FP16 */
404
+ if (c->tie_embeddings) {
405
+ /* Use embed weights as lm_head (FP16 matvec) */
406
+ FP16Linear tied;
407
+ tied.weight = m->embed;
408
+ tied.out_dim = c->vocab;
409
+ tied.in_dim = c->hidden;
410
+ fp16_matvec(&tied, m->hidden2, m->logits);
411
+ } else {
412
+ fp16_matvec(&m->lm_head, m->hidden2, m->logits);
413
+ }
414
+
415
+ return m->logits;
416
+ }
417
+
418
+ /* ============================================================
419
+ * TOP-P SAMPLING
420
+ * ============================================================ */
421
/* Temperature + nucleus (top-p) sampling. The logits are mutated in
 * place: scaled by 1/temperature, then softmaxed. The candidate set is
 * the highest-probability tokens until their cumulative mass reaches
 * top_p, hard-capped at 40 entries. Returns the sampled token id. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        const float inv_t = 1.0f / temperature;
        for (int k = 0; k < vocab; k++) logits[k] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *order = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int k = 0; k < vocab; k++) order[k] = k;

    /* Partial selection sort: move the next-largest prob into slot
     * `kept`, tracking cumulative mass. */
    int kept = 0;
    float cum = 0.0f;
    while (cum < top_p && kept < vocab) {
        int best = kept;
        for (int k = kept + 1; k < vocab; k++)
            if (probs[k] > probs[best]) best = k;
        float pswap = probs[kept]; probs[kept] = probs[best]; probs[best] = pswap;
        int iswap = order[kept]; order[kept] = order[best]; order[best] = iswap;
        cum += probs[kept];
        kept++;
        if (kept >= 40) break;   /* hard cap on nucleus size */
    }

    /* Draw proportionally from the kept mass. */
    float mass = 0.0f;
    for (int k = 0; k < kept; k++) mass += probs[k];
    float r = (float)rand() / RAND_MAX * mass;
    float run = 0.0f;
    int chosen = order[0];
    for (int k = 0; k < kept; k++) {
        run += probs[k];
        if (run >= r) { chosen = order[k]; break; }
    }

    free(probs);
    free(order);
    return chosen;
}
460
+
461
+ /* ============================================================
462
+ * GENERATE
463
+ * ============================================================ */
464
+ int generate(
465
+ Model *m,
466
+ const int *prompt_ids, int prompt_len,
467
+ int *out_tokens, int max_new_tokens,
468
+ float temperature, float top_p,
469
+ int eos_token
470
+ ) {
471
+ srand(time(NULL));
472
+
473
+ for (int i = 0; i < prompt_len; i++) {
474
+ forward_token(m, prompt_ids[i], i);
475
+ }
476
+
477
+ int pos = prompt_len;
478
+ int generated = 0;
479
+
480
+ for (int t = 0; t < max_new_tokens; t++) {
481
+ float *logits = m->logits;
482
+
483
+ int next_token;
484
+ if (temperature <= 0) {
485
+ next_token = 0;
486
+ for (int i = 1; i < m->cfg.vocab; i++)
487
+ if (logits[i] > logits[next_token]) next_token = i;
488
+ } else {
489
+ next_token = sample_top_p(logits, m->cfg.vocab, temperature, top_p);
490
+ }
491
+
492
+ out_tokens[t] = next_token;
493
+ generated++;
494
+
495
+ if (next_token == eos_token) break;
496
+
497
+ forward_token(m, next_token, pos);
498
+ pos++;
499
+ }
500
+
501
+ return generated;
502
+ }
503
+
504
+ /* ============================================================
505
+ * MODEL ALLOCATION with config
506
+ * ============================================================ */
507
+ Model* model_alloc(
508
+ int n_planes,
509
+ int hidden, int inter, int n_heads, int n_kv_heads,
510
+ int head_dim, int n_layers, int vocab,
511
+ float rope_theta, int has_attn_bias, int tie_embeddings
512
+ ) {
513
+ Model *m = (Model *)calloc(1, sizeof(Model));
514
+ m->n_planes = n_planes;
515
+
516
+ Config *c = &m->cfg;
517
+ c->hidden = hidden;
518
+ c->inter = inter;
519
+ c->n_heads = n_heads;
520
+ c->n_kv_heads = n_kv_heads;
521
+ c->head_dim = head_dim;
522
+ c->n_layers = n_layers;
523
+ c->vocab = vocab;
524
+ c->rope_theta = rope_theta;
525
+ c->has_attn_bias = has_attn_bias;
526
+ c->tie_embeddings = tie_embeddings;
527
+
528
+ m->layers = (Layer *)calloc(n_layers, sizeof(Layer));
529
+
530
+ size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim;
531
+ m->k_cache = (float *)calloc(kv_size, sizeof(float));
532
+ m->v_cache = (float *)calloc(kv_size, sizeof(float));
533
+
534
+ m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float));
535
+ m->hidden2 = (float *)aligned_alloc(64, hidden * sizeof(float));
536
+ m->q = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
537
+ m->k = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
538
+ m->v = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float));
539
+ m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float));
540
+ m->gate = (float *)aligned_alloc(64, inter * sizeof(float));
541
+ m->up = (float *)aligned_alloc(64, inter * sizeof(float));
542
+ m->down_in = (float *)aligned_alloc(64, inter * sizeof(float));
543
+ m->logits = (float *)aligned_alloc(64, vocab * sizeof(float));
544
+ m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
545
+ m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float));
546
+
547
+ size_t kv_mb = kv_size * 2 * sizeof(float) / (1024*1024);
548
+ printf("Model config: hidden=%d inter=%d heads=%d kv_heads=%d layers=%d vocab=%d\n",
549
+ hidden, inter, n_heads, n_kv_heads, n_layers, vocab);
550
+ printf("KV cache: %zu MB, tied_embed=%d, attn_bias=%d\n",
551
+ kv_mb, tie_embeddings, has_attn_bias);
552
+
553
+ return m;
554
+ }
555
+
556
+ /* Weight setters */
557
+ void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
558
+ void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); }
559
+ void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
560
+ m->lm_head.weight = data;
561
+ m->lm_head.out_dim = out_dim;
562
+ m->lm_head.in_dim = in_dim;
563
+ }
564
+
565
+ void layer_set_norms(Model *m, int l, float *input_norm, float *post_norm) {
566
+ m->layers[l].input_norm = input_norm;
567
+ m->layers[l].post_norm = post_norm;
568
+ }
569
+
570
+ void layer_set_bias(Model *m, int l, float *q_bias, float *k_bias, float *v_bias) {
571
+ m->layers[l].q_bias = q_bias;
572
+ m->layers[l].k_bias = k_bias;
573
+ m->layers[l].v_bias = v_bias;
574
+ }
575
+
576
+ void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) {
577
+ m->layers[l].q_norm = q_norm;
578
+ m->layers[l].k_norm = k_norm;
579
+ }
580
+
581
+ void layer_set_unary(
582
+ UnaryLinear *ul,
583
+ uint64_t *sign, uint64_t *planes, float *scales,
584
+ int out_dim, int in_dim, int n_planes
585
+ ) {
586
+ ul->sign_bits = sign;
587
+ ul->mag_planes = planes;
588
+ ul->scales = scales;
589
+ ul->out_dim = out_dim;
590
+ ul->in_dim = in_dim;
591
+ ul->n_planes = n_planes;
592
+ ul->bias = NULL;
593
+ }
594
+
595
+ void layer_set_linears(
596
+ Model *m, int l,
597
+ uint64_t *q_sign, uint64_t *q_planes, float *q_scales, int q_out, int q_in,
598
+ uint64_t *k_sign, uint64_t *k_planes, float *k_scales, int k_out, int k_in,
599
+ uint64_t *v_sign, uint64_t *v_planes, float *v_scales, int v_out, int v_in,
600
+ uint64_t *o_sign, uint64_t *o_planes, float *o_scales, int o_out, int o_in,
601
+ uint64_t *g_sign, uint64_t *g_planes, float *g_scales, int g_out, int g_in,
602
+ uint64_t *u_sign, uint64_t *u_planes, float *u_scales, int u_out, int u_in,
603
+ uint64_t *d_sign, uint64_t *d_planes, float *d_scales, int d_out, int d_in,
604
+ int n_planes
605
+ ) {
606
+ layer_set_unary(&m->layers[l].q_proj, q_sign, q_planes, q_scales, q_out, q_in, n_planes);
607
+ layer_set_unary(&m->layers[l].k_proj, k_sign, k_planes, k_scales, k_out, k_in, n_planes);
608
+ layer_set_unary(&m->layers[l].v_proj, v_sign, v_planes, v_scales, v_out, v_in, n_planes);
609
+ layer_set_unary(&m->layers[l].o_proj, o_sign, o_planes, o_scales, o_out, o_in, n_planes);
610
+ layer_set_unary(&m->layers[l].gate_proj, g_sign, g_planes, g_scales, g_out, g_in, n_planes);
611
+ layer_set_unary(&m->layers[l].up_proj, u_sign, u_planes, u_scales, u_out, u_in, n_planes);
612
+ layer_set_unary(&m->layers[l].down_proj, d_sign, d_planes, d_scales, d_out, d_in, n_planes);
613
+ }
614
+
615
+ void model_reset_cache(Model *m) {
616
+ size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim;
617
+ memset(m->k_cache, 0, kv_size * sizeof(float));
618
+ memset(m->v_cache, 0, kv_size * sizeof(float));
619
+ }
620
+
621
+ void model_free(Model *m) {
622
+ free(m->k_cache); free(m->v_cache);
623
+ free(m->hidden); free(m->hidden2);
624
+ free(m->q); free(m->k); free(m->v);
625
+ free(m->attn_out); free(m->gate); free(m->up); free(m->down_in);
626
+ free(m->logits); free(m->attn_scores); free(m->final_norm);
627
+ free(m->layers);
628
+ free(m);
629
+ }
unary_full.c ADDED
@@ -0,0 +1,742 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * FULL UNARY ENGINE - Weights AND Activations in Base-1
3
+ *
4
+ * True unary: the entire matmul is popcount of ANDed bitplanes.
5
+ * No floating point in the inner loop. No multiplication anywhere.
6
+ *
7
+ * Weight w with magnitude M_w (thermometer: M_w planes with bit set)
8
+ * Activation x with magnitude M_x (thermometer: M_x planes with bit set)
9
+ *
10
+ * dot(w, x) for row i:
11
+ * For each weight plane p (0..W-1) and act plane q (0..A-1):
12
+ * contribution = popcount( w_plane_p[i] AND act_plane_q AND same_sign )
13
+ * - popcount( w_plane_p[i] AND act_plane_q AND diff_sign )
14
+ * y[i] = sum_of_contributions * w_scale[i] * act_scale
15
+ *
16
+ * The outer sum has W*A terms, each is a popcount over 64 elements.
17
+ * With W=4, A=4: 16 popcounts per 64 elements = insanely fast.
18
+ *
19
+ * AVX-512 VPOPCNTDQ: one instruction for 8x64-bit popcounts.
20
+ * On Skylake (no VPOPCNTDQ): use Harley-Seal or scalar POPCNT.
21
+ *
22
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
23
+ */
24
+
25
+ #include <immintrin.h>
26
+ #include <stdint.h>
27
+ #include <stdlib.h>
28
+ #include <string.h>
29
+ #include <math.h>
30
+ #include <stdio.h>
31
+ #include <time.h>
32
+ #include <omp.h>
33
+ #include <x86intrin.h>
34
+
35
+ /* ============================================================
36
+ * Config (DeepSeek-R1-Distill-Qwen-1.5B)
37
+ * ============================================================ */
38
+ #define HIDDEN 1536
39
+ #define INTER 8960
40
+ #define N_HEADS 12
41
+ #define N_KV_HEADS 2
42
+ #define HEAD_DIM 128
43
+ #define N_LAYERS 28
44
+ #define VOCAB 151936
45
+ #define RMS_EPS 1e-6f
46
+ #define ROPE_THETA 1000000.0f
47
+ #define MAX_SEQ 4096
48
+ #define HEADS_PER_KV (N_HEADS / N_KV_HEADS)
49
+
50
+ /* Unary config */
51
+ #define W_PLANES 4 /* weight magnitude planes (5 levels: 0-4) */
52
+ #define A_PLANES 8 /* activation magnitude planes (9 levels: 0-8) */
53
+
54
+ /* ============================================================
55
+ * Portable popcount for 64-bit
56
+ * Uses hardware POPCNT (available on Skylake)
57
+ * ============================================================ */
58
/* 64-bit population count.  __builtin_popcountll compiles to a single
 * hardware POPCNT when the target supports it (the file is built for
 * Skylake-class CPUs per the header comment). */
static inline int popcnt64(uint64_t x) {
    return __builtin_popcountll(x);
}
61
+
62
+ /* ============================================================
63
+ * Unary Linear Layer (weight storage)
64
+ * ============================================================ */
65
typedef struct {
    /* Bit-packed quantized weight rows.  For output row i, chunk c covers
     * input columns [64c, 64c+64).  A set sign bit means negative; magnitude
     * planes are thermometer coded (plane p set <=> integer magnitude > p),
     * so a weight's magnitude equals its number of set planes. */
    uint64_t *sign_bits;  /* [out_dim * chunks] - 1=negative */
    uint64_t *mag_planes; /* [n_planes * out_dim * chunks], plane-major */
    float *scales;        /* [out_dim] per-row dequantization scale */
    float *bias;          /* [out_dim] or NULL */
    int out_dim;
    int in_dim;
    int n_planes;         /* number of weight magnitude planes stored */
} UnaryLinear;

/* Dense FP16 weights - kept only where quantization would hurt most
 * (lm_head; token embeddings use the same uint16 half-float storage). */
typedef struct {
    uint16_t *weight;     /* [out_dim * in_dim] IEEE half floats */
    int out_dim;
    int in_dim;
} FP16Linear;

/* ============================================================
 * Quantized Activation Buffer
 * Activations quantized on the fly to the same sign+thermometer format
 * as the weights, with a single per-vector scale.
 * ============================================================ */
typedef struct {
    uint64_t *sign_bits;  /* [chunks] */
    uint64_t *mag_planes; /* [A_PLANES * chunks] */
    float scale;          /* single scale for the entire vector */
    int dim;
    int chunks;           /* ceil(dim / 64) */
} QuantAct;

/* ============================================================
 * Transformer Layer: seven unary projections plus float norm gains and
 * optional attention biases (Qwen-style q/k/v bias).
 * ============================================================ */
typedef struct {
    UnaryLinear q_proj, k_proj, v_proj, o_proj;
    UnaryLinear gate_proj, up_proj, down_proj;
    float *input_norm;    /* RMSNorm gain applied before attention */
    float *post_norm;     /* RMSNorm gain applied before the MLP */
    float *q_bias, *k_bias, *v_bias;  /* NULL when the model has none */
} Layer;

/* ============================================================
 * Full Model: weights, KV cache, and reusable scratch buffers so the
 * forward pass performs no per-token allocation.
 * ============================================================ */
typedef struct {
    uint16_t *embed;      /* [VOCAB * HIDDEN] FP16 token embeddings (borrowed) */
    Layer layers[N_LAYERS];
    float *final_norm;
    FP16Linear lm_head;

    /* KV cache (kept as float - only used in attention dot products) */
    float *k_cache;
    float *v_cache;

    /* Float scratch (for between operations) */
    float *hidden;
    float *hidden2;
    float *q_buf;         /* [N_HEADS * HEAD_DIM] */
    float *k_buf;         /* [N_KV_HEADS * HEAD_DIM] */
    float *v_buf;
    float *attn_out;
    float *gate_buf;      /* [INTER] */
    float *up_buf;
    float *mlp_buf;       /* [INTER] holds silu(gate)*up */
    float *logits;
    float *attn_scores;

    /* Quantized activation buffers (reused every projection) */
    QuantAct qa_hidden;   /* for HIDDEN-dim activations */
    QuantAct qa_inter;    /* for INTER-dim activations */

    int n_w_planes;
    int n_a_planes;
} Model;
138
+
139
+
140
+ /* ============================================================
141
+ * QUANTIZE ACTIVATION TO UNARY (on the fly)
142
+ *
143
+ * Takes float vector, produces unary bitplanes.
144
+ * This is the key operation that enables full-unary matmul.
145
+ * ============================================================ */
146
+ static void quantize_activation(const float *x, QuantAct *qa) {
147
+ int dim = qa->dim;
148
+ int chunks = qa->chunks;
149
+ int n_planes = A_PLANES;
150
+
151
+ /* Find absmax for scale */
152
+ float absmax = 0.0f;
153
+ for (int i = 0; i < dim; i++) {
154
+ float a = fabsf(x[i]);
155
+ if (a > absmax) absmax = a;
156
+ }
157
+ if (absmax == 0.0f) absmax = 1.0f;
158
+
159
+ qa->scale = absmax / n_planes;
160
+ float inv_scale = 1.0f / qa->scale;
161
+
162
+ /* Clear bitplanes */
163
+ memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
164
+ memset(qa->mag_planes, 0, n_planes * chunks * sizeof(uint64_t));
165
+
166
+ /* Quantize and pack into bitplanes */
167
+ for (int i = 0; i < dim; i++) {
168
+ int chunk = i / 64;
169
+ int bit = i % 64;
170
+ uint64_t mask = (uint64_t)1 << bit;
171
+
172
+ float val = x[i];
173
+ if (val < 0) {
174
+ qa->sign_bits[chunk] |= mask;
175
+ val = -val;
176
+ }
177
+
178
+ int mag = (int)(val * inv_scale + 0.5f);
179
+ if (mag > n_planes) mag = n_planes;
180
+
181
+ /* Thermometer: set planes 0..mag-1 */
182
+ for (int p = 0; p < mag; p++) {
183
+ qa->mag_planes[p * chunks + chunk] |= mask;
184
+ }
185
+ }
186
+ }
187
+
188
+ /* Vectorized quantize - process 64 elements at a time */
189
+ static void quantize_activation_fast(const float *x, QuantAct *qa) {
190
+ int dim = qa->dim;
191
+ int chunks = qa->chunks;
192
+ int padded = chunks * 64;
193
+ int n_planes = A_PLANES;
194
+
195
+ /* Find absmax with AVX-512 */
196
+ __m512 vmax = _mm512_setzero_ps();
197
+ int i;
198
+ for (i = 0; i + 16 <= dim; i += 16) {
199
+ __m512 xv = _mm512_loadu_ps(x + i);
200
+ __m512 av = _mm512_abs_ps(xv);
201
+ vmax = _mm512_max_ps(vmax, av);
202
+ }
203
+ float absmax = _mm512_reduce_max_ps(vmax);
204
+ for (; i < dim; i++) {
205
+ float a = fabsf(x[i]);
206
+ if (a > absmax) absmax = a;
207
+ }
208
+ if (absmax == 0.0f) absmax = 1.0f;
209
+
210
+ qa->scale = absmax / n_planes;
211
+ float inv_scale = (float)n_planes / absmax;
212
+
213
+ /* Clear */
214
+ memset(qa->sign_bits, 0, chunks * sizeof(uint64_t));
215
+ memset(qa->mag_planes, 0, n_planes * chunks * sizeof(uint64_t));
216
+
217
+ /* Process 16 floats at a time, pack bits */
218
+ __m512 v_inv = _mm512_set1_ps(inv_scale);
219
+ __m512 v_half = _mm512_set1_ps(0.5f);
220
+ __m512 v_zero = _mm512_setzero_ps();
221
+
222
+ for (int c = 0; c < chunks; c++) {
223
+ uint64_t sign_word = 0;
224
+ uint64_t plane_words[A_PLANES];
225
+ memset(plane_words, 0, sizeof(plane_words));
226
+
227
+ for (int g = 0; g < 4; g++) {
228
+ int offset = c * 64 + g * 16;
229
+ if (offset >= dim) break;
230
+
231
+ /* Load 16 floats */
232
+ __m512 xv;
233
+ if (offset + 16 <= dim) {
234
+ xv = _mm512_loadu_ps(x + offset);
235
+ } else {
236
+ /* Partial load at end */
237
+ xv = _mm512_setzero_ps();
238
+ for (int j = 0; j < dim - offset; j++) {
239
+ ((float*)&xv)[j] = x[offset + j];
240
+ }
241
+ }
242
+
243
+ /* Sign: negative mask */
244
+ __mmask16 neg_mask = _mm512_cmplt_ps_mask(xv, v_zero);
245
+ sign_word |= ((uint64_t)neg_mask << (g * 16));
246
+
247
+ /* Absolute value and quantize */
248
+ __m512 av = _mm512_abs_ps(xv);
249
+ __m512 qv = _mm512_fmadd_ps(av, v_inv, v_half);
250
+
251
+ /* Convert to int and clamp */
252
+ __m512i iv = _mm512_cvttps_epi32(qv);
253
+ __m512i v_max = _mm512_set1_epi32(n_planes);
254
+ iv = _mm512_min_epi32(iv, v_max);
255
+
256
+ /* Thermometer encode: plane p has bit set if magnitude > p */
257
+ for (int p = 0; p < n_planes; p++) {
258
+ __m512i vp = _mm512_set1_epi32(p + 1);
259
+ __mmask16 active = _mm512_cmpge_epi32_mask(iv, vp);
260
+ plane_words[p] |= ((uint64_t)active << (g * 16));
261
+ }
262
+ }
263
+
264
+ qa->sign_bits[c] = sign_word;
265
+ for (int p = 0; p < n_planes; p++) {
266
+ qa->mag_planes[p * chunks + c] = plane_words[p];
267
+ }
268
+ }
269
+ }
270
+
271
+
272
+ /* ============================================================
273
+ * FULL-UNARY MATVEC via POPCOUNT
274
+ *
275
+ * y[i] = w_scale[i] * act_scale *
276
+ * sum_{p=0}^{W-1} sum_{q=0}^{A-1}
277
+ * ( popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND ~a_sign) // both positive
278
+ * + popcount(w_plane_p[i] AND a_plane_q AND w_sign AND a_sign) // both negative (neg*neg=pos)
279
+ * - popcount(w_plane_p[i] AND a_plane_q AND ~w_sign AND a_sign) // pos weight * neg act
280
+ * - popcount(w_plane_p[i] AND a_plane_q AND w_sign AND ~a_sign) ) // neg weight * pos act
281
+ *
282
+ * Simplification: same_sign = ~(w_sign XOR a_sign), diff_sign = w_sign XOR a_sign
283
+ * contribution = popcount(w_plane AND a_plane AND same_sign)
284
+ * - popcount(w_plane AND a_plane AND diff_sign)
285
+ * ============================================================ */
286
/* y = W x computed entirely in the bit domain: for each output row, AND
 * every weight magnitude plane with every activation plane and popcount,
 * adding where the operand signs agree and subtracting where they differ
 * (derivation in the banner comment above).  The inner accumulator is an
 * integer; floats only appear in the final per-row scaling.  Rows are
 * distributed across OpenMP threads. */
static void unary_matvec_popcount(
    const UnaryLinear *layer, const QuantAct *qa, float *y
) {
    int out_dim = layer->out_dim;
    int chunks = qa->chunks;
    int n_w = layer->n_planes;
    int n_a = A_PLANES;
    float act_scale = qa->scale;

    #pragma omp parallel for schedule(dynamic, 64)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign = layer->sign_bits + (size_t)i * chunks;
        long total = 0; /* integer accumulator! */

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign[c];
            uint64_t as = qa->sign_bits[c];
            uint64_t same_sign = ~(ws ^ as); /* bits where signs agree */
            uint64_t diff_sign = ws ^ as;    /* bits where signs differ */

            for (int p = 0; p < n_w; p++) {
                uint64_t wp = layer->mag_planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < n_a; q++) {
                    uint64_t aq = qa->mag_planes[q * chunks + c];
                    uint64_t active = wp & aq; /* both have magnitude at this level */

                    total += popcnt64(active & same_sign);
                    total -= popcnt64(active & diff_sign);
                }
            }
        }

        /* product of the two quantization steps restores the real scale */
        y[i] = (float)total * layer->scales[i] * act_scale;
        if (layer->bias) y[i] += layer->bias[i];
    }
}
323
+
324
+
325
+ /* ============================================================
326
+ * FP16 matvec for lm_head (final projection to vocab)
327
+ * ============================================================ */
328
/* y = W x with FP16 weights - used for the final lm_head projection.
 * Rows run in parallel; each row converts 16 half floats at a time
 * (VCVTPH2PS) and FMAs against x.  The scalar tail broadcasts one half
 * into a vector purely to reuse the hardware half->float converter. */
static void fp16_matvec(const FP16Linear *layer, const float *x, float *y) {
    int out_dim = layer->out_dim;
    int in_dim = layer->in_dim;
    const uint16_t *w = layer->weight;

    #pragma omp parallel for schedule(dynamic, 256)
    for (int i = 0; i < out_dim; i++) {
        __m512 acc = _mm512_setzero_ps();
        int j;
        for (j = 0; j + 16 <= in_dim; j += 16) {
            __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j));
            __m512 wv = _mm512_cvtph_ps(h);
            __m512 xv = _mm512_loadu_ps(x + j);
            acc = _mm512_fmadd_ps(wv, xv, acc);
        }
        float sum = _mm512_reduce_add_ps(acc);
        for (; j < in_dim; j++) {
            /* scalar tail: convert one half via the SSE path */
            __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]);
            __m128 fv = _mm_cvtph_ps(hv);
            float wf;
            _mm_store_ss(&wf, fv);
            sum += wf * x[j];
        }
        y[i] = sum;
    }
}
354
+
355
+ /* ============================================================
356
+ * Basic ops (still float for norms, residuals, attention)
357
+ * ============================================================ */
358
/* RMSNorm: y[i] = x[i] * rsqrt(mean(x^2) + RMS_EPS) * weight[i].
 * Two passes: a 16-wide sum-of-squares reduction (scalar tail), then a
 * 16-wide scale-and-store pass that restarts from index 0. */
static void rmsnorm(const float *x, const float *weight, float *y, int dim) {
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];
    float rms = 1.0f / sqrtf(ss / dim + RMS_EPS);
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];
}
376
+
377
+ static void silu_inplace(float *x, int n) {
378
+ int i;
379
+ __m512 one = _mm512_set1_ps(1.0f);
380
+ /* SiLU vectorized: x / (1 + exp(-x)) */
381
+ for (i = 0; i < n; i++) {
382
+ x[i] = x[i] / (1.0f + expf(-x[i]));
383
+ }
384
+ }
385
+
386
/* c[i] = a[i] * b[i] for i in [0, n): 16-wide AVX-512 main loop followed
 * by a scalar tail for the remaining elements. */
static void elemwise_mul(const float *a, const float *b, float *c, int n) {
    int idx = 0;
    for (; idx + 16 <= n; idx += 16) {
        const __m512 lhs = _mm512_loadu_ps(a + idx);
        const __m512 rhs = _mm512_loadu_ps(b + idx);
        _mm512_storeu_ps(c + idx, _mm512_mul_ps(lhs, rhs));
    }
    while (idx < n) {
        c[idx] = a[idx] * b[idx];
        idx++;
    }
}
395
+
396
/* In-place accumulate: y[i] += x[i] for i in [0, n).
 * 16-wide AVX-512 body with a scalar remainder loop. */
static void vec_add(float *y, const float *x, int n) {
    int idx = 0;
    for (; idx + 16 <= n; idx += 16) {
        const __m512 acc = _mm512_loadu_ps(y + idx);
        const __m512 inc = _mm512_loadu_ps(x + idx);
        _mm512_storeu_ps(y + idx, _mm512_add_ps(acc, inc));
    }
    while (idx < n) {
        y[idx] += x[idx];
        idx++;
    }
}
405
+
406
+ static void apply_rope(float *vec, int pos, int dim) {
407
+ for (int i = 0; i < dim; i += 2) {
408
+ float freq = 1.0f / powf(ROPE_THETA, (float)i / dim);
409
+ float angle = pos * freq;
410
+ float c = cosf(angle), s = sinf(angle);
411
+ float v0 = vec[i], v1 = vec[i + 1];
412
+ vec[i] = v0 * c - v1 * s;
413
+ vec[i + 1] = v0 * s + v1 * c;
414
+ }
415
+ }
416
+
417
+ static void softmax(float *x, int n) {
418
+ float max_val = x[0];
419
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
420
+ float sum = 0.0f;
421
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
422
+ float inv = 1.0f / sum;
423
+ for (int i = 0; i < n; i++) x[i] *= inv;
424
+ }
425
+
426
/* Copy the FP16 embedding row for `token_id` into `out` as float.
 * The main loop converts 16 halfs per iteration; the tail broadcasts a
 * single half through _mm_cvtph_ps to reuse the hardware converter. */
static void embed_token(const Model *m, int token_id, float *out) {
    const uint16_t *row = m->embed + (size_t)token_id * HIDDEN;
    int i;
    for (i = 0; i + 16 <= HIDDEN; i += 16) {
        __m256i h = _mm256_loadu_si256((__m256i*)(row + i));
        _mm512_storeu_ps(out + i, _mm512_cvtph_ps(h));
    }
    for (; i < HIDDEN; i++) {
        __m128i hv = _mm_set1_epi16(row[i]);
        _mm_store_ss(out + i, _mm_cvtph_ps(hv));
    }
}
438
+
439
+ static float* kv_ptr(float *cache, int layer, int pos, int kv_head) {
440
+ return cache + ((size_t)layer * MAX_SEQ * N_KV_HEADS +
441
+ (size_t)pos * N_KV_HEADS + kv_head) * HEAD_DIM;
442
+ }
443
+
444
+ /* ============================================================
445
+ * ATTENTION
446
+ *
447
+ * Q/K/V projections use full-unary popcount matmul.
448
+ * Attention scores and value accumulation stay float
449
+ * (these are O(seq_len) not O(dim²), not the bottleneck).
450
+ * ============================================================ */
451
/* One attention step for `layer_idx` at sequence position `pos`.
 * Q/K/V and the output projection go through the unary popcount matmul
 * (m->hidden2 is quantized to bitplanes first); score/softmax/value-mix
 * stays in float since it is O(seq) work, not O(dim^2).
 * Reads m->hidden2 as input and overwrites m->hidden2 with the projected
 * attention output; the residual add is the caller's job. */
static void attention(Model *m, int layer_idx, int pos) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 to unary for projections */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* Project Q, K, V via popcount matvec */
    unary_matvec_popcount(&layer->q_proj, &m->qa_hidden, m->q_buf);
    unary_matvec_popcount(&layer->k_proj, &m->qa_hidden, m->k_buf);
    unary_matvec_popcount(&layer->v_proj, &m->qa_hidden, m->v_buf);

    /* Optional attention biases (Qwen-style) */
    if (layer->q_bias) vec_add(m->q_buf, layer->q_bias, N_HEADS * HEAD_DIM);
    if (layer->k_bias) vec_add(m->k_buf, layer->k_bias, N_KV_HEADS * HEAD_DIM);
    if (layer->v_bias) vec_add(m->v_buf, layer->v_bias, N_KV_HEADS * HEAD_DIM);

    /* RoPE on Q and K only */
    for (int h = 0; h < N_HEADS; h++)
        apply_rope(m->q_buf + h * HEAD_DIM, pos, HEAD_DIM);
    for (int h = 0; h < N_KV_HEADS; h++)
        apply_rope(m->k_buf + h * HEAD_DIM, pos, HEAD_DIM);

    /* Append this position's K/V to the cache */
    for (int h = 0; h < N_KV_HEADS; h++) {
        memcpy(kv_ptr(m->k_cache, layer_idx, pos, h), m->k_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
        memcpy(kv_ptr(m->v_cache, layer_idx, pos, h), m->v_buf + h * HEAD_DIM, HEAD_DIM * sizeof(float));
    }

    /* Scaled dot-product attention with GQA: HEADS_PER_KV query heads share
     * each KV head. */
    float scale = 1.0f / sqrtf((float)HEAD_DIM);
    memset(m->attn_out, 0, N_HEADS * HEAD_DIM * sizeof(float));

    for (int h = 0; h < N_HEADS; h++) {
        int kv_h = h / HEADS_PER_KV;
        float *qh = m->q_buf + h * HEAD_DIM;
        float *oh = m->attn_out + h * HEAD_DIM;

        /* scores[t] = (q . k_t) / sqrt(d) over the causal window 0..pos */
        for (int t = 0; t <= pos; t++) {
            float *kc = kv_ptr(m->k_cache, layer_idx, t, kv_h);
            __m512 acc = _mm512_setzero_ps();
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                acc = _mm512_fmadd_ps(_mm512_loadu_ps(qh + d), _mm512_loadu_ps(kc + d), acc);
            }
            float dot = _mm512_reduce_add_ps(acc);
            for (; d < HEAD_DIM; d++) dot += qh[d] * kc[d];
            m->attn_scores[t] = dot * scale;
        }

        softmax(m->attn_scores, pos + 1);

        /* Weighted sum of V; near-zero weights skipped as a cheap cutoff */
        for (int t = 0; t <= pos; t++) {
            float w = m->attn_scores[t];
            if (w < 1e-8f) continue;
            float *vc = kv_ptr(m->v_cache, layer_idx, t, kv_h);
            __m512 wv = _mm512_set1_ps(w);
            int d;
            for (d = 0; d + 16 <= HEAD_DIM; d += 16) {
                __m512 ov = _mm512_loadu_ps(oh + d);
                _mm512_storeu_ps(oh + d, _mm512_fmadd_ps(wv, _mm512_loadu_ps(vc + d), ov));
            }
            for (; d < HEAD_DIM; d++) oh[d] += w * vc[d];
        }
    }

    /* O projection: quantize attn_out, then popcount matvec */
    quantize_activation_fast(m->attn_out, &m->qa_hidden);
    unary_matvec_popcount(&layer->o_proj, &m->qa_hidden, m->hidden2);
}
520
+
521
+ /* ============================================================
522
+ * MLP - SwiGLU with unary matmuls
523
+ * ============================================================ */
524
/* SwiGLU MLP: hidden2 <- down( silu(gate(hidden2)) * up(hidden2) ).
 * Both HIDDEN->INTER projections share a single quantization of hidden2;
 * the INTER-wide intermediate is re-quantized before the down projection.
 * Residual addition is the caller's responsibility. */
static void mlp(Model *m, int layer_idx) {
    Layer *layer = &m->layers[layer_idx];

    /* Quantize hidden2 once for both gate and up */
    quantize_activation_fast(m->hidden2, &m->qa_hidden);

    /* gate and up projections via popcount */
    unary_matvec_popcount(&layer->gate_proj, &m->qa_hidden, m->gate_buf);
    unary_matvec_popcount(&layer->up_proj, &m->qa_hidden, m->up_buf);

    /* SwiGLU: silu(gate) * up */
    silu_inplace(m->gate_buf, INTER);
    elemwise_mul(m->gate_buf, m->up_buf, m->mlp_buf, INTER);

    /* Down projection back to HIDDEN: quantize intermediate, popcount matvec */
    quantize_activation_fast(m->mlp_buf, &m->qa_inter);
    unary_matvec_popcount(&layer->down_proj, &m->qa_inter, m->hidden2);
}
542
+
543
+ /* ============================================================
544
+ * FORWARD ONE TOKEN
545
+ * ============================================================ */
546
/* Run one token through all layers and return the logits buffer (owned by
 * the model, overwritten on every call).  Standard pre-norm transformer:
 * x += attn(norm(x)); x += mlp(norm(x)).  `pos` indexes the KV cache and
 * must advance by 1 per call within a sequence. */
float* forward_token(Model *m, int token_id, int pos) {
    embed_token(m, token_id, m->hidden);

    for (int l = 0; l < N_LAYERS; l++) {
        rmsnorm(m->hidden, m->layers[l].input_norm, m->hidden2, HIDDEN);
        attention(m, l, pos);                    /* hidden2 <- attention output */
        vec_add(m->hidden, m->hidden2, HIDDEN);  /* residual */

        rmsnorm(m->hidden, m->layers[l].post_norm, m->hidden2, HIDDEN);
        mlp(m, l);                               /* hidden2 <- MLP output */
        vec_add(m->hidden, m->hidden2, HIDDEN);  /* residual */
    }

    rmsnorm(m->hidden, m->final_norm, m->hidden2, HIDDEN);
    fp16_matvec(&m->lm_head, m->hidden2, m->logits);

    return m->logits;
}
564
+
565
+ /* ============================================================
566
+ * SAMPLING
567
+ * ============================================================ */
568
/* Temperature + nucleus (top-p) sampling.  `logits` is modified in place:
 * scaled by 1/temperature, then softmaxed into probabilities.
 * Candidates are extracted by partial selection sort until the cumulative
 * probability reaches top_p.
 * NOTE(review): the `n_keep < 40` bound doubles as a hard top-k=40 cap and
 * makes the scan O(40 * vocab); looks like a deliberate trade-off but is
 * undocumented - confirm.  Uses rand(), so the caller controls seeding. */
static int sample_top_p(float *logits, int vocab, float temperature, float top_p) {
    if (temperature > 0) {
        float inv_t = 1.0f / temperature;
        for (int i = 0; i < vocab; i++) logits[i] *= inv_t;
    }
    softmax(logits, vocab);

    float *probs = (float *)malloc(vocab * sizeof(float));
    int *indices = (int *)malloc(vocab * sizeof(int));
    memcpy(probs, logits, vocab * sizeof(float));
    for (int i = 0; i < vocab; i++) indices[i] = i;

    /* Partial selection sort: move the next-largest prob into slot n_keep */
    int n_keep = 0;
    float cum = 0.0f;
    while (cum < top_p && n_keep < vocab && n_keep < 40) {
        int best = n_keep;
        for (int i = n_keep + 1; i < vocab; i++)
            if (probs[i] > probs[best]) best = i;
        float tmp_p = probs[n_keep]; probs[n_keep] = probs[best]; probs[best] = tmp_p;
        int tmp_i = indices[n_keep]; indices[n_keep] = indices[best]; indices[best] = tmp_i;
        cum += probs[n_keep];
        n_keep++;
    }

    /* Renormalize over the kept set and draw uniformly within it */
    float sum = 0.0f;
    for (int i = 0; i < n_keep; i++) sum += probs[i];
    float r = (float)rand() / RAND_MAX * sum;
    float acc = 0.0f;
    int chosen = indices[0];
    for (int i = 0; i < n_keep; i++) {
        acc += probs[i]; if (acc >= r) { chosen = indices[i]; break; }
    }
    free(probs); free(indices);
    return chosen;
}
603
+
604
+ /* ============================================================
605
+ * GENERATE
606
+ * ============================================================ */
607
/* Autoregressive decode: prefill the prompt, then emit up to max_new_tokens
 * tokens into out_tokens, stopping after producing eos_token (which is
 * counted and stored).  Returns the number of tokens written.
 * temperature <= 0 selects greedy argmax; the sample_top_p arm of the
 * ternary only runs when temperature > 0.
 * NOTE(review): srand(time(NULL)) reseeds on every call, so two calls in
 * the same second draw identical samples - confirm that is acceptable. */
int generate(
    Model *m,
    const int *prompt_ids, int prompt_len,
    int *out_tokens, int max_new_tokens,
    float temperature, float top_p,
    int eos_token
) {
    srand(time(NULL));

    /* Prefill: after this loop m->logits belongs to the last prompt token */
    for (int i = 0; i < prompt_len; i++)
        forward_token(m, prompt_ids[i], i);

    int pos = prompt_len;
    int generated = 0;

    for (int t = 0; t < max_new_tokens; t++) {
        float *logits = m->logits;
        int next = (temperature <= 0) ? 0 : sample_top_p(logits, VOCAB, temperature, top_p);
        if (temperature <= 0) {
            /* Greedy: argmax over the raw logits */
            for (int i = 1; i < VOCAB; i++)
                if (logits[i] > logits[next]) next = i;
        }
        out_tokens[t] = next;
        generated++;
        if (next == eos_token) break;
        forward_token(m, next, pos);
        pos++;
    }
    return generated;
}
637
+
638
+ /* ============================================================
639
+ * ALLOCATE QUANTIZED ACTIVATION BUFFER
640
+ * ============================================================ */
641
/* Allocate bitplane storage for a dim-wide quantized activation buffer;
 * chunks = ceil(dim / 64).
 * NOTE(review): C11 aligned_alloc requires the size to be a multiple of
 * the alignment; chunks * 8 is not always a multiple of 64 (e.g. INTER ->
 * 140 chunks -> 1120 bytes), which is UB pre-C17 - confirm the target.
 * Allocation failures are not checked. */
static void qa_alloc(QuantAct *qa, int dim) {
    qa->dim = dim;
    qa->chunks = (dim + 63) / 64;
    qa->sign_bits = (uint64_t *)aligned_alloc(64, qa->chunks * sizeof(uint64_t));
    qa->mag_planes = (uint64_t *)aligned_alloc(64, A_PLANES * qa->chunks * sizeof(uint64_t));
    qa->scale = 1.0f;
}
648
+
649
+ /* ============================================================
650
+ * MODEL ALLOC
651
+ * ============================================================ */
652
/* Allocate a Model plus every scratch and cache buffer.  Weight pointers
 * stay NULL - they are wired in later via model_set_* / layer_set_* and
 * remain owned by the caller.
 * NOTE(review): allocation results are not checked for NULL - confirm the
 * loader treats OOM as fatal anyway. */
Model* model_alloc(int n_w_planes) {
    Model *m = (Model *)calloc(1, sizeof(Model));
    m->n_w_planes = n_w_planes;
    m->n_a_planes = A_PLANES;

    /* KV cache: [N_LAYERS][MAX_SEQ][N_KV_HEADS][HEAD_DIM], zero-initialized */
    size_t kv_size = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    m->k_cache = (float *)calloc(kv_size, sizeof(float));
    m->v_cache = (float *)calloc(kv_size, sizeof(float));

    /* 64-byte alignment suits the AVX-512 loads used throughout */
    m->hidden = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->hidden2 = (float *)aligned_alloc(64, HIDDEN * sizeof(float));
    m->q_buf = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->k_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->v_buf = (float *)aligned_alloc(64, N_KV_HEADS * HEAD_DIM * sizeof(float));
    m->attn_out = (float *)aligned_alloc(64, N_HEADS * HEAD_DIM * sizeof(float));
    m->gate_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->up_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->mlp_buf = (float *)aligned_alloc(64, INTER * sizeof(float));
    m->logits = (float *)aligned_alloc(64, VOCAB * sizeof(float));
    m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float));
    m->final_norm = (float *)aligned_alloc(64, HIDDEN * sizeof(float));

    qa_alloc(&m->qa_hidden, HIDDEN);
    qa_alloc(&m->qa_inter, INTER);

    printf("Model allocated: KV=%zuMB, W_PLANES=%d, A_PLANES=%d\n",
           kv_size * 2 * sizeof(float) / (1024*1024), n_w_planes, A_PLANES);

    return m;
}
682
+
683
/* Setters for the FP16 tensors.  embed and lm_head pointers are borrowed
 * (caller keeps ownership); final_norm is copied into the model's own
 * buffer allocated by model_alloc. */
void model_set_embed(Model *m, uint16_t *data) { m->embed = data; }
void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, HIDDEN * sizeof(float)); }
void model_set_lm_head(Model *m, uint16_t *data, int out_dim, int in_dim) {
    m->lm_head.weight = data; m->lm_head.out_dim = out_dim; m->lm_head.in_dim = in_dim;
}
688
+
689
/* Install the two RMSNorm gain vectors of layer `l` (pointers borrowed). */
void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) {
    m->layers[l].input_norm = in_norm;
    m->layers[l].post_norm = post_norm;
}
693
+
694
/* Install the optional Q/K/V attention biases of layer `l`; any of the
 * pointers may be NULL to disable that bias (pointers borrowed). */
void layer_set_bias(Model *m, int l, float *qb, float *kb, float *vb) {
    m->layers[l].q_bias = qb; m->layers[l].k_bias = kb; m->layers[l].v_bias = vb;
}
697
+
698
/* Fill one UnaryLinear with borrowed sign/plane/scale buffers.
 * The projection bias is cleared; attention biases are handled separately
 * via layer_set_bias(). */
void layer_set_unary(
    UnaryLinear *ul, uint64_t *sign, uint64_t *planes, float *scales,
    int out_dim, int in_dim, int n_planes
) {
    ul->sign_bits = sign; ul->mag_planes = planes; ul->scales = scales;
    ul->out_dim = out_dim; ul->in_dim = in_dim; ul->n_planes = n_planes;
    ul->bias = NULL;
}
706
+
707
/* Wire all seven unary projections of layer `l` in one call.  Argument
 * groups are (sign, planes, scales, out_dim, in_dim) for q/k/v/o/gate/up/
 * down in that order; all pointers are borrowed. */
void layer_set_linears(
    Model *m, int l,
    uint64_t *qs, uint64_t *qp, float *qsc, int qo, int qi,
    uint64_t *ks, uint64_t *kp, float *ksc, int ko, int ki,
    uint64_t *vs, uint64_t *vp, float *vsc, int vo, int vi,
    uint64_t *os, uint64_t *op, float *osc, int oo, int oi,
    uint64_t *gs, uint64_t *gp, float *gsc, int go, int gi,
    uint64_t *us, uint64_t *up, float *usc, int uo, int ui,
    uint64_t *ds, uint64_t *dp, float *dsc, int doo, int di,
    int n_planes
) {
    layer_set_unary(&m->layers[l].q_proj, qs, qp, qsc, qo, qi, n_planes);
    layer_set_unary(&m->layers[l].k_proj, ks, kp, ksc, ko, ki, n_planes);
    layer_set_unary(&m->layers[l].v_proj, vs, vp, vsc, vo, vi, n_planes);
    layer_set_unary(&m->layers[l].o_proj, os, op, osc, oo, oi, n_planes);
    layer_set_unary(&m->layers[l].gate_proj, gs, gp, gsc, go, gi, n_planes);
    layer_set_unary(&m->layers[l].up_proj, us, up, usc, uo, ui, n_planes);
    layer_set_unary(&m->layers[l].down_proj, ds, dp, dsc, doo, di, n_planes);
}
726
+
727
/* Zero the full KV cache so generation can restart from position 0. */
void model_reset_cache(Model *m) {
    size_t kv_size = (size_t)N_LAYERS * MAX_SEQ * N_KV_HEADS * HEAD_DIM;
    memset(m->k_cache, 0, kv_size * sizeof(float));
    memset(m->v_cache, 0, kv_size * sizeof(float));
}
732
+
733
/* Free everything model_alloc/qa_alloc allocated, then the Model itself.
 * Borrowed weight buffers (embed, lm_head, per-layer unary arrays, norms,
 * biases) are intentionally NOT freed - they belong to the loader. */
void model_free(Model *m) {
    free(m->k_cache); free(m->v_cache);
    free(m->hidden); free(m->hidden2);
    free(m->q_buf); free(m->k_buf); free(m->v_buf);
    free(m->attn_out); free(m->gate_buf); free(m->up_buf); free(m->mlp_buf);
    free(m->logits); free(m->attn_scores); free(m->final_norm);
    free(m->qa_hidden.sign_bits); free(m->qa_hidden.mag_planes);
    free(m->qa_inter.sign_bits); free(m->qa_inter.mag_planes);
    free(m);
}
unary_group_convert.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert model to UNARY with GROUP quantization.
4
+ Each group of 32 weights gets its own scale factor.
5
+ This dramatically improves accuracy vs per-row scaling.
6
+
7
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ """
9
+ import os, json, sys, time
10
+ import numpy as np
11
+ from pathlib import Path
12
+
13
+ GROUP_SIZE = 32
14
+
15
def load_safetensors(model_dir):
    """Load every ``*.safetensors`` shard under *model_dir* into one dict.

    Returns a mapping of tensor name -> float32 numpy array; every tensor
    is upcast via ``.float()`` regardless of on-disk dtype.  torch and
    safetensors are imported lazily so the rest of the module works
    without them installed.
    """
    import torch
    from safetensors.torch import load_file
    tensors = {}
    for f in sorted(Path(model_dir).glob("*.safetensors")):
        print(f"Loading {f.name}...")
        for key, val in load_file(str(f)).items():
            tensors[key] = val.float().numpy()
    return tensors
24
+
25
+
26
def quantize_group_unary(weight, n_planes=7, group_size=None):
    """Quantize a 2-D weight matrix to sign + thermometer magnitude planes.

    Each contiguous run of ``group_size`` input-dim weights shares one scale
    (per-group absmax / n_planes), which is much more accurate than a single
    per-row scale.

    Args:
        weight: ``[out_dim, in_dim]`` float array.
        n_planes: number of magnitude planes (integer levels 0..n_planes).
        group_size: weights per scale group; defaults to the module-level
            ``GROUP_SIZE`` when None (backward compatible with the old
            hard-coded constant).

    Returns:
        Tuple ``(sign_bits, mag_planes, group_scales, sparsity)`` where
        ``sign_bits`` is ``[out_dim, chunks]`` uint64 (bit set = negative),
        ``mag_planes`` is ``[n_planes, out_dim, chunks]`` uint64 with plane
        p set where the integer magnitude >= p+1, ``group_scales`` is
        ``[out_dim, n_groups]`` float32, and ``sparsity`` is the fraction
        of zero magnitudes over the 64-padded matrix.
    """
    if group_size is None:
        group_size = GROUP_SIZE

    w = weight.astype(np.float32)
    out_dim, in_dim = w.shape
    n_groups = (in_dim + group_size - 1) // group_size
    chunks = (in_dim + 63) // 64
    padded = chunks * 64

    # Pad the input dim so it divides evenly into groups.
    if in_dim % group_size != 0:
        pad_w = group_size - (in_dim % group_size)
        w = np.concatenate([w, np.zeros((out_dim, pad_w), dtype=np.float32)], axis=1)

    # [out_dim, n_groups, group_size]
    w_grouped = w[:, :n_groups * group_size].reshape(out_dim, n_groups, group_size)

    # Per-group absmax -> per-group scale; guard all-zero groups.
    group_max = np.max(np.abs(w_grouped), axis=2)
    group_max = np.where(group_max == 0, 1.0, group_max)
    group_scales = (group_max / n_planes).astype(np.float32)

    # Integer magnitude per weight, clipped to the plane count.
    # NB: np.round uses round-half-to-even (banker's rounding).
    w_scaled = w_grouped / group_scales[:, :, None]
    magnitudes = np.round(np.abs(w_scaled)).astype(np.int32)
    magnitudes = np.clip(magnitudes, 0, n_planes)
    signs = (w_grouped < 0)

    # Flatten back to [out_dim, n_groups * group_size]
    magnitudes = magnitudes.reshape(out_dim, -1)
    signs = signs.reshape(out_dim, -1)

    # Pad to a multiple of 64 so rows pack cleanly into uint64 words.
    if magnitudes.shape[1] < padded:
        extra = padded - magnitudes.shape[1]
        magnitudes = np.concatenate([magnitudes, np.zeros((out_dim, extra), dtype=np.int32)], axis=1)
        signs = np.concatenate([signs, np.zeros((out_dim, extra), dtype=bool)], axis=1)

    sparsity = np.mean(magnitudes == 0)

    # Bit packing: column 64c+b maps to bit b of word c.
    bit_positions = (np.uint64(1) << np.arange(64, dtype=np.uint64))

    signs_r = signs.reshape(out_dim, chunks, 64).astype(np.uint64)
    sign_bits = np.bitwise_or.reduce(signs_r * bit_positions, axis=2)

    # Thermometer planes: plane p set where magnitude >= p+1.
    mag_planes = np.zeros((n_planes, out_dim, chunks), dtype=np.uint64)
    for p in range(n_planes):
        active = (magnitudes >= (p + 1)).reshape(out_dim, chunks, 64).astype(np.uint64)
        mag_planes[p] = np.bitwise_or.reduce(active * bit_positions, axis=2)

    return sign_bits, mag_planes, group_scales, sparsity
79
+
80
+
81
+ def test_accuracy(weight, sign_bits, mag_planes, group_scales, n_planes):
82
+ """Test reconstruction accuracy of a single layer."""
83
+ out_dim, in_dim = weight.shape
84
+ n_groups = group_scales.shape[1]
85
+ chunks = (in_dim + 63) // 64
86
+
87
+ np.random.seed(42)
88
+ x = np.random.randn(in_dim).astype(np.float32)
89
+ y_orig = weight @ x
90
+
91
+ # Reconstruct weights from unary format
92
+ w_recon = np.zeros((out_dim, chunks * 64), dtype=np.float32)
93
+ for p in range(n_planes):
94
+ for i in range(out_dim):
95
+ for c in range(chunks):
96
+ mbits = mag_planes[p, i, c]
97
+ sbits = sign_bits[i, c]
98
+ for b in range(64):
99
+ if mbits & (1 << b):
100
+ col = c * 64 + b
101
+ g = col // GROUP_SIZE
102
+ if g < n_groups:
103
+ sign = -1.0 if (sbits & (1 << b)) else 1.0
104
+ w_recon[i, col] += sign * group_scales[i, g]
105
+
106
+ y_recon = w_recon[:, :in_dim] @ x
107
+ cosim = np.dot(y_orig, y_recon) / (np.linalg.norm(y_orig) * np.linalg.norm(y_recon))
108
+ return cosim
109
+
110
+
111
def convert(model_dir, output_dir, n_planes=7):
    """Convert a HF safetensors checkpoint to the group-unary on-disk format.

    Linear projection weights are quantized with `quantize_group_unary` and
    written as three raw files per tensor (.sign / .planes / .gscales);
    everything else (embeddings, norms, biases, lm_head) is dumped as raw
    fp16.  Also writes config.json and manifest.json describing shapes.

    Args:
        model_dir: directory containing *.safetensors shards.
        output_dir: destination directory (created if missing).
        n_planes: number of unary magnitude bitplanes.
    """
    os.makedirs(output_dir, exist_ok=True)
    tensors = load_safetensors(model_dir)

    # Any tensor whose key contains one of the projection names is quantized;
    # the rest are stored as fp16.
    linear_keys = [k for k in tensors if any(p in k for p in
        ['q_proj.weight', 'k_proj.weight', 'v_proj.weight', 'o_proj.weight',
         'gate_proj.weight', 'up_proj.weight', 'down_proj.weight'])]
    other_keys = [k for k in tensors if k not in linear_keys]

    print(f"\nGroup-unary: {len(linear_keys)} layers, n_planes={n_planes}, group_size={GROUP_SIZE}")

    # NOTE(review): these dimensions are hard-coded for
    # DeepSeek-R1-Distill-Qwen-1.5B -- confirm against the source config.json
    # before converting any other checkpoint.
    config = {
        "hidden_size": 1536, "intermediate_size": 8960,
        "num_attention_heads": 12, "num_key_value_heads": 2,
        "num_hidden_layers": 28, "vocab_size": 151936,
        "head_dim": 128, "rope_theta": 1000000.0, "rms_norm_eps": 1e-6,
        "n_planes": n_planes, "group_size": GROUP_SIZE,
        "quant_type": "unary_group",
    }
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(config, f, indent=2)

    total_unary = 0
    total_orig = 0

    # Test accuracy on first layer
    test_key = linear_keys[0]

    for key in linear_keys:
        w = tensors[key]
        total_orig += w.nbytes

        t0 = time.time()
        sign_bits, mag_planes, group_scales, sparsity = quantize_group_unary(w, n_planes)
        dt = time.time() - t0

        # NOTE(review): scales are written with a .gscales extension here,
        # while unary_loader.py reads .scales -- that loader pairs with the
        # non-grouped converter; verify which loader consumes this output.
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        sign_bits.tofile(prefix + ".sign")
        mag_planes.tofile(prefix + ".planes")
        group_scales.tofile(prefix + ".gscales")

        nbytes = sign_bits.nbytes + mag_planes.nbytes + group_scales.nbytes
        total_unary += nbytes

        print(f" {key}: {w.shape} -> {nbytes/1024:.0f}KB ({dt:.1f}s, {sparsity:.0%} sparse)")

    total_fp16 = 0
    for key in other_keys:
        w = tensors[key].astype(np.float16)
        prefix = os.path.join(output_dir, key.replace(".", "_"))
        w.tofile(prefix + ".fp16")
        total_fp16 += w.nbytes
        print(f" {key}: {w.shape} -> fp16 ({w.nbytes/1024:.0f}KB)")

    # Manifest records original shapes; raw files carry no shape metadata.
    manifest = {
        "unary": {k: list(tensors[k].shape) for k in linear_keys},
        "fp16": {k: list(tensors[k].shape) for k in other_keys},
    }
    with open(os.path.join(output_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f, indent=2)

    total = total_unary + total_fp16
    print(f"\n=== Summary ===")
    print(f"Original FP32: {total_orig/1e6:.0f} MB")
    print(f"Unary+group: {total_unary/1e6:.0f} MB")
    print(f"FP16 other: {total_fp16/1e6:.0f} MB")
    print(f"Total: {total/1e6:.0f} MB")

    # Quick accuracy test (re-quantizes the first linear layer).
    print(f"\nAccuracy test on {test_key}...")
    w = tensors[test_key]
    sign_bits, mag_planes, group_scales, _ = quantize_group_unary(w, n_planes)
    cosim = test_accuracy(w, sign_bits, mag_planes, group_scales, n_planes)
    print(f" Cosine similarity: {cosim:.4f}")
186
+
187
if __name__ == "__main__":
    # CLI: convert.py [model_dir] [output_dir] [n_planes]
    argv = sys.argv
    src_dir = "deepseek-r1-1.5b-hf" if len(argv) <= 1 else argv[1]
    dst_dir = "deepseek-r1-1.5b-gunary" if len(argv) <= 2 else argv[2]
    planes = 7 if len(argv) <= 3 else int(argv[3])
    convert(src_dir, dst_dir, planes)
    print("Done!")
unary_kernel.c ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * UNARY (Base-1) Neural Network Kernel - AVX-512
3
+ *
4
+ * Weights quantized to signed integers [-N..+N], stored as:
5
+ * sign_bits[row][chunks] - 1 = negative, 0 = positive
6
+ * mag_planes[plane][row][chunks] - unary thermometer bitplanes
7
+ * scales[row] - per-row float32 scale
8
+ *
9
+ * For magnitude M, the first M bitplanes have bit=1 at that position.
10
+ * E.g. magnitude 3 with max_planes=7: planes 0,1,2 have bit set.
11
+ *
12
+ * TRUE UNARY: each plane contributes equally (value 1 per plane).
13
+ * NOT binary (where plane p contributes 2^p).
14
+ *
15
+ * y[i] = scale[i] * sum_planes( signed_masked_sum(x, plane, sign) )
16
+ *
17
+ * (c) 2026 OpenTransformers Ltd / Scott Bisset
18
+ */
19
+
20
+ #include <immintrin.h>
21
+ #include <stdint.h>
22
+ #include <stdlib.h>
23
+ #include <string.h>
24
+ #include <math.h>
25
+ #include <stdio.h>
26
+
27
/* Matrix-vector product y = W x for a unary-quantized W.
 *
 * sign_bits  [out_dim][chunks]          bit = 1 -> negative weight
 * mag_planes [n_planes][out_dim][chunks] thermometer bitplanes (plane p set
 *                                        iff |w| magnitude >= p+1)
 * scales     [out_dim]                  per-row float scale
 *
 * Each plane contributes value 1 per set bit (true unary, not 2^p), so the
 * per-plane masked sums are simply accumulated and scaled once per row.
 */
void unary_matvec_avx512(
    const uint64_t *sign_bits,
    const uint64_t *mag_planes,
    const float *scales,
    const float *x,
    float *y,
    int out_dim,
    int in_dim,
    int n_planes
) {
    int chunks = (in_dim + 63) / 64;
    /* Round input length up to 16 floats (one ZMM register) and copy into a
       64-byte aligned, zero-tailed buffer so _mm512_load_ps below is legal.
       in_padded is a multiple of 16 floats = 64 bytes, satisfying the
       aligned_alloc size/alignment contract. */
    int in_padded = (in_dim + 15) & ~15;
    float *x_pad = (float *)aligned_alloc(64, in_padded * sizeof(float));
    memcpy(x_pad, x, in_dim * sizeof(float));
    memset(x_pad + in_dim, 0, (in_padded - in_dim) * sizeof(float));

    for (int i = 0; i < out_dim; i++) {
        const uint64_t *row_sign = sign_bits + (size_t)i * chunks;
        float total = 0.0f;

        for (int p = 0; p < n_planes; p++) {
            const uint64_t *plane_row = mag_planes +
                ((size_t)p * out_dim + i) * chunks;

            __m512 acc = _mm512_setzero_ps();

            for (int c = 0; c < chunks; c++) {
                uint64_t mbits = plane_row[c];
                uint64_t sbits = row_sign[c];
                uint64_t pos = mbits & ~sbits;   /* active positive weights */
                uint64_t neg = mbits & sbits;    /* active negative weights */

                /* Four 16-lane groups cover one 64-bit chunk; the second
                   guard condition stops at the padded end of x on the last
                   chunk. Bits set past in_dim (if any) hit zeroed x lanes,
                   so they cannot change the result. */
                for (int g = 0; g < 4 && (c * 64 + g * 16) < in_padded; g++) {
                    int offset = c * 64 + g * 16;
                    __m512 xv = _mm512_load_ps(x_pad + offset);
                    __mmask16 pmask = (__mmask16)((pos >> (g * 16)) & 0xFFFF);
                    __mmask16 nmask = (__mmask16)((neg >> (g * 16)) & 0xFFFF);
                    acc = _mm512_mask_add_ps(acc, pmask, acc, xv);
                    acc = _mm512_mask_sub_ps(acc, nmask, acc, xv);
                }
            }
            total += _mm512_reduce_add_ps(acc);
        }
        y[i] = total * scales[i];
    }
    free(x_pad);
}
74
+
75
/* RMSNorm: y[i] = x[i] * rsqrt(mean(x^2) + eps) * weight[i].
 * Vectorized main loops with scalar tails for the dim % 16 leftovers. */
void rmsnorm_avx512(
    const float *x, const float *weight, float *y, int dim, float eps
) {
    /* Sum of squares: 16-lane FMA accumulation, then horizontal reduce. */
    __m512 sum_sq = _mm512_setzero_ps();
    int i;
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        sum_sq = _mm512_fmadd_ps(xv, xv, sum_sq);
    }
    float ss = _mm512_reduce_add_ps(sum_sq);
    for (; i < dim; i++) ss += x[i] * x[i];   /* scalar tail */
    float rms = 1.0f / sqrtf(ss / dim + eps);
    /* Apply normalization and the learned per-channel weight. */
    for (i = 0; i + 16 <= dim; i += 16) {
        __m512 xv = _mm512_loadu_ps(x + i);
        __m512 wv = _mm512_loadu_ps(weight + i);
        __m512 rv = _mm512_set1_ps(rms);
        _mm512_storeu_ps(y + i, _mm512_mul_ps(_mm512_mul_ps(xv, rv), wv));
    }
    for (; i < dim; i++) y[i] = x[i] * rms * weight[i];   /* scalar tail */
}
95
+
96
+ void silu_avx512(float *x, int n) {
97
+ for (int i = 0; i < n; i++) {
98
+ float v = x[i];
99
+ x[i] = v / (1.0f + expf(-v));
100
+ }
101
+ }
102
+
103
/* Element-wise product c[i] = a[i] * b[i] (used for SwiGLU gate * up).
 * 16-lane vector loop with a scalar tail for n % 16 leftovers. */
void elemwise_mul_avx512(const float *a, const float *b, float *c, int n) {
    int i;
    for (i = 0; i + 16 <= n; i += 16) {
        __m512 av = _mm512_loadu_ps(a + i);
        __m512 bv = _mm512_loadu_ps(b + i);
        _mm512_storeu_ps(c + i, _mm512_mul_ps(av, bv));
    }
    for (; i < n; i++) c[i] = a[i] * b[i];   /* scalar tail */
}
112
+
113
+ void softmax_avx512(float *x, int n) {
114
+ float max_val = x[0];
115
+ for (int i = 1; i < n; i++) if (x[i] > max_val) max_val = x[i];
116
+ float sum = 0.0f;
117
+ for (int i = 0; i < n; i++) { x[i] = expf(x[i] - max_val); sum += x[i]; }
118
+ float inv = 1.0f / sum;
119
+ for (int i = 0; i < n; i++) x[i] *= inv;
120
+ }
unary_loader.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Thin Python loader for the Unary C Engine.
4
+ Loads weights from disk, passes pointers to C, calls C generate().
5
+ ZERO Python in the inference hot path.
6
+
7
+ (c) 2026 OpenTransformers Ltd / Scott Bisset
8
+ """
9
+
10
+ import ctypes
11
+ import numpy as np
12
+ import json
13
+ import os
14
+ import time
15
+ from pathlib import Path
16
+ from transformers import AutoTokenizer
17
+
18
+
19
class UnaryEngine:
    """ctypes wrapper around the unary C inference engine.

    Loads the converted weight files from `model_dir`, hands raw numpy
    pointers to the shared library, and exposes a single `generate()` entry
    point.  All inference runs in C; Python only does file I/O and pointer
    plumbing.  numpy arrays are kept alive in `self._refs` for the lifetime
    of the engine because C holds bare pointers into them.
    """

    def __init__(self, model_dir, so_path="unary_engine.so"):
        self.model_dir = Path(model_dir)
        self.lib = ctypes.CDLL(so_path)
        self._setup_ctypes()

        # Load config
        with open(self.model_dir / "config.json") as f:
            self.config = json.load(f)
        self.n_planes = self.config["n_planes"]

        # Load manifest
        with open(self.model_dir / "manifest.json") as f:
            self.manifest = json.load(f)

        # Allocate model in C
        self.model = self.lib.model_alloc(self.n_planes)

        # Keep references so GC doesn't free numpy arrays
        self._refs = []

        # Load all weights
        self._load_weights()

    def _setup_ctypes(self):
        """Declare arg/return types for every C symbol used below.

        Declaring restype/argtypes before the first call is essential:
        without it ctypes defaults to c_int and 64-bit pointers get
        silently truncated.
        """
        L = self.lib
        L.model_alloc.restype = ctypes.c_void_p
        L.model_alloc.argtypes = [ctypes.c_int]

        L.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        L.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
        L.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

        # layer_set_linears: model, layer_idx, then 7x (sign, planes, scales, out, in), plus n_planes
        args = [ctypes.c_void_p, ctypes.c_int]
        for _ in range(7):
            args += [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
        args.append(ctypes.c_int)
        L.layer_set_linears.argtypes = args

        L.model_reset_cache.argtypes = [ctypes.c_void_p]
        L.model_free.argtypes = [ctypes.c_void_p]

        L.forward_token.restype = ctypes.POINTER(ctypes.c_float)
        L.forward_token.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

        L.generate.restype = ctypes.c_int
        L.generate.argtypes = [
            ctypes.c_void_p,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_void_p, ctypes.c_int,
            ctypes.c_float, ctypes.c_float,
            ctypes.c_int
        ]

    def _keep(self, arr):
        """Keep reference to prevent GC; return the raw data pointer."""
        self._refs.append(arr)
        return arr.ctypes.data

    def _load_fp16(self, key):
        """Load a .fp16 dump as a flat float16 array (passed to C as raw fp16)."""
        path = self.model_dir / (key.replace(".", "_") + ".fp16")
        arr = np.fromfile(str(path), dtype=np.float16)
        return arr

    def _load_fp16_as_f32(self, key):
        """Load a .fp16 dump, widened to float32 (for norms/biases the C side reads as float)."""
        arr = self._load_fp16(key).astype(np.float32)
        self._refs.append(arr)
        return arr

    def _load_unary(self, key):
        """Load sign/planes/scales files for one quantized linear layer.

        Returns (sign, planes, scales, out_dim, in_dim); dims come from the
        manifest since the raw files carry no shape metadata.
        NOTE(review): reads a .scales file -- the group converter writes
        .gscales; this loader pairs with the non-grouped converter. Confirm.
        """
        prefix = str(self.model_dir / key.replace(".", "_"))
        sign = np.fromfile(prefix + ".sign", dtype=np.uint64)
        planes = np.fromfile(prefix + ".planes", dtype=np.uint64)
        scales = np.fromfile(prefix + ".scales", dtype=np.float32)
        self._refs.extend([sign, planes, scales])
        shape = self.manifest["unary"][key]
        return sign, planes, scales, shape[0], shape[1]

    def _load_weights(self):
        """Load every tensor from disk and register its pointer with C."""
        t0 = time.time()

        # Embeddings (kept fp16; the C side reads them as fp16)
        embed = self._load_fp16("model.embed_tokens.weight")
        self._refs.append(embed)
        self.lib.model_set_embed(self.model, embed.ctypes.data)
        print(f" Embeddings: {embed.nbytes/1024/1024:.1f} MB")

        # Final norm
        fnorm = self._load_fp16_as_f32("model.norm.weight")
        self.lib.model_set_final_norm(self.model, fnorm.ctypes.data)

        # LM head
        lm = self._load_fp16("lm_head.weight")
        self._refs.append(lm)
        shape = self.manifest["fp16"]["lm_head.weight"]
        self.lib.model_set_lm_head(self.model, lm.ctypes.data, shape[0], shape[1])
        print(f" LM head: {lm.nbytes/1024/1024:.1f} MB")

        # Layers (28 is fixed for this checkpoint -- see config.json)
        for l in range(28):
            prefix = f"model.layers.{l}"

            # Norms
            in_norm = self._load_fp16_as_f32(f"{prefix}.input_layernorm.weight")
            post_norm = self._load_fp16_as_f32(f"{prefix}.post_attention_layernorm.weight")
            self.lib.layer_set_norms(self.model, l, in_norm.ctypes.data, post_norm.ctypes.data)

            # Biases
            q_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.q_proj.bias")
            k_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.k_proj.bias")
            v_bias = self._load_fp16_as_f32(f"{prefix}.self_attn.v_proj.bias")
            self.lib.layer_set_bias(self.model, l,
                q_bias.ctypes.data, k_bias.ctypes.data, v_bias.ctypes.data)

            # Unary linear layers -- order must match the C signature
            projs = ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj',
                     'self_attn.o_proj', 'mlp.gate_proj', 'mlp.up_proj', 'mlp.down_proj']

            linear_args = []
            for proj in projs:
                key = f"{prefix}.{proj}.weight"
                sign, planes, scales, out_d, in_d = self._load_unary(key)
                linear_args.extend([sign.ctypes.data, planes.ctypes.data,
                                    scales.ctypes.data, out_d, in_d])

            self.lib.layer_set_linears(self.model, l, *linear_args, self.n_planes)

            if (l + 1) % 7 == 0:
                print(f" Loaded {l+1}/28 layers")

        dt = time.time() - t0
        total = sum(a.nbytes for a in self._refs) / 1024 / 1024
        print(f"\nModel loaded in {dt:.1f}s, {total:.0f} MB in Python arrays")

    def generate(self, token_ids, max_new_tokens=256, temperature=0.6, top_p=0.95, eos_token=151643):
        """Run C-side autoregressive generation.

        Args:
            token_ids: prompt token ids (iterable of int).
            max_new_tokens: generation cap.
            temperature / top_p: sampling parameters forwarded to C.
            eos_token: id that terminates generation.

        Returns:
            (generated_token_ids, n_generated, elapsed_seconds).
        """
        self.lib.model_reset_cache(self.model)

        prompt = np.array(token_ids, dtype=np.int32)
        output = np.zeros(max_new_tokens, dtype=np.int32)

        t0 = time.time()
        n_gen = self.lib.generate(
            self.model,
            prompt.ctypes.data, len(prompt),
            output.ctypes.data, max_new_tokens,
            ctypes.c_float(temperature), ctypes.c_float(top_p),
            eos_token
        )
        dt = time.time() - t0

        return output[:n_gen].tolist(), n_gen, dt
174
+
175
+
176
def main():
    """CLI smoke test: load the engine and decode one short chat prompt."""
    import sys
    cli = sys.argv[1:]
    model_dir = cli[0] if cli else "deepseek-r1-1.5b-unary"
    hf_dir = cli[1] if len(cli) > 1 else "deepseek-r1-1.5b-hf"

    print("Loading tokenizer...")
    tok = AutoTokenizer.from_pretrained(hf_dir, trust_remote_code=True)

    print("Loading unary engine...")
    engine = UnaryEngine(model_dir, "./unary_engine.so")

    # Single-turn chat prompt; apply_chat_template returns token ids here.
    chat = [{"role": "user", "content": "What is 2+2?"}]
    prompt_ids = tok.apply_chat_template(chat, add_generation_prompt=True)
    print(f"\nPrompt: {len(prompt_ids)} tokens")
    print("Generating...")

    tokens, n_gen, dt = engine.generate(prompt_ids, max_new_tokens=60, temperature=0.6)
    decoded = tok.decode(tokens)

    print(f"\n--- Output ({n_gen} tokens in {dt:.2f}s = {n_gen/dt:.1f} tok/s) ---")
    print(decoded)
    print("---")


if __name__ == "__main__":
    main()
unary_run.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Unary Engine Runner - Loads weights into the C engine and generates text.
(c) 2026 OpenTransformers Ltd / Scott Bisset
"""
import ctypes, numpy as np, os, sys, time, struct

# Hard-coded paths and model geometry for DeepSeek-R1-Distill 1.5B.
# NOTE(review): these must agree with config.json in MODEL_DIR -- confirm
# before pointing at a different checkpoint.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"
ENGINE = "/root/ternary_engine/unary_engine.so"
N_PLANES = 7
N_LAYERS = 28
HIDDEN = 1536
VOCAB = 151936

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures (restype/argtypes must be set before first use,
# otherwise ctypes truncates 64-bit pointers to c_int).
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,  # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float,  # temperature, top_p
    ctypes.c_int  # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]

def load_fp16_as_uint16(path):
    """Load FP16 file as raw uint16 array (for passing to C as FP16)"""
    return np.fromfile(path, dtype=np.uint16)

def load_fp16_as_f32(path):
    """Load FP16 file and convert to FP32"""
    raw = np.fromfile(path, dtype=np.float16)
    return raw.astype(np.float32)

def load_unary(name):
    """Load sign, planes, scales for a unary layer"""
    base = os.path.join(MODEL_DIR, name)
    sign = np.fromfile(base + ".sign", dtype=np.uint64)
    planes = np.fromfile(base + ".planes", dtype=np.uint64)
    scales = np.fromfile(base + ".scales", dtype=np.float32)
    return sign, planes, scales

# Keep references to prevent GC -- C holds bare pointers into these arrays.
_refs = []

def keep(arr):
    """Keep numpy array alive and return its ctypes pointer"""
    _refs.append(arr)
    return arr.ctypes.data

print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f" embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers -- projection order must match layer_set_linears' signature.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim]
# NOTE(review): hard-coded DeepSeek-R1-Distill-Qwen-1.5B shapes -- confirm
# against manifest.json in MODEL_DIR.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32)
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 256
out_arr = np.zeros(max_new, dtype=np.int32)

lib.model_reset_cache(model)

print("Generating...")
t0 = time.time()
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)
unary_run16.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Unary Engine Runner - Loads weights into the C engine and generates text.
(c) 2026 OpenTransformers Ltd / Scott Bisset

NOTE(review): this file is a copy of unary_run.py with generation capped at
16 tokens (quick smoke test); keep the two in sync or factor out the shared
loader code.
"""
import ctypes, numpy as np, os, sys, time, struct

# Hard-coded paths and model geometry for DeepSeek-R1-Distill 1.5B.
MODEL_DIR = "/root/ternary_engine/deepseek-r1-1.5b-unary"
HF_DIR = "/root/ternary_engine/deepseek-r1-1.5b-hf"
ENGINE = "/root/ternary_engine/unary_engine.so"
N_PLANES = 7
N_LAYERS = 28
HIDDEN = 1536
VOCAB = 151936

# Load engine
lib = ctypes.CDLL(ENGINE)

# Define function signatures (must be set before first use, otherwise
# ctypes truncates 64-bit pointers to c_int).
lib.model_alloc.restype = ctypes.c_void_p
lib.model_alloc.argtypes = [ctypes.c_int]

lib.model_set_embed.restype = None
lib.model_set_embed.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_final_norm.restype = None
lib.model_set_final_norm.argtypes = [ctypes.c_void_p, ctypes.c_void_p]

lib.model_set_lm_head.restype = None
lib.model_set_lm_head.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

lib.layer_set_norms.restype = None
lib.layer_set_norms.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_bias.restype = None
lib.layer_set_bias.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p]

lib.layer_set_linears.restype = None
# 7 linears * 3 args each (sign, planes, scales) + 7 * 2 dims + n_planes = 36 args
lib.layer_set_linears.argtypes = [
    ctypes.c_void_p, ctypes.c_int,  # model, layer_idx
    # q_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # k_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # v_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # o_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # gate_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # up_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    # down_proj
    ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int,
    ctypes.c_int  # n_planes
]

lib.generate.restype = ctypes.c_int
lib.generate.argtypes = [
    ctypes.c_void_p,  # model
    ctypes.c_void_p, ctypes.c_int,  # prompt_ids, prompt_len
    ctypes.c_void_p, ctypes.c_int,  # out_tokens, max_new_tokens
    ctypes.c_float, ctypes.c_float,  # temperature, top_p
    ctypes.c_int  # eos_token
]

lib.model_reset_cache.restype = None
lib.model_reset_cache.argtypes = [ctypes.c_void_p]

lib.model_free.restype = None
lib.model_free.argtypes = [ctypes.c_void_p]

def load_fp16_as_uint16(path):
    """Load FP16 file as raw uint16 array (for passing to C as FP16)"""
    return np.fromfile(path, dtype=np.uint16)

def load_fp16_as_f32(path):
    """Load FP16 file and convert to FP32"""
    raw = np.fromfile(path, dtype=np.float16)
    return raw.astype(np.float32)

def load_unary(name):
    """Load sign, planes, scales for a unary layer"""
    base = os.path.join(MODEL_DIR, name)
    sign = np.fromfile(base + ".sign", dtype=np.uint64)
    planes = np.fromfile(base + ".planes", dtype=np.uint64)
    scales = np.fromfile(base + ".scales", dtype=np.float32)
    return sign, planes, scales

# Keep references to prevent GC -- C holds bare pointers into these arrays.
_refs = []

def keep(arr):
    """Keep numpy array alive and return its ctypes pointer"""
    _refs.append(arr)
    return arr.ctypes.data

print("Allocating model...")
model = lib.model_alloc(N_PLANES)
print(f"Model pointer: {model:#x}")

# Load embeddings (FP16, passed as uint16)
print("Loading embeddings...")
embed = load_fp16_as_uint16(os.path.join(MODEL_DIR, "model_embed_tokens_weight.fp16"))
print(f" embed shape: {embed.shape} ({embed.nbytes/1e6:.1f}MB)")
lib.model_set_embed(model, keep(embed))

# Load final norm (FP16 -> FP32)
print("Loading final norm...")
final_norm = load_fp16_as_f32(os.path.join(MODEL_DIR, "model_norm_weight.fp16"))
lib.model_set_final_norm(model, keep(final_norm))

# Load lm_head (FP16, passed as uint16)
print("Loading lm_head...")
lm_head = load_fp16_as_uint16(os.path.join(MODEL_DIR, "lm_head_weight.fp16"))
lib.model_set_lm_head(model, keep(lm_head), VOCAB, HIDDEN)

# Load layers -- projection order must match layer_set_linears' signature.
PROJ_NAMES = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
              "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]

# Layer dimensions: [out_dim, in_dim]
# NOTE(review): hard-coded DeepSeek-R1-Distill-Qwen-1.5B shapes -- confirm
# against manifest.json in MODEL_DIR.
PROJ_DIMS = {
    "self_attn_q_proj": (1536, 1536),
    "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536),
    "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536),
    "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(N_LAYERS):
    if l % 7 == 0:
        print(f"Loading layer {l}/{N_LAYERS}...")

    # Norms (FP16 -> FP32)
    input_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_input_layernorm_weight.fp16"))
    post_norm = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_post_attention_layernorm_weight.fp16"))
    lib.layer_set_norms(model, l, keep(input_norm), keep(post_norm))

    # Biases (FP16 -> FP32)
    q_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_q_proj_bias.fp16"))
    k_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_k_proj_bias.fp16"))
    v_bias = load_fp16_as_f32(
        os.path.join(MODEL_DIR, f"model_layers_{l}_self_attn_v_proj_bias.fp16"))
    lib.layer_set_bias(model, l, keep(q_bias), keep(k_bias), keep(v_bias))

    # Unary linear layers
    proj_args = []
    for proj_name in PROJ_NAMES:
        weight_name = f"model_layers_{l}_{proj_name}_weight"
        sign, planes, scales = load_unary(weight_name)
        out_dim, in_dim = PROJ_DIMS[proj_name]
        proj_args.extend([keep(sign), keep(planes), keep(scales), out_dim, in_dim])

    lib.layer_set_linears(model, l, *proj_args, N_PLANES)

print("Model loaded!")

# Load tokenizer
print("Loading tokenizer...")
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)
eos_id = tokenizer.eos_token_id
print(f"Tokenizer loaded, EOS={eos_id}")

# Generate
prompt = sys.argv[1] if len(sys.argv) > 1 else "What is 2+2?"
print(f"\nPrompt: {prompt}")

input_ids = tokenizer.encode(prompt, return_tensors=None)
input_arr = np.array(input_ids, dtype=np.int32)
max_new = 16  # short cap: this variant is a 16-token smoke test
out_arr = np.zeros(max_new, dtype=np.int32)

lib.model_reset_cache(model)

print("Generating...")
t0 = time.time()
n_gen = lib.generate(
    model,
    input_arr.ctypes.data, len(input_ids),
    out_arr.ctypes.data, max_new,
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    eos_id
)
dt = time.time() - t0

output_ids = out_arr[:n_gen].tolist()
text = tokenizer.decode(output_ids, skip_special_tokens=False)
tok_s = n_gen / dt if dt > 0 else 0

print(f"\n--- Output ({n_gen} tokens, {dt:.1f}s, {tok_s:.1f} tok/s) ---")
print(text)
print(f"--- End ---")

lib.model_free(model)