/* * LOG-UNARY TRANSFORMER ENGINE * * Unary base-1 with logarithmic compression: * Linear unary: value 7 = 1111111 (7 planes, each = +1) * Log unary: value 8 = 111 (3 planes, plane p = 2^p) * * Matmul kernel: acc += popcount(w_plane[p] AND x_plane[q]) << (p+q) * Still pure AND+popcount+shift, no float in hot path. * * 3 log-planes = values {0,1,2,4} with sign = {-4..+4} = 9 levels * 4 log-planes = values {0,1,2,4,8} with sign = {-8..+8} = 17 levels * 5 log-planes = values {0,1,2,4,8,16} with sign = {-16..+16} = 33 levels * * vs linear 7 planes = {-7..+7} = 15 levels using 7 planes * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #include #include #include #include #include #include #include #include #define MAX_SEQ 4096 #define RMS_EPS 1e-6f /* ============================================================ * Config * ============================================================ */ typedef struct { int hidden; int inter; int n_heads; int n_kv_heads; int head_dim; int n_layers; int vocab; float rope_theta; int tie_embeddings; int w_planes; /* weight log-planes */ int a_planes; /* activation log-planes */ } Config; /* Log-unary weight matrix */ typedef struct { uint64_t *sign_bits; /* [out_dim * chunks] */ uint64_t *log_planes; /* [n_planes][out_dim * chunks] - plane p = 2^p */ float *scales; /* [out_dim] */ int out_dim; int in_dim; int n_planes; int chunks; } LogUnaryWeight; /* Transformer layer */ typedef struct { LogUnaryWeight q_proj, k_proj, v_proj, o_proj; LogUnaryWeight gate_proj, up_proj, down_proj; float *input_norm; float *post_norm; float *q_norm, *k_norm; } Layer; /* Full model */ typedef struct { Config cfg; uint16_t *embed; Layer *layers; float *final_norm; /* KV cache */ float *k_cache; float *v_cache; /* Float scratch (O(dim) ops only) */ float *hidden; float *normed; float *q_float; float *k_float; float *v_float; float *attn_out; float *gate_float; float *up_float; float *mlp_act; float *logits; float *attn_scores; /* Unary scratch for activation quantization */ uint64_t *act_sign; uint64_t *act_planes; /* Larger scratch for intermediate dim */ uint64_t *mlp_act_sign; uint64_t *mlp_act_planes; } Model; /* ============================================================ * LOG-UNARY ACTIVATION QUANTIZATION * * Encode float value as sign + log-magnitude planes * Plane p is set if |x| >= threshold_p * threshold_p = scale * 2^p / max_level * * Effectively: compute integer magnitude = round(|x|/scale * max_level) * Then decompose into binary: if bit p is set in magnitude, plane p is set * * Wait — that's just BINARY encoding of the magnitude! * Log-unary IS binary representation stored as separate bitplanes. * The magic is that AND+popcount+shift MULTIPLIES them. * ============================================================ */ static void quantize_log_unary( const float *x, int dim, int n_planes, uint64_t *sign_out, uint64_t *planes_out, float *scale_out ) { int chunks = (dim + 63) / 64; int max_level = (1 << n_planes) - 1; /* 2^n - 1 */ /* Find absmax */ float amax = 0.0f; for (int i = 0; i < dim; i++) { float a = fabsf(x[i]); if (a > amax) amax = a; } if (amax == 0.0f) amax = 1.0f; *scale_out = amax / max_level; memset(sign_out, 0, chunks * sizeof(uint64_t)); memset(planes_out, 0, (size_t)n_planes * chunks * sizeof(uint64_t)); float inv_scale = max_level / amax; for (int i = 0; i < dim; i++) { int chunk = i / 64; int bit = i % 64; uint64_t mask = 1ULL << bit; if (x[i] < 0.0f) sign_out[chunk] |= mask; int mag = (int)(fabsf(x[i]) * inv_scale + 0.5f); if (mag > max_level) mag = max_level; /* Binary decomposition: plane p gets bit p of magnitude */ for (int p = 0; p < n_planes; p++) { if (mag & (1 << p)) planes_out[(size_t)p * chunks + chunk] |= mask; } } } /* ============================================================ * LOG-UNARY MATVEC: y = W @ x * * W: log-unary (sign + wp log-planes, scales) * x: log-unary (sign + xp log-planes, scale) * * For each output element i: * acc = 0 * for each chunk c: * same = ~(w_sign[c] ^ x_sign[c]) * diff = w_sign[c] ^ x_sign[c] * for p in 0..wp-1: * for q in 0..xp-1: * active = w_plane[p][c] & x_plane[q][c] * pos = popcount(active & same) * neg = popcount(active & diff) * acc += (pos - neg) << (p + q) <-- THE KEY: shift by p+q * y[i] = acc * w_scale[i] * x_scale * ============================================================ */ static void log_unary_matvec( const LogUnaryWeight *W, const uint64_t *x_sign, const uint64_t *x_planes, float x_scale, int x_n_planes, float *y_out ) { int out_dim = W->out_dim; int chunks = W->chunks; int wp = W->n_planes; int xp = x_n_planes; #pragma omp parallel for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { const uint64_t *w_sign_row = W->sign_bits + (size_t)i * chunks; long long acc = 0; for (int c = 0; c < chunks; c++) { uint64_t ws = w_sign_row[c]; uint64_t xs = x_sign[c]; uint64_t same = ~(ws ^ xs); uint64_t diff = ws ^ xs; for (int p = 0; p < wp; p++) { uint64_t w_mag = W->log_planes[((size_t)p * out_dim + i) * chunks + c]; for (int q = 0; q < xp; q++) { uint64_t x_mag = x_planes[(size_t)q * chunks + c]; uint64_t active = w_mag & x_mag; if (!active) continue; /* skip zero — common with log encoding */ uint64_t pos = active & same; uint64_t neg = active & diff; int shift = p + q; acc += (long long)(__builtin_popcountll(pos) - __builtin_popcountll(neg)) << shift; } } } y_out[i] = (float)acc * W->scales[i] * x_scale; } } /* ============================================================ * FP16 ops (embedding, lm_head) — not in the critical per-layer path * ============================================================ */ static void embed_token(const uint16_t *embed, int token_id, float *out, int hidden) { const uint16_t *row = embed + (size_t)token_id * hidden; int i; for (i = 0; i + 16 <= hidden; i += 16) { __m256i h = _mm256_loadu_si256((__m256i*)(row + i)); __m512 fv = _mm512_cvtph_ps(h); _mm512_storeu_ps(out + i, fv); } for (; i < hidden; i++) { __m128i hv = _mm_set1_epi16(row[i]); __m128 fv = _mm_cvtph_ps(hv); _mm_store_ss(out + i, fv); } } static void fp16_matvec(const uint16_t *w, const float *x, float *y, int out_dim, int in_dim) { #pragma omp parallel for schedule(dynamic, 256) for (int i = 0; i < out_dim; i++) { __m512 acc = _mm512_setzero_ps(); int j; for (j = 0; j + 16 <= in_dim; j += 16) { __m256i h = _mm256_loadu_si256((__m256i*)(w + (size_t)i * in_dim + j)); __m512 wv = _mm512_cvtph_ps(h); __m512 xv = _mm512_loadu_ps(x + j); acc = _mm512_fmadd_ps(wv, xv, acc); } float sum = _mm512_reduce_add_ps(acc); for (; j < in_dim; j++) { __m128i hv = _mm_set1_epi16(w[(size_t)i * in_dim + j]); __m128 fv = _mm_cvtph_ps(hv); float wf; _mm_store_ss(&wf, fv); sum += wf * x[j]; } y[i] = sum; } } /* ============================================================ * O(dim) float ops — RMSNorm, SiLU, Softmax, RoPE, residual * ============================================================ */ static void rmsnorm(const float *x, const float *w, float *y, int dim) { float ss = 0.0f; for (int i = 0; i < dim; i++) ss += x[i] * x[i]; float rms = 1.0f / sqrtf(ss / dim + RMS_EPS); for (int i = 0; i < dim; i++) y[i] = x[i] * rms * w[i]; } static void silu_mul(const float *gate, const float *up, float *out, int n) { for (int i = 0; i < n; i++) out[i] = (gate[i] / (1.0f + expf(-gate[i]))) * up[i]; } static void vec_add(float *y, const float *x, int n) { for (int i = 0; i < n; i++) y[i] += x[i]; } static void apply_rope(float *vec, int pos, int dim, float theta) { for (int i = 0; i < dim; i += 2) { float freq = 1.0f / powf(theta, (float)i / dim); float angle = pos * freq; float co = cosf(angle), si = sinf(angle); float v0 = vec[i], v1 = vec[i+1]; vec[i] = v0*co - v1*si; vec[i+1] = v0*si + v1*co; } } static void softmax(float *x, int n) { float mx = x[0]; for (int i = 1; i < n; i++) if (x[i] > mx) mx = x[i]; float sum = 0.0f; for (int i = 0; i < n; i++) { x[i] = expf(x[i] - mx); sum += x[i]; } float inv = 1.0f / sum; for (int i = 0; i < n; i++) x[i] *= inv; } static float* kv_ptr(float *cache, const Config *c, int layer, int pos, int kv_head) { return cache + ((size_t)layer * MAX_SEQ * c->n_kv_heads + (size_t)pos * c->n_kv_heads + kv_head) * c->head_dim; } /* ============================================================ * ATTENTION * ============================================================ */ static void attention(Model *m, int layer_idx, int pos) { Config *c = &m->cfg; Layer *L = &m->layers[layer_idx]; int heads_per_kv = c->n_heads / c->n_kv_heads; int hidden_chunks = (c->hidden + 63) / 64; float act_scale; /* Quantize normed hidden -> log-unary */ quantize_log_unary(m->normed, c->hidden, c->a_planes, m->act_sign, m->act_planes, &act_scale); /* Q, K, V — log-unary matmul */ log_unary_matvec(&L->q_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->q_float); log_unary_matvec(&L->k_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->k_float); log_unary_matvec(&L->v_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->v_float); /* QK-Norm */ if (L->q_norm) for (int h = 0; h < c->n_heads; h++) rmsnorm(m->q_float + h*c->head_dim, L->q_norm, m->q_float + h*c->head_dim, c->head_dim); if (L->k_norm) for (int h = 0; h < c->n_kv_heads; h++) rmsnorm(m->k_float + h*c->head_dim, L->k_norm, m->k_float + h*c->head_dim, c->head_dim); /* RoPE */ for (int h = 0; h < c->n_heads; h++) apply_rope(m->q_float + h*c->head_dim, pos, c->head_dim, c->rope_theta); for (int h = 0; h < c->n_kv_heads; h++) apply_rope(m->k_float + h*c->head_dim, pos, c->head_dim, c->rope_theta); /* KV cache store */ for (int h = 0; h < c->n_kv_heads; h++) { memcpy(kv_ptr(m->k_cache, c, layer_idx, pos, h), m->k_float + h*c->head_dim, c->head_dim * sizeof(float)); memcpy(kv_ptr(m->v_cache, c, layer_idx, pos, h), m->v_float + h*c->head_dim, c->head_dim * sizeof(float)); } /* Attention dot products + softmax + weighted sum */ float scale = 1.0f / sqrtf((float)c->head_dim); memset(m->attn_out, 0, c->n_heads * c->head_dim * sizeof(float)); for (int h = 0; h < c->n_heads; h++) { int kv_h = h / heads_per_kv; float *qh = m->q_float + h*c->head_dim; float *oh = m->attn_out + h*c->head_dim; for (int t = 0; t <= pos; t++) { float *kc = kv_ptr(m->k_cache, c, layer_idx, t, kv_h); float dot = 0.0f; for (int d = 0; d < c->head_dim; d++) dot += qh[d] * kc[d]; m->attn_scores[t] = dot * scale; } softmax(m->attn_scores, pos + 1); for (int t = 0; t <= pos; t++) { float w = m->attn_scores[t]; if (w < 1e-8f) continue; float *vc = kv_ptr(m->v_cache, c, layer_idx, t, kv_h); for (int d = 0; d < c->head_dim; d++) oh[d] += w * vc[d]; } } /* O projection — quantize attn_out, then log-unary matmul */ int o_dim = c->n_heads * c->head_dim; int o_chunks = (o_dim + 63) / 64; uint64_t *o_sign = (uint64_t *)aligned_alloc(64, o_chunks * sizeof(uint64_t)); uint64_t *o_planes = (uint64_t *)aligned_alloc(64, (size_t)c->a_planes * o_chunks * sizeof(uint64_t)); float o_scale; quantize_log_unary(m->attn_out, o_dim, c->a_planes, o_sign, o_planes, &o_scale); float *o_tmp = m->normed; /* reuse */ log_unary_matvec(&L->o_proj, o_sign, o_planes, o_scale, c->a_planes, o_tmp); memcpy(m->attn_out, o_tmp, c->hidden * sizeof(float)); free(o_sign); free(o_planes); } /* ============================================================ * MLP * ============================================================ */ static void mlp(Model *m, int layer_idx) { Config *c = &m->cfg; Layer *L = &m->layers[layer_idx]; int hidden_chunks = (c->hidden + 63) / 64; int inter_chunks = (c->inter + 63) / 64; float act_scale, mlp_scale; /* Quantize normed input */ quantize_log_unary(m->normed, c->hidden, c->a_planes, m->act_sign, m->act_planes, &act_scale); /* Gate + Up — log-unary */ log_unary_matvec(&L->gate_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->gate_float); log_unary_matvec(&L->up_proj, m->act_sign, m->act_planes, act_scale, c->a_planes, m->up_float); /* SiLU(gate) * up */ silu_mul(m->gate_float, m->up_float, m->mlp_act, c->inter); /* Quantize for down projection */ quantize_log_unary(m->mlp_act, c->inter, c->a_planes, m->mlp_act_sign, m->mlp_act_planes, &mlp_scale); /* Down — log-unary */ log_unary_matvec(&L->down_proj, m->mlp_act_sign, m->mlp_act_planes, mlp_scale, c->a_planes, m->normed); } /* ============================================================ * FORWARD * ============================================================ */ float* forward_token(Model *m, int token_id, int pos) { Config *c = &m->cfg; embed_token(m->embed, token_id, m->hidden, c->hidden); for (int l = 0; l < c->n_layers; l++) { rmsnorm(m->hidden, m->layers[l].input_norm, m->normed, c->hidden); attention(m, l, pos); vec_add(m->hidden, m->attn_out, c->hidden); rmsnorm(m->hidden, m->layers[l].post_norm, m->normed, c->hidden); mlp(m, l); vec_add(m->hidden, m->normed, c->hidden); } rmsnorm(m->hidden, m->final_norm, m->normed, c->hidden); if (c->tie_embeddings) fp16_matvec(m->embed, m->normed, m->logits, c->vocab, c->hidden); return m->logits; } /* ============================================================ * SAMPLING * ============================================================ */ static int sample_top_p(float *logits, int vocab, float temperature, float top_p) { if (temperature > 0) { float inv_t = 1.0f / temperature; for (int i = 0; i < vocab; i++) logits[i] *= inv_t; } softmax(logits, vocab); float *probs = (float *)malloc(vocab * sizeof(float)); int *indices = (int *)malloc(vocab * sizeof(int)); memcpy(probs, logits, vocab * sizeof(float)); for (int i = 0; i < vocab; i++) indices[i] = i; int n = 0; float cum = 0.0f; while (cum < top_p && n < vocab) { int best = n; for (int i = n+1; i < vocab; i++) if (probs[i] > probs[best]) best = i; float t = probs[n]; probs[n] = probs[best]; probs[best] = t; int ti = indices[n]; indices[n] = indices[best]; indices[best] = ti; cum += probs[n]; n++; if (n >= 40) break; } float sum = 0; for (int i = 0; i < n; i++) sum += probs[i]; float r = (float)rand() / RAND_MAX * sum; float a = 0; int ch = indices[0]; for (int i = 0; i < n; i++) { a += probs[i]; if (a >= r) { ch = indices[i]; break; } } free(probs); free(indices); return ch; } int generate(Model *m, const int *prompt, int plen, int *out, int max_new, float temperature, float top_p, int eos) { srand(time(NULL)); for (int i = 0; i < plen; i++) forward_token(m, prompt[i], i); int pos = plen, gen = 0; for (int t = 0; t < max_new; t++) { int next; if (temperature <= 0) { next = 0; for (int i = 1; i < m->cfg.vocab; i++) if (m->logits[i] > m->logits[next]) next = i; } else { next = sample_top_p(m->logits, m->cfg.vocab, temperature, top_p); } out[t] = next; gen++; if (next == eos) break; forward_token(m, next, pos); pos++; } return gen; } /* ============================================================ * ALLOCATION * ============================================================ */ Model* model_alloc( int w_planes, int a_planes, int hidden, int inter, int n_heads, int n_kv_heads, int head_dim, int n_layers, int vocab, float rope_theta, int tie_embeddings ) { Model *m = (Model *)calloc(1, sizeof(Model)); Config *c = &m->cfg; c->hidden = hidden; c->inter = inter; c->n_heads = n_heads; c->n_kv_heads = n_kv_heads; c->head_dim = head_dim; c->n_layers = n_layers; c->vocab = vocab; c->rope_theta = rope_theta; c->tie_embeddings = tie_embeddings; c->w_planes = w_planes; c->a_planes = a_planes; m->layers = (Layer *)calloc(n_layers, sizeof(Layer)); size_t kv_size = (size_t)n_layers * MAX_SEQ * n_kv_heads * head_dim; m->k_cache = (float *)calloc(kv_size, sizeof(float)); m->v_cache = (float *)calloc(kv_size, sizeof(float)); int max_dim = inter > hidden ? inter : hidden; m->hidden = (float *)aligned_alloc(64, hidden * sizeof(float)); m->normed = (float *)aligned_alloc(64, max_dim * sizeof(float)); m->q_float = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float)); m->k_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float)); m->v_float = (float *)aligned_alloc(64, n_kv_heads * head_dim * sizeof(float)); m->attn_out = (float *)aligned_alloc(64, n_heads * head_dim * sizeof(float)); m->gate_float = (float *)aligned_alloc(64, inter * sizeof(float)); m->up_float = (float *)aligned_alloc(64, inter * sizeof(float)); m->mlp_act = (float *)aligned_alloc(64, inter * sizeof(float)); m->logits = (float *)aligned_alloc(64, vocab * sizeof(float)); m->attn_scores = (float *)aligned_alloc(64, MAX_SEQ * sizeof(float)); m->final_norm = (float *)aligned_alloc(64, hidden * sizeof(float)); /* Unary scratch for hidden dim */ int h_chunks = (hidden + 63) / 64; m->act_sign = (uint64_t *)aligned_alloc(64, h_chunks * sizeof(uint64_t)); m->act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * h_chunks * sizeof(uint64_t)); /* Unary scratch for intermediate dim */ int i_chunks = (inter + 63) / 64; m->mlp_act_sign = (uint64_t *)aligned_alloc(64, i_chunks * sizeof(uint64_t)); m->mlp_act_planes = (uint64_t *)aligned_alloc(64, (size_t)a_planes * i_chunks * sizeof(uint64_t)); int w_max = (1 << w_planes) - 1; int a_max = (1 << a_planes) - 1; printf("LOG-UNARY ENGINE\n"); printf(" Model: hidden=%d inter=%d heads=%d/%d layers=%d vocab=%d\n", hidden, inter, n_heads, n_kv_heads, n_layers, vocab); printf(" Weight: %d log-planes -> %d levels (range -%d..+%d)\n", w_planes, 2*w_max+1, w_max, w_max); printf(" Activation: %d log-planes -> %d levels (range -%d..+%d)\n", a_planes, 2*a_max+1, a_max, a_max); printf(" Plane pairs per element: %d (vs %d linear)\n", w_planes * a_planes, 7 * 4); printf(" KV cache: %zu MB\n", kv_size * 2 * sizeof(float) / (1024*1024)); return m; } /* Weight setters */ void model_set_embed(Model *m, uint16_t *data) { m->embed = data; } void model_set_final_norm(Model *m, float *data) { memcpy(m->final_norm, data, m->cfg.hidden * sizeof(float)); } void layer_set_norms(Model *m, int l, float *in_norm, float *post_norm) { m->layers[l].input_norm = in_norm; m->layers[l].post_norm = post_norm; } void layer_set_qk_norm(Model *m, int l, float *q_norm, float *k_norm) { m->layers[l].q_norm = q_norm; m->layers[l].k_norm = k_norm; } static void init_weight(LogUnaryWeight *w, uint64_t *sign, uint64_t *planes, float *scales, int out_dim, int in_dim, int n_planes) { w->sign_bits = sign; w->log_planes = planes; w->scales = scales; w->out_dim = out_dim; w->in_dim = in_dim; w->n_planes = n_planes; w->chunks = (in_dim + 63) / 64; } void layer_set_linears( Model *m, int l, uint64_t *q_s, uint64_t *q_p, float *q_sc, int q_out, int q_in, uint64_t *k_s, uint64_t *k_p, float *k_sc, int k_out, int k_in, uint64_t *v_s, uint64_t *v_p, float *v_sc, int v_out, int v_in, uint64_t *o_s, uint64_t *o_p, float *o_sc, int o_out, int o_in, uint64_t *g_s, uint64_t *g_p, float *g_sc, int g_out, int g_in, uint64_t *u_s, uint64_t *u_p, float *u_sc, int u_out, int u_in, uint64_t *d_s, uint64_t *d_p, float *d_sc, int d_out, int d_in, int n_planes ) { init_weight(&m->layers[l].q_proj, q_s, q_p, q_sc, q_out, q_in, n_planes); init_weight(&m->layers[l].k_proj, k_s, k_p, k_sc, k_out, k_in, n_planes); init_weight(&m->layers[l].v_proj, v_s, v_p, v_sc, v_out, v_in, n_planes); init_weight(&m->layers[l].o_proj, o_s, o_p, o_sc, o_out, o_in, n_planes); init_weight(&m->layers[l].gate_proj, g_s, g_p, g_sc, g_out, g_in, n_planes); init_weight(&m->layers[l].up_proj, u_s, u_p, u_sc, u_out, u_in, n_planes); init_weight(&m->layers[l].down_proj, d_s, d_p, d_sc, d_out, d_in, n_planes); } void model_reset_cache(Model *m) { size_t kv_size = (size_t)m->cfg.n_layers * MAX_SEQ * m->cfg.n_kv_heads * m->cfg.head_dim; memset(m->k_cache, 0, kv_size * sizeof(float)); memset(m->v_cache, 0, kv_size * sizeof(float)); }