/* * CONCATENATIVE UNARY ENGINE * * In base-1, the value IS the count of ones. * Addition = concatenation of bitstreams. * Multiplication = AND + count. * * REPRESENTATION: * Each element of a vector has: * - A sign bit (positive/negative) * - A magnitude = number of 1-bits across K "slots" * * But crucially, when we ADD two unary vectors (residual connection), * we DON'T dequantize-add-requantize. We CONCATENATE the slots. * * If vector A has K_a slots and vector B has K_b slots, * A + B has K_a + K_b slots. The magnitude of element j is * just the total count of 1-bits at position j across ALL slots. * * This means the residual stream GROWS through the network: * After embed: K_0 slots * After layer 1: K_0 + K_attn + K_mlp slots * After layer L: K_0 + L*(K_attn + K_mlp) slots * * No information is ever destroyed by requantization. * * MATMUL: * y = W @ x where W has K_w slots and x has K_x slots. * For each output element y[i]: * For each slot pair (p from W, q from x): * count += popcount(W_slot_p[i] AND x_slot_q AND same_sign) * - popcount(W_slot_p[i] AND x_slot_q AND diff_sign) * Output gets K_out = some fixed number of slots (requantized) * because matmul output magnitude is in a different scale. * * SAME-SIGN ADD (residual): * Just append slots. Zero compute. * For different signs: need cancellation. * In practice residual connections are same-sign-dominant, * so we track sign separately and concat magnitudes, * deferring cancellation to the next norm. * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #define _POSIX_C_SOURCE 199309L #include #include #include #include #include #include #include #include /* ============================================================ * GROWABLE UNARY VECTOR * * The key data structure. Slots can be appended (concat = add). * Each slot is a bitplane of dim bits packed into uint64 chunks. * * sign: uint64[chunks] — per-element sign * slots: uint64[n_slots * chunks] — each slot is chunks uint64s * n_slots: current number of slots (grows via concat) * max_slots: allocated capacity * * For element j: * magnitude = number of slots where bit j is set * value = sign * magnitude * scale * * ============================================================ */ typedef struct { uint64_t *sign; uint64_t *slots; /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */ float scale; /* per-vector scale factor */ int dim; int chunks; /* (dim+63)/64 */ int n_slots; /* current slot count */ int max_slots; /* allocated capacity */ } GrowVec; /* Fixed-size unary matrix (weights don't grow) */ typedef struct { uint64_t *sign; /* [rows * chunks] */ uint64_t *slots; /* [K * rows * chunks] */ float *scales; /* [rows] per-row scale */ int rows, cols, chunks, K; } FixedMat; /* ============================================================ * ALLOCATION * ============================================================ */ GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) { GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec)); v->dim = dim; v->chunks = (dim + 63) / 64; v->n_slots = 0; v->max_slots = max_slots; v->scale = 1.0f; v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t)); memset(v->sign, 0, v->chunks * sizeof(uint64_t)); memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t)); return v; } void gv_free(GrowVec *v) { if (v) { free(v->sign); free(v->slots); free(v); } } FixedMat* fm_alloc(int rows, int cols, int K) { FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat)); m->rows = rows; m->cols = cols; m->K = K; m->chunks = (cols + 63) / 64; m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); m->scales = (float *)aligned_alloc(64, rows * sizeof(float)); memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); return m; } void fm_free(FixedMat *m) { if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); } } /* ============================================================ * FLOAT → UNARY CONVERSION (only at boundaries) * ============================================================ */ void gv_from_float(GrowVec *v, const float *x, int K) { int dim = v->dim, chunks = v->chunks; v->n_slots = K; memset(v->sign, 0, chunks * sizeof(uint64_t)); memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t)); float amax = 0.0f; for (int i = 0; i < dim; i++) { float a = fabsf(x[i]); if (a > amax) amax = a; } if (amax == 0.0f) { v->scale = 1.0f; return; } v->scale = amax / K; float inv = K / amax; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); if (x[i] < 0.0f) v->sign[c] |= bit; int mag = (int)(fabsf(x[i]) * inv + 0.5f); if (mag > K) mag = K; for (int s = 0; s < mag; s++) v->slots[(size_t)s * chunks + c] |= bit; } } void gv_to_float(const GrowVec *v, float *out) { int dim = v->dim, chunks = v->chunks; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); int mag = 0; for (int s = 0; s < v->n_slots; s++) { if (v->slots[(size_t)s * chunks + c] & bit) mag++; } float val = (float)mag * v->scale; out[i] = (v->sign[c] & bit) ? -val : val; } } void fm_from_float(FixedMat *m, const float *data) { int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); for (int r = 0; r < rows; r++) { const float *row = data + (size_t)r * cols; float amax = 0.0f; for (int j = 0; j < cols; j++) { float a = fabsf(row[j]); if (a > amax) amax = a; } if (amax == 0.0f) { m->scales[r] = 1.0f; continue; } m->scales[r] = amax / K; float inv = K / amax; uint64_t *rs = m->sign + (size_t)r * chunks; for (int j = 0; j < cols; j++) { int c = j / 64; uint64_t bit = 1ULL << (j % 64); if (row[j] < 0.0f) rs[c] |= bit; int mag = (int)(fabsf(row[j]) * inv + 0.5f); if (mag > K) mag = K; for (int s = 0; s < mag; s++) m->slots[((size_t)s * rows + r) * chunks + c] |= bit; } } } /* ============================================================ * CONCATENATION = ADDITION * * gv_concat(dst, src): * Appends src's slots to dst. * Same-sign: just append. * Different-sign: cancel bits (remove from both). * * For efficiency with residual connections where scales differ: * We track a "slot_scales" or use a single scale with normalization. * * SIMPLE VERSION: assumes same scale (works after norm). * ============================================================ */ /* Simple concat: append src slots to dst. Handles sign cancellation. */ void gv_concat(GrowVec *dst, const GrowVec *src) { int chunks = dst->chunks; /* For each source slot, process element-wise: * Where signs agree: copy bit to new dst slot * Where signs differ: cancel - find a dst slot with that bit set and clear it * * Optimization: for most transformer residuals, signs mostly agree. * So we do the simple thing: compute per-element sign agreement, * then for agreeing elements just append, for disagreeing elements cancel. */ /* Sign agreement mask */ /* agree[c] = ~(dst_sign[c] ^ src_sign[c]) — bits where signs match */ for (int s = 0; s < src->n_slots; s++) { const uint64_t *src_slot = src->slots + (size_t)s * chunks; /* Split into agree and disagree portions */ int new_slot = dst->n_slots; if (new_slot >= dst->max_slots) { /* Out of room — would need realloc in production */ printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots); return; } uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks; for (int c = 0; c < chunks; c++) { uint64_t src_bits = src_slot[c]; uint64_t agree = ~(dst->sign[c] ^ src->sign[c]); uint64_t disagree = dst->sign[c] ^ src->sign[c]; /* Same sign: just append to new slot */ uint64_t to_add = src_bits & agree; /* Different sign: cancel from existing dst slots */ uint64_t to_cancel = src_bits & disagree; /* Cancel by walking backwards through dst slots */ for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) { uint64_t *dslot = dst->slots + (size_t)d * chunks + c; uint64_t overlap = *dslot & to_cancel; *dslot &= ~overlap; /* clear cancelled bits in dst */ to_cancel &= ~overlap; /* mark as cancelled */ } /* Any remaining to_cancel means src > dst for those elements * — flip the sign and add to new slot */ if (to_cancel) { dst->sign[c] ^= to_cancel; /* flip sign for these elements */ to_add |= to_cancel; } dst_new[c] = to_add; } /* Only increment if new slot is non-empty */ int non_empty = 0; for (int c = 0; c < chunks && !non_empty; c++) if (dst_new[c]) non_empty = 1; if (non_empty) dst->n_slots++; } } /* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */ void gv_concat_fast(GrowVec *dst, const GrowVec *src) { int chunks = dst->chunks; int src_slots = src->n_slots; if (dst->n_slots + src_slots > dst->max_slots) { printf("WARNING: GrowVec overflow\n"); src_slots = dst->max_slots - dst->n_slots; } /* Just memcpy the slots — handles same-sign correctly, * defers opposite-sign cancellation to next norm */ memcpy(dst->slots + (size_t)dst->n_slots * chunks, src->slots, (size_t)src_slots * chunks * sizeof(uint64_t)); dst->n_slots += src_slots; } /* ============================================================ * MATMUL: y = M @ x * * M is fixed (K_w slots), x is growable (n_slots slots). * Output is a NEW GrowVec with K_out slots. * * Core: for each output element i, accumulate: * acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign) * - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign) * * Then quantize acc to K_out unary slots. * ============================================================ */ void gv_matmul( const FixedMat *M, const GrowVec *x, GrowVec *y, /* output — gets filled with K_out slots */ int K_out /* how many output slots */ ) { int out_dim = M->rows; int chunks = M->chunks; int wK = M->K; int xK = x->n_slots; float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); #pragma omp parallel for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; long long acc = 0; for (int c = 0; c < chunks; c++) { uint64_t ws = w_sign_row[c]; uint64_t xs = x->sign[c]; uint64_t same = ~(ws ^ xs); uint64_t diff = ws ^ xs; for (int p = 0; p < wK; p++) { uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c]; for (int q = 0; q < xK; q++) { uint64_t xq = x->slots[(size_t)q * chunks + c]; uint64_t active = wp & xq; acc += __builtin_popcountll(active & same) - __builtin_popcountll(active & diff); } } } y_float[i] = (float)acc * M->scales[i] * x->scale; } /* Quantize to K_out slots */ gv_from_float(y, y_float, K_out); free(y_float); } /* ============================================================ * NORM: GrowVec → GrowVec with controlled slot count * * RMSNorm dequantizes (counting), normalizes (float), * then requantizes to a fixed K. * This is where slot count gets reset. * ============================================================ */ void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) { int dim = x->dim; float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); gv_to_float(x, xf); float ss = 0.0f; for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; float rms = 1.0f / sqrtf(ss / dim + eps); for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i]; gv_from_float(out, xf, K_out); free(xf); } /* ============================================================ * SILU_MUL: out = SiLU(gate) * up * Dequant, compute, requant. O(dim). * ============================================================ */ void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) { int dim = gate->dim; float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); gv_to_float(gate, gf); gv_to_float(up, uf); for (int i = 0; i < dim; i++) gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; gv_from_float(out, gf, K_out); free(gf); free(uf); } /* ============================================================ * TEST: demonstrate growing residual stream * ============================================================ */ void test_concat_add() { printf("=== CONCATENATION = ADDITION TEST ===\n\n"); int dim = 16; /* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */ float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4}; float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1}; GrowVec *a = gv_alloc(dim, 8, 64); GrowVec *b = gv_alloc(dim, 8, 64); gv_from_float(a, a_vals, 8); gv_from_float(b, b_vals, 8); printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale); float af[16], bf[16]; gv_to_float(a, af); printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n"); printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n"); printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale); gv_to_float(b, bf); printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n"); printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n"); /* Concatenate (= add) */ printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots); /* Need same scale for concat to work correctly */ /* In a real network, both come from norm so they have comparable scale */ /* For this test, use fast concat (no cancellation) */ gv_concat(a, b); printf(" -> %d):\n", a->n_slots); float result[16], ref[16]; gv_to_float(a, result); for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i]; /* NOTE: concat addition only works correctly when scales match. * When scales differ, we'd need to adjust. In a transformer, * the norm before each sublayer ensures comparable scales. */ printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n"); printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n"); gv_free(a); gv_free(b); } void test_growing_residual() { printf("\n=== GROWING RESIDUAL STREAM TEST ===\n"); printf("Simulating 6 transformer layers with concat residuals\n\n"); int dim = 2560; int K_embed = 16; /* initial embedding quantization */ int K_sublayer = 8; /* each sublayer output */ int n_layers = 6; /* Create random embedding */ float *embed = (float *)malloc(dim * sizeof(float)); srand(42); for (int i = 0; i < dim; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */ int max_slots = K_embed + n_layers * 2 * K_sublayer + 64; GrowVec *residual = gv_alloc(dim, K_embed, max_slots); gv_from_float(residual, embed, K_embed); printf("After embedding: %d slots (%.1f KB)\n", residual->n_slots, (float)residual->n_slots * residual->chunks * 8 / 1024); for (int l = 0; l < n_layers; l++) { /* Simulate attention output */ GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer); float *fake_attn = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < dim; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f; } gv_from_float(attn_out, fake_attn, K_sublayer); /* Scale must match for concat to work — in real net, norm handles this */ attn_out->scale = residual->scale; /* RESIDUAL ADD = CONCATENATION */ gv_concat_fast(residual, attn_out); /* Simulate MLP output */ GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer); float *fake_mlp = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < dim; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f; } gv_from_float(mlp_out, fake_mlp, K_sublayer); mlp_out->scale = residual->scale; /* RESIDUAL ADD = CONCATENATION */ gv_concat_fast(residual, mlp_out); printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n", l + 1, residual->n_slots, (float)residual->n_slots * residual->chunks * 8 / 1024, K_sublayer, K_sublayer); gv_free(attn_out); gv_free(mlp_out); free(fake_attn); free(fake_mlp); } printf("\nResidual grew from %d to %d slots through %d layers\n", K_embed, residual->n_slots, n_layers); printf("Information accumulated, never lost to requantization\n"); gv_free(residual); free(embed); } void test_matmul_accuracy() { printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n"); int rows = 512, cols = 2560; int wK = 32; printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK); printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms"); srand(42); float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); float *xf = (float *)malloc(cols * sizeof(float)); float *y_ref = (float *)calloc(rows, sizeof(float)); for (size_t i = 0; i < (size_t)rows * cols; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } for (int i = 0; i < cols; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; FixedMat *M = fm_alloc(rows, cols, wK); fm_from_float(M, Mf); /* Test with different x slot counts (simulating growing residual) */ int x_slots[] = {8, 16, 32, 48, 64, 96}; for (int t = 0; t < 6; t++) { int xK = x_slots[t]; GrowVec *x = gv_alloc(cols, xK, xK); GrowVec *y = gv_alloc(rows, xK, xK); gv_from_float(x, xf, xK); struct timespec t0, t1; float *yf = (float *)malloc(rows * sizeof(float)); clock_gettime(CLOCK_MONOTONIC, &t0); gv_matmul(M, x, y, xK); clock_gettime(CLOCK_MONOTONIC, &t1); double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6; gv_to_float(y, yf); float dot = 0, na = 0, nb = 0, noise = 0; for (int i = 0; i < rows; i++) { dot += y_ref[i] * yf[i]; na += y_ref[i] * y_ref[i]; nb += yf[i] * yf[i]; float e = y_ref[i] - yf[i]; noise += e * e; } float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); float snr = 10.0f * log10f(na / (noise + 1e-10f)); printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms); gv_free(x); gv_free(y); free(yf); } fm_free(M); free(Mf); free(xf); free(y_ref); } int main() { printf("========================================\n"); printf(" CONCATENATIVE UNARY ENGINE TESTS\n"); printf(" Addition = Concatenation\n"); printf(" Value = Count of Ones\n"); printf("========================================\n"); test_concat_add(); test_growing_residual(); test_matmul_accuracy(); printf("\n=== ALL TESTS DONE ===\n"); return 0; }