/* * PROPER UNARY — ONE QUANTUM, NO SCALES * * Every single bit in the entire system has weight = 1 quantum. * The quantum is set ONCE for the whole model. * There are NO per-vector scales. NO per-row scales. * * The number 5.0 with quantum=0.1 is stored as 50 ones. * The number 5.0 with quantum=0.01 is stored as 500 ones. * More precision = more bits. That's the tradeoff. * * ADDITION = CONCATENATION. Always. No exceptions. * Because every bit everywhere means the same thing. * * MATMUL: y[i] = sum_j W[i][j] * x[j] * = sum over all (w_slot, x_slot) pairs: * popcount(w_slot[i] AND x_slot AND same_sign) * quantum² * - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum² * = quantum² * integer_count * * Output quantum = input_quantum² (magnitude grows) * Or we pick output quantum = input_quantum and accept * that the integer count includes the scaling. * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #define _POSIX_C_SOURCE 199309L #include #include #include #include #include #include #include #include /* ============================================================ * PROPER UNARY VECTOR * Every bit = 1 quantum. No local scale. 
* ============================================================ */ typedef struct { uint64_t *sign; /* [chunks] */ uint64_t *slots; /* [n_slots * chunks] */ int dim; int chunks; int n_slots; int cap; /* max slots allocated */ } UVec; /* Proper unary matrix — same quantum as vectors */ typedef struct { uint64_t *sign; /* [rows * chunks] */ uint64_t *slots; /* [K * rows * chunks] */ int rows, cols, chunks, K; } UMat; /* Global system quantum */ typedef struct { float quantum; /* every bit = this much */ /* quantum² is the matmul output unit */ } USystem; /* ============================================================ * ALLOC * ============================================================ */ UVec* uv_new(int dim, int cap) { UVec *v = (UVec *)calloc(1, sizeof(UVec)); v->dim = dim; v->chunks = (dim + 63) / 64; v->n_slots = 0; v->cap = cap; v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); v->slots = (uint64_t *)aligned_alloc(64, (size_t)cap * v->chunks * sizeof(uint64_t)); memset(v->sign, 0, v->chunks * sizeof(uint64_t)); memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t)); return v; } UMat* um_new(int rows, int cols, int K) { UMat *m = (UMat *)calloc(1, sizeof(UMat)); m->rows = rows; m->cols = cols; m->K = K; m->chunks = (cols + 63) / 64; m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); return m; } void uv_free(UVec *v) { if(v){free(v->sign);free(v->slots);free(v);} } void um_free(UMat *m) { if(m){free(m->sign);free(m->slots);free(m);} } /* ============================================================ * QUANTIZE: float → proper unary * * Given global quantum q: * magnitude = round(|value| / q) * That many slots get bit set. * * NO per-vector absmax. NO local scale. 
* Values that exceed K are clipped. * ============================================================ */ void uv_from_float(UVec *v, const float *x, int K, float quantum) { int dim = v->dim, chunks = v->chunks; v->n_slots = K; memset(v->sign, 0, chunks * sizeof(uint64_t)); memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t)); float inv_q = 1.0f / quantum; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); if (x[i] < 0.0f) v->sign[c] |= bit; int mag = (int)(fabsf(x[i]) * inv_q + 0.5f); if (mag > K) mag = K; /* clip */ for (int s = 0; s < mag; s++) v->slots[(size_t)s * chunks + c] |= bit; } } void uv_to_float(const UVec *v, float *out, float quantum) { int dim = v->dim, chunks = v->chunks; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); int mag = 0; for (int s = 0; s < v->n_slots; s++) if (v->slots[(size_t)s * chunks + c] & bit) mag++; out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum; } } void um_from_float(UMat *m, const float *data, float quantum) { int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); float inv_q = 1.0f / quantum; for (int r = 0; r < rows; r++) { const float *row = data + (size_t)r * cols; uint64_t *rs = m->sign + (size_t)r * chunks; for (int j = 0; j < cols; j++) { int c = j / 64; uint64_t bit = 1ULL << (j % 64); if (row[j] < 0.0f) rs[c] |= bit; int mag = (int)(fabsf(row[j]) * inv_q + 0.5f); if (mag > K) mag = K; for (int s = 0; s < mag; s++) m->slots[((size_t)s * rows + r) * chunks + c] |= bit; } } } /* ============================================================ * CONCATENATION = ADDITION * * Since every bit everywhere = same quantum, * appending slots IS adding magnitudes. Period. * * Sign handling: for elements where signs differ, * cancel bits from existing slots. 
* ============================================================ */

/*
 * dst += src, element-wise, by merging src's slot planes into dst.
 *
 * For every src plane and every element whose bit is set in it:
 *   - same sign as dst: the bit is appended in a new dst plane;
 *   - opposite sign:    one set bit of that element is cleared from an
 *                       existing dst plane (searched from the newest plane
 *                       down). If dst has no bit left for that element, its
 *                       sign is flipped and the bit is appended instead.
 * A new plane is only kept (n_slots incremented) if it received any bit.
 *
 * If the two vectors have different dim, nothing is changed and a message is
 * printed. If dst has no free plane left, cancellations are still applied
 * but bits that would need a new plane are dropped; "OVERFLOW" is printed
 * once per call when that happens.
 */
void uv_concat(UVec *dst, const UVec *src)
{
    if (dst->dim != src->dim) {
        /* Planes are indexed with dst's stride; a mismatch would read src out of bounds. */
        printf("DIM MISMATCH: dst=%d src=%d\n", dst->dim, src->dim);
        return;
    }

    const int chunks = dst->chunks;
    /* Snapshot: when dst and src are the same object, n_slots grows while we iterate. */
    const int n_src = src->n_slots;
    int overflow_reported = 0;

    for (int s = 0; s < n_src; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;
        const int have_room = dst->n_slots < dst->cap;
        uint64_t *new_slot =
            have_room ? dst->slots + (size_t)dst->n_slots * chunks : NULL;
        uint64_t appended = 0; /* OR of all bits stored in the new plane */
        uint64_t dropped = 0;  /* OR of all bits lost for lack of a plane */

        for (int c = 0; c < chunks; c++) {
            const uint64_t sb = src_slot[c];
            const uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: straight append */
            uint64_t add = sb & ~disagree;

            /* Different sign: cancel from existing */
            uint64_t cancel = sb & disagree;
            for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
                uint64_t *ds = dst->slots + (size_t)d * chunks + c;
                const uint64_t overlap = *ds & cancel;
                *ds &= ~overlap;
                cancel &= ~overlap;
            }

            if (have_room) {
                /* Leftover cancel = src magnitude exceeds dst, flip sign */
                if (cancel) {
                    dst->sign[c] ^= cancel;
                    add |= cancel;
                }
                new_slot[c] = add;
                appended |= add;
            } else {
                /* Nowhere to store these bits; leave the sign untouched. */
                dropped |= add | cancel;
            }
        }

        /* Keep the new plane only if something was written to it. */
        if (appended)
            dst->n_slots++;

        if (dropped && !overflow_reported) {
            printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
            overflow_reported = 1;
        }
    }
}

/* ============================================================
 * MATMUL: y = M @ x
 *
 * Output unit = quantum² (one quantum from weight × one from activation)
 * The integer accumulator directly gives the value in units of quantum².
 *
 * To keep everything in the same quantum system:
 * y_float[i] = acc * quantum²
 * Then requantize to unary with the SAME global quantum.
* y_mag[i] = acc * quantum² / quantum = acc * quantum * * ============================================================ */ void uv_matmul( const UMat *M, const UVec *x, UVec *y, int K_out, float quantum ) { int out_dim = M->rows; int chunks = M->chunks; int wK = M->K; int xK = x->n_slots; float q2 = quantum * quantum; y->n_slots = K_out; memset(y->sign, 0, y->chunks * sizeof(uint64_t)); memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t)); /* Compute integer dot products */ int *acc = (int *)aligned_alloc(64, out_dim * sizeof(int)); uint8_t *neg = (uint8_t *)calloc(out_dim, 1); #pragma omp parallel for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; long long a = 0; for (int c = 0; c < chunks; c++) { uint64_t same = ~(w_sign_row[c] ^ x->sign[c]); uint64_t diff = w_sign_row[c] ^ x->sign[c]; for (int p = 0; p < wK; p++) { uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c]; for (int q = 0; q < xK; q++) { uint64_t xq = x->slots[(size_t)q * chunks + c]; uint64_t active = wp & xq; a += __builtin_popcountll(active & same) - __builtin_popcountll(active & diff); } } } /* a is in units of quantum² per quantum = a * quantum gives magnitude in quantums */ float val = (float)a * quantum; int mag = (int)(fabsf(val) + 0.5f); if (mag > K_out) mag = K_out; acc[i] = mag; neg[i] = (val < 0.0f) ? 
1 : 0; } /* Encode directly to unary — no float intermediate */ for (int i = 0; i < out_dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); if (neg[i]) y->sign[c] |= bit; for (int s = 0; s < acc[i]; s++) y->slots[(size_t)s * y->chunks + c] |= bit; } free(acc); free(neg); } /* ============================================================ * RMSNORM — resets slot count, keeps same quantum * ============================================================ */ void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) { int dim = x->dim; float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); uv_to_float(x, xf, quantum); float ss = 0.0f; for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; float rms = 1.0f / sqrtf(ss / dim + eps); for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i]; uv_from_float(out, xf, K_out, quantum); free(xf); } /* ============================================================ * TESTS * ============================================================ */ void test_concat_correct() { printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n"); float quantum = 0.25f; /* every bit = 0.25 */ int dim = 8; /* A = [3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0] * In quantum=0.25: magnitudes = [12, 8, 20, 4, 0, 16, 8, 28] * Need K >= 28 slots to hold 7.0 */ float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0}; float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0}; float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0}; int K = 32; UVec *a = uv_new(dim, 128); UVec *b = uv_new(dim, 128); uv_from_float(a, a_vals, K, quantum); uv_from_float(b, b_vals, K, quantum); float a_rec[8], b_rec[8]; uv_to_float(a, a_rec, quantum); uv_to_float(b, b_rec, quantum); printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum); printf("A original: "); for(int i=0;i<8;i++) printf("%6.2f ",a_vals[i]); printf("\n"); printf("A unary: "); for(int i=0;i<8;i++) printf("%6.2f ",a_rec[i]); printf("\n"); printf("B original: "); 
for(int i=0;i<8;i++) printf("%6.2f ",b_vals[i]); printf("\n"); printf("B unary: "); for(int i=0;i<8;i++) printf("%6.2f ",b_rec[i]); printf("\n\n"); printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots); uv_concat(a, b); printf("After concat: %d slots\n\n", a->n_slots); float result[8]; uv_to_float(a, result, quantum); printf("Expected A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]); printf("\n"); printf("Concat A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",result[i]); printf("\n"); printf("Error: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]-result[i]); printf("\n"); uv_free(a); uv_free(b); } void test_chain_concat() { printf("\n=== CHAINED CONCAT (5 additions) ===\n\n"); float quantum = 0.1f; int dim = 4; int K = 64; float vals[] = {1.0, -2.0, 3.0, -0.5}; UVec *acc = uv_new(dim, 512); uv_from_float(acc, vals, K, quantum); printf("Start: "); float tmp[4]; uv_to_float(acc, tmp, quantum); for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]); printf(" (%d slots)\n", acc->n_slots); float expected[] = {1.0, -2.0, 3.0, -0.5}; for (int step = 0; step < 5; step++) { float add_vals[] = {0.5, 0.3, -1.0, 0.7}; UVec *delta = uv_new(dim, K); uv_from_float(delta, add_vals, K, quantum); uv_concat(acc, delta); for (int i = 0; i < 4; i++) expected[i] += add_vals[i]; uv_to_float(acc, tmp, quantum); printf(" +[0.5,0.3,-1.0,0.7] = "); for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]); printf(" (%d slots) expect:", acc->n_slots); for(int i=0;i<4;i++) printf("%6.2f ",expected[i]); /* Check error */ float max_err = 0; for(int i=0;i<4;i++) { float e = fabsf(expected[i] - tmp[i]); if (e > max_err) max_err = e; } printf(" err=%.2f\n", max_err); uv_free(delta); } uv_free(acc); } void test_matmul() { printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n"); int rows = 512, cols = 256; int wK = 32, xK = 32; srand(42); float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); float *xf = (float *)malloc(cols * sizeof(float)); float *y_ref = (float *)calloc(rows, sizeof(float)); /* Small 
values so magnitudes fit in K slots */ for (size_t i = 0; i < (size_t)rows * cols; i++) Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; for (int i = 0; i < cols; i++) xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; /* Find quantum that fits the data range */ float data_max = 0; for (size_t i = 0; i < (size_t)rows * cols; i++) { float a = fabsf(Mf[i]); if (a > data_max) data_max = a; } for (int i = 0; i < cols; i++) { float a = fabsf(xf[i]); if (a > data_max) data_max = a; } float quantum = data_max / wK; printf("Data range: [-%.2f, %.2f]\n", data_max, data_max); printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK); printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK); UMat *M = um_new(rows, cols, wK); UVec *x = uv_new(cols, xK); um_from_float(M, Mf, quantum); uv_from_float(x, xf, xK, quantum); /* Output needs enough K for the matmul result range */ float ymax = 0; for (int i = 0; i < rows; i++) { float a = fabsf(y_ref[i]); if (a > ymax) ymax = a; } int K_out = (int)(ymax / quantum + 1); if (K_out > 4096) K_out = 4096; printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out); UVec *y = uv_new(rows, K_out); struct timespec t0, t1; clock_gettime(CLOCK_MONOTONIC, &t0); uv_matmul(M, x, y, K_out, quantum); clock_gettime(CLOCK_MONOTONIC, &t1); double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6; float *yf = (float *)malloc(rows * sizeof(float)); uv_to_float(y, yf, quantum); float dot = 0, na = 0, nb = 0, noise = 0; for (int i = 0; i < rows; i++) { dot += y_ref[i] * yf[i]; na += y_ref[i] * y_ref[i]; nb += yf[i] * yf[i]; float e = y_ref[i] - yf[i]; noise += e * e; } float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); float snr = 10.0f * log10f(na / (noise + 1e-10f)); printf("\nCosine: %.6f\n", cosine); printf("SNR: %.1f dB\n", snr); printf("Time: %.1f ms\n", ms); printf("\nFirst 10 
values:\n"); printf("%10s %10s %10s\n", "Ref", "Unary", "Error"); for (int i = 0; i < 10; i++) printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]); um_free(M); uv_free(x); uv_free(y); free(Mf); free(xf); free(y_ref); free(yf); } void test_residual_chain() { printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n"); float quantum = 0.05f; int dim = 1024; int K = 128; /* fits values up to 6.4 */ srand(123); float *embed = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < dim; i++) embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f; /* Float reference: accumulate residuals */ float *ref = (float *)malloc(dim * sizeof(float)); memcpy(ref, embed, dim * sizeof(float)); /* Unary: grow via concat */ int total_cap = K + 10 * K; /* room for 10 concat operations */ UVec *residual = uv_new(dim, total_cap); uv_from_float(residual, embed, K, quantum); printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim); printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr"); for (int step = 0; step < 10; step++) { float *delta = (float *)malloc(dim * sizeof(float)); for (int i = 0; i < dim; i++) delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f; /* Float reference */ for (int i = 0; i < dim; i++) ref[i] += delta[i]; /* Unary: concat */ UVec *d = uv_new(dim, K); uv_from_float(d, delta, K, quantum); uv_concat(residual, d); /* Compare */ float *rec = (float *)malloc(dim * sizeof(float)); uv_to_float(residual, rec, quantum); float dot = 0, na = 0, nb = 0, max_err = 0; for (int i = 0; i < dim; i++) { dot += ref[i] * rec[i]; na += ref[i] * ref[i]; nb += rec[i] * rec[i]; float e = fabsf(ref[i] - rec[i]); if (e > max_err) max_err = e; } float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err); uv_free(d); free(delta); free(rec); } uv_free(residual); free(embed); free(ref); } int main() { printf("================================================\n"); printf(" 
PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n"); printf(" Every bit = 1 quantum. Concat = Add.\n"); printf("================================================\n\n"); test_concat_correct(); test_chain_concat(); test_matmul(); test_residual_chain(); printf("\n=== DONE ===\n"); return 0; }