/* * TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC * * Representation: * A value of magnitude M is stored as M consecutive 1-bits. * The number IS the count of ones. * Every bit has weight exactly 1. * * For a vector element quantized to integer range [-K, K]: * sign: 1 bit (0=positive, 1=negative) * magnitude: K bit positions, first |value| are 1, rest are 0 * * Storage layout for a vector of dim D with max magnitude K: * sign: uint64[(D+63)/64] — one sign bit per element * unary: uint64[K * (D+63)/64] — K bitplanes across D elements * Plane p has bit j set iff |element_j| > p * (thermometer = true unary in bitplane form) * * Multiplication: w * x = popcount of ones(w) matched with ones(x) * Since every bit = 1, the dot product is JUST COUNTING. * No weights, no shifts, no corrections. * sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j] * = sum_p sum_q popcount(W_row_plane_p AND X_plane_q) * * YES this uses more memory. A 2560-dim vector with K=32 uses: * 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16) * But the MATH IS EXACT (to quantization level). * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #define _POSIX_C_SOURCE 199309L #include #include #include #include #include #include #include #include /* ============================================================ * TRUE UNARY VECTOR * ============================================================ */ typedef struct { uint64_t *sign; /* [chunks] — 1 bit per element */ uint64_t *unary; /* [K * chunks] — K bitplanes, each bit = weight 1 */ float scale; /* float scale: real_value = sign * count * scale */ int dim; int chunks; /* (dim+63)/64 */ int K; /* max magnitude = number of unary bitplanes */ } TrueUnaryVec; /* TRUE UNARY MATRIX — row-major */ typedef struct { uint64_t *sign; /* [rows * chunks] */ uint64_t *unary; /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */ float *scales; /* [rows] — per-row scale factors */ int rows; int cols; int chunks; /* (cols+63)/64 */ int K; /* max magnitude per element */ } TrueUnaryMat; /* ============================================================ * ALLOCATION * ============================================================ */ TrueUnaryVec* tuv_alloc(int dim, int K) { TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec)); v->dim = dim; v->K = K; v->chunks = (dim + 63) / 64; v->scale = 1.0f; v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t)); memset(v->sign, 0, v->chunks * sizeof(uint64_t)); memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t)); return v; } TrueUnaryMat* tum_alloc(int rows, int cols, int K) { TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat)); m->rows = rows; m->cols = cols; m->K = K; m->chunks = (cols + 63) / 64; m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); m->unary = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); m->scales = (float *)aligned_alloc(64, rows * sizeof(float)); memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); for (int i = 0; i < rows; i++) m->scales[i] = 1.0f; return m; } void tuv_free(TrueUnaryVec *v) { if (v) { free(v->sign); free(v->unary); free(v); } } void tum_free(TrueUnaryMat *m) { if (m) { free(m->sign); free(m->unary); free(m->scales); free(m); } } /* ============================================================ * FLOAT → TRUE UNARY * * Quantize: integer_val = round(float_val / scale * K) * Then store |integer_val| as that many 1-bits. * * For vector: single global scale = absmax / K * For matrix: per-row scale = row_absmax / K * ============================================================ */ void tuv_from_float(TrueUnaryVec *v, const float *x) { int dim = v->dim, K = v->K, chunks = v->chunks; memset(v->sign, 0, chunks * sizeof(uint64_t)); memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t)); float amax = 0.0f; for (int i = 0; i < dim; i++) { float a = fabsf(x[i]); if (a > amax) amax = a; } if (amax == 0.0f) { v->scale = 1.0f; return; } v->scale = amax / K; float inv = K / amax; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); if (x[i] < 0.0f) v->sign[c] |= bit; int mag = (int)(fabsf(x[i]) * inv + 0.5f); if (mag > K) mag = K; /* TRUE UNARY: set planes 0 through mag-1 */ for (int p = 0; p < mag; p++) v->unary[(size_t)p * chunks + c] |= bit; } } void tuv_to_float(const TrueUnaryVec *v, float *out) { int dim = v->dim, K = v->K, chunks = v->chunks; for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); /* Count set planes = magnitude in base-1 */ int mag = 0; for (int p = 0; p < K; p++) { if (v->unary[(size_t)p * chunks + c] & bit) mag++; } float val = (float)mag * v->scale; out[i] = (v->sign[c] & bit) ? -val : val; } } void tum_from_float(TrueUnaryMat *m, const float *data) { int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); for (int r = 0; r < rows; r++) { const float *row = data + (size_t)r * cols; float amax = 0.0f; for (int j = 0; j < cols; j++) { float a = fabsf(row[j]); if (a > amax) amax = a; } if (amax == 0.0f) { m->scales[r] = 1.0f; continue; } m->scales[r] = amax / K; float inv = K / amax; uint64_t *row_sign = m->sign + (size_t)r * chunks; for (int j = 0; j < cols; j++) { int c = j / 64; uint64_t bit = 1ULL << (j % 64); if (row[j] < 0.0f) row_sign[c] |= bit; int mag = (int)(fabsf(row[j]) * inv + 0.5f); if (mag > K) mag = K; for (int p = 0; p < mag; p++) m->unary[((size_t)p * rows + r) * chunks + c] |= bit; } } } /* ============================================================ * TRUE UNARY MATVEC: y = M @ x * * THE CORE OPERATION. * * For each output element y[i]: * For each pair of planes (p from weight, q from activation): * active = w_plane_p[i] AND x_plane_q * same = active AND ~(w_sign[i] XOR x_sign) * diff = active AND (w_sign[i] XOR x_sign) * acc += popcount(same) - popcount(diff) * * EVERY PLANE PAIR HAS WEIGHT = 1. * No shifts. No scaling between planes. No corrections. * The count IS the answer. * * y[i] = acc * w_scale[i] * x_scale * (single float multiply at the very end) * * ============================================================ */ void tum_matvec( const TrueUnaryMat *M, const TrueUnaryVec *x, float *y_out /* float output, requantize externally if needed */ ) { int out_dim = M->rows; int chunks = M->chunks; int wK = M->K; int xK = x->K; #pragma omp parallel for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; long long acc = 0; for (int c = 0; c < chunks; c++) { uint64_t ws = w_sign_row[c]; uint64_t xs = x->sign[c]; uint64_t same = ~(ws ^ xs); uint64_t diff = ws ^ xs; /* * PURE BASE-1: every plane pair contributes weight 1. * acc += popcount(w_plane AND x_plane AND same_sign) * - popcount(w_plane AND x_plane AND diff_sign) */ for (int p = 0; p < wK; p++) { uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c]; for (int q = 0; q < xK; q++) { uint64_t xq = x->unary[(size_t)q * chunks + c]; uint64_t active = wp & xq; acc += __builtin_popcountll(active & same) - __builtin_popcountll(active & diff); } } } /* Single float rescale per output element */ y_out[i] = (float)acc * M->scales[i] * x->scale; } } /* ============================================================ * OPTIMIZED MATVEC: collapse x planes first * * Instead of iterating wK * xK plane pairs per chunk, * precompute per-chunk activation sums: * x_mag_same[c] = sum_q popcount(x_plane_q[c] AND same_sign[c]) * x_mag_diff[c] = sum_q popcount(x_plane_q[c] AND diff_sign[c]) * * Then for each weight plane p: * This doesn't directly simplify because we need AND with wp first. * * ALTERNATIVE: precompute per-element x magnitudes in unary, * then the dot product is just: sum_j w_mag_j * x_mag_j * sign_j * * For now: provide both the naive and a vertically-accumulated variant. * * VERTICAL ACCUMULATE: sum all weight planes into a per-element * count, then multiply by x count. Reduces from O(wK*xK*chunks) * to O((wK+xK)*chunks + dim). * ============================================================ */ void tum_matvec_fast( const TrueUnaryMat *M, const TrueUnaryVec *x, float *y_out ) { int out_dim = M->rows; int cols = M->cols; int chunks = M->chunks; int xK = x->K; /* Step 1: compute x magnitudes (per-element popcount across planes) * x_mag[j] = number of x planes where bit j is set * This is O(xK * chunks) = O(xK * dim / 64) */ int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t)); memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t)); for (int q = 0; q < xK; q++) { const uint64_t *xplane = x->unary + (size_t)q * chunks; for (int c = 0; c < chunks; c++) { uint64_t bits = xplane[c]; while (bits) { int bit = __builtin_ctzll(bits); int j = c * 64 + bit; if (j < cols) x_mag[j]++; bits &= bits - 1; } } } /* Apply sign to x_mag: positive if same sign as... * Actually we need signed x_mag relative to each weight row's sign. * So we keep x_mag unsigned and handle sign per output element. */ /* Step 2: for each output row, compute: * y[i] = sum_j (w_mag[i][j] * x_mag[j]) * sign_agreement * * w_mag[i][j] = number of weight planes where bit j is set * sign_agreement = +1 if w_sign[j] == x_sign[j], else -1 * * We compute w_mag by vertical popcount across weight planes. * This is O(wK * chunks) per row. */ #pragma omp parallel { int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t)); #pragma omp for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t)); /* Vertical popcount: count set planes per element */ for (int p = 0; p < M->K; p++) { const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks; for (int c = 0; c < chunks; c++) { uint64_t bits = wplane[c]; while (bits) { int bit = __builtin_ctzll(bits); int j = c * 64 + bit; if (j < cols) w_mag[j]++; bits &= bits - 1; } } } /* Dot product with sign */ const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; long long acc = 0; for (int j = 0; j < cols; j++) { int c = j / 64; uint64_t bit = 1ULL << (j % 64); int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit); int product = (int)w_mag[j] * (int)x_mag[j]; acc += same_sign ? product : -product; } y_out[i] = (float)acc * M->scales[i] * x->scale; } free(w_mag); } free(x_mag); } /* ============================================================ * BENCHMARK + ACCURACY * ============================================================ */ typedef struct { float cosine; float snr_db; float max_rel_err; double ms_naive; double ms_fast; double gops_naive; double gops_fast; } TestResult; TestResult tum_test(int rows, int cols, int wK, int xK, int iters) { TestResult r = {0}; srand(42); /* Random float matrix and vector (normal distribution) */ float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); float *xf = (float *)malloc(cols * sizeof(float)); float *y_ref = (float *)calloc(rows, sizeof(float)); float *y_naive = (float *)malloc(rows * sizeof(float)); float *y_fast = (float *)malloc(rows * sizeof(float)); for (size_t i = 0; i < (size_t)rows * cols; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } for (int i = 0; i < cols; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } /* Float reference */ for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; /* Convert to true unary */ TrueUnaryMat *M = tum_alloc(rows, cols, wK); TrueUnaryVec *x = tuv_alloc(cols, xK); tum_from_float(M, Mf); tuv_from_float(x, xf); /* Naive matvec */ struct timespec t0, t1; tum_matvec(M, x, y_naive); /* warmup */ clock_gettime(CLOCK_MONOTONIC, &t0); for (int i = 0; i < iters; i++) tum_matvec(M, x, y_naive); clock_gettime(CLOCK_MONOTONIC, &t1); r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters; /* Fast matvec */ tum_matvec_fast(M, x, y_fast); /* warmup */ clock_gettime(CLOCK_MONOTONIC, &t0); for (int i = 0; i < iters; i++) tum_matvec_fast(M, x, y_fast); clock_gettime(CLOCK_MONOTONIC, &t1); r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters; /* Accuracy vs float reference */ float dot = 0, na = 0, nb = 0, max_re = 0; for (int i = 0; i < rows; i++) { dot += y_ref[i] * y_naive[i]; na += y_ref[i] * y_ref[i]; nb += y_naive[i] * y_naive[i]; float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f); if (re > max_re) max_re = re; } r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); float noise = 0; for (int i = 0; i < rows; i++) { float e = y_ref[i] - y_naive[i]; noise += e * e; } r.snr_db = 10.0f * log10f(na / (noise + 1e-10f)); r.max_rel_err = max_re; /* Verify naive == fast */ float fast_err = 0; for (int i = 0; i < rows; i++) { float e = fabsf(y_naive[i] - y_fast[i]); if (e > fast_err) fast_err = e; } if (fast_err > 0.01f) printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err); double ops = 2.0 * rows * cols; r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6); r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6); tum_free(M); tuv_free(x); free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast); return r; } /* ============================================================ * MAIN: sweep K values, show accuracy + speed tradeoff * ============================================================ */ int main() { printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n"); printf("Every bit has weight 1. Value = count of ones.\n"); printf("Matmul = AND + popcount, no weighting.\n\n"); /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560) */ int rows = 4096, cols = 2560; printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols); printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n", "wK", "xK", "Cosine", "SNR_dB", "MaxRelE", "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory"); struct { int wK; int xK; } configs[] = { {8, 4}, {8, 8}, {16, 8}, {16, 16}, {32, 8}, {32, 16}, {32, 32}, {64, 16}, {64, 32}, }; int n = sizeof(configs) / sizeof(configs[0]); for (int c = 0; c < n; c++) { int wK = configs[c].wK; int xK = configs[c].xK; int iters = (wK <= 16 && xK <= 16) ? 3 : 1; TestResult r = tum_test(rows, cols, wK, xK, iters); /* Memory for this layer's weights */ size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8; size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8; size_t scale_bytes = rows * 4; double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6; printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n", wK, xK, r.cosine, r.snr_db, r.max_rel_err, r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb); } /* Show first 5 values for K=32,16 case */ printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n"); { int sr = 512, sc = 2560; srand(42); float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float)); float *xf = (float *)malloc(sc * sizeof(float)); float *y_ref = (float *)calloc(sr, sizeof(float)); float *y_unary = (float *)malloc(sr * sizeof(float)); for (size_t i = 0; i < (size_t)sr * sc; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } for (int i = 0; i < sc; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } for (int i = 0; i < sr; i++) for (int j = 0; j < sc; j++) y_ref[i] += Mf[(size_t)i * sc + j] * xf[j]; TrueUnaryMat *M = tum_alloc(sr, sc, 32); TrueUnaryVec *x = tuv_alloc(sc, 16); tum_from_float(M, Mf); tuv_from_float(x, xf); tum_matvec(M, x, y_unary); printf("%8s %8s %8s\n", "Ref", "Unary", "Error"); for (int i = 0; i < 10; i++) printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]); tum_free(M); tuv_free(x); free(Mf); free(xf); free(y_ref); free(y_unary); } printf("\n=== DONE ===\n"); return 0; }