#define _POSIX_C_SOURCE 199309L /* * LOG-UNARY TENSOR LIBRARY * * Native tensor type where values are represented as: * sign (1 bit) + log-magnitude bitplanes * * Plane p is set if |value| >= 2^(p - bias) * With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B) * * ALL arithmetic stays in this representation: * - matmul: AND + weighted_popcount (shift by p+q-2*bias) * - add: bitwise merge with carry propagation * - scale: shift planes up/down * - negate: flip sign bits * * Float conversion only at boundaries (embed lookup, final logits) * * (c) 2026 OpenTransformers Ltd / Scott Bisset */ #include #include #include #include #include #include #include #include /* ============================================================ * LOG-UNARY TENSOR * * For a vector of length `dim`: * sign: uint64[chunks] - 1 bit per element * planes: uint64[n_planes][chunks] - 1 bit per element per plane * chunks = (dim + 63) / 64 * * Plane p is set if |value| >= threshold[p] * threshold[p] = base_scale * 2^(p - bias) * * This is a LOG thermometer code: * value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001) * value=1.0 with bias=10 -> planes 0-10 set * value=64.0 with bias=10 -> planes 0-16 set * * ============================================================ */ typedef struct { uint64_t *sign; /* [chunks] */ uint64_t *planes; /* [n_planes * chunks] contiguous */ int dim; int chunks; int n_planes; int bias; /* log2 offset: threshold[p] = base * 2^(p-bias) */ float base_scale; /* per-tensor scale factor */ } LogUnaryTensor; /* 2D tensor (matrix) - row-major */ typedef struct { uint64_t *sign; /* [rows * chunks_per_row] */ uint64_t *planes; /* [n_planes * rows * chunks_per_row] */ float *row_scales; /* [rows] per-row base scales */ int rows; int cols; int chunks; /* chunks per row = (cols+63)/64 */ int n_planes; int bias; } LogUnaryMatrix; /* ============================================================ * ALLOCATION * ============================================================ */ LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) { LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor)); t->dim = dim; t->n_planes = n_planes; t->bias = bias; t->chunks = (dim + 63) / 64; t->base_scale = 1.0f; t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t)); t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t)); memset(t->sign, 0, t->chunks * sizeof(uint64_t)); memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t)); return t; } LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) { LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix)); m->rows = rows; m->cols = cols; m->n_planes = n_planes; m->bias = bias; m->chunks = (cols + 63) / 64; m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float)); memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f; return m; } void lut_free(LogUnaryTensor *t) { if (t) { free(t->sign); free(t->planes); free(t); } } void lum_free(LogUnaryMatrix *m) { if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); } } /* ============================================================ * FLOAT <-> LOG-UNARY CONVERSION * Only used at boundaries (embedding, final output) * ============================================================ */ void lut_from_float(LogUnaryTensor *t, const float *x) { int dim = t->dim; int np = t->n_planes; int bias = t->bias; int chunks = t->chunks; memset(t->sign, 0, chunks * sizeof(uint64_t)); memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t)); /* Find absmax for base_scale */ float amax = 0.0f; for (int i = 0; i < dim; i++) { float a = fabsf(x[i]); if (a > amax) amax = a; } if (amax == 0.0f) { t->base_scale = 1.0f; return; } /* Set base_scale so that max value uses the highest plane */ /* threshold[np-1] = base_scale * 2^(np-1-bias) should equal amax */ t->base_scale = amax / ldexpf(1.0f, np - 1 - bias); for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); if (x[i] < 0.0f) t->sign[c] |= bit; float mag = fabsf(x[i]); /* Set planes from low to high: plane p set if mag >= base * 2^(p-bias) */ for (int p = 0; p < np; p++) { float thresh = t->base_scale * ldexpf(1.0f, p - bias); if (mag >= thresh) t->planes[(size_t)p * chunks + c] |= bit; else break; /* thermometer: once we stop, all higher planes are 0 */ } } } void lut_to_float(const LogUnaryTensor *t, float *out) { int dim = t->dim; int np = t->n_planes; int bias = t->bias; int chunks = t->chunks; memset(out, 0, dim * sizeof(float)); for (int i = 0; i < dim; i++) { int c = i / 64; uint64_t bit = 1ULL << (i % 64); /* Find highest set plane */ int highest = -1; for (int p = np - 1; p >= 0; p--) { if (t->planes[(size_t)p * chunks + c] & bit) { highest = p; break; } } if (highest < 0) { out[i] = 0.0f; } else { /* Value is approximately base * 2^(highest - bias) */ /* More precise: midpoint between this threshold and next */ float val = t->base_scale * ldexpf(1.0f, highest - bias); if (highest < np - 1) { float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias); val = (val + next) * 0.5f; /* midpoint reconstruction */ } out[i] = (t->sign[c] & bit) ? -val : val; } } } /* Convert float matrix to log-unary matrix (per-row scaling) */ void lum_from_float(LogUnaryMatrix *m, const float *data) { int rows = m->rows, cols = m->cols; int np = m->n_planes, bias = m->bias; int chunks = m->chunks; memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t)); for (int r = 0; r < rows; r++) { const float *row = data + (size_t)r * cols; /* Per-row absmax */ float amax = 0.0f; for (int j = 0; j < cols; j++) { float a = fabsf(row[j]); if (a > amax) amax = a; } if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; } m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias); uint64_t *row_sign = m->sign + (size_t)r * chunks; for (int j = 0; j < cols; j++) { int c = j / 64; uint64_t bit = 1ULL << (j % 64); if (row[j] < 0.0f) row_sign[c] |= bit; float mag = fabsf(row[j]); for (int p = 0; p < np; p++) { float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias); if (mag >= thresh) m->planes[((size_t)p * rows + r) * chunks + c] |= bit; else break; } } } } /* ============================================================ * LOG-UNARY MATMUL: y = M @ x * * Both M (matrix) and x (vector) are log-unary encoded. * * For each output element y[i]: * For each weight plane p, activation plane q: * active = M.planes[p][i] AND x.planes[q] * same = active AND ~(M.sign[i] XOR x.sign) * diff = active AND (M.sign[i] XOR x.sign) * contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias) * * Output is a LogUnaryTensor (converted from integer accumulator) * ============================================================ */ void lum_matvec( const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y_out /* output: log-unary encoded result */ ) { int out_dim = M->rows; int chunks = M->chunks; int wp = M->n_planes; int xp = x->n_planes; int w_bias = M->bias; int x_bias = x->bias; /* Accumulate to float temporarily, then requantize to log-unary. * The accumulator is integer shifts (2^(p+q-2bias)), which * we can do as int64 left-shifts for small exponents. * * For the exponent range we're in (p+q in [0,14] with bias ~4), * net shift is [-8, 6], so we use a fixed-point int64 accumulator * with a base shift to keep everything positive. */ int base_shift = w_bias + x_bias; /* shift to add to make all exponents >= 0 */ /* We'll accumulate as int64 with implicit 2^(-base_shift) factor */ /* Then convert: float_val = acc * row_scale * x_scale * 2^(-base_shift) */ float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); #pragma omp parallel for schedule(dynamic, 32) for (int i = 0; i < out_dim; i++) { const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; long long acc = 0; for (int c = 0; c < chunks; c++) { uint64_t ws = w_sign_row[c]; uint64_t xs = x->sign[c]; uint64_t same = ~(ws ^ xs); uint64_t diff = ws ^ xs; for (int p = 0; p < wp; p++) { uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c]; for (int q = 0; q < xp; q++) { uint64_t x_plane = x->planes[(size_t)q * chunks + c]; uint64_t active = w_plane & x_plane; uint64_t pos = active & same; uint64_t neg = active & diff; int count = __builtin_popcountll(pos) - __builtin_popcountll(neg); /* Weighted by 2^(p + q) relative to base */ int shift = p + q; /* relative to 2^(-base_shift) */ if (count != 0) acc += (long long)count << shift; } } } /* Convert: val = acc * row_scale * x_scale * 2^(-base_shift) */ y_float[i] = (float)acc * M->row_scales[i] * x->base_scale * ldexpf(1.0f, -base_shift); } /* Requantize float result to log-unary */ lut_from_float(y_out, y_float); free(y_float); } /* ============================================================ * LOG-UNARY ELEMENT-WISE ADD: z = a + b * * Dequant both, add as float, requant. * This is O(dim) so not the bottleneck. * Future: direct bitwise add with carry chains. * ============================================================ */ void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) { int dim = a->dim; float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); float *fb = (float *)aligned_alloc(64, dim * sizeof(float)); lut_to_float(a, fa); lut_to_float(b, fb); for (int i = 0; i < dim; i++) fa[i] += fb[i]; lut_from_float(out, fa); free(fa); free(fb); } /* In-place add: a += b (dequant a, add float b, requant) */ void lut_add_float(LogUnaryTensor *a, const float *b) { int dim = a->dim; float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); lut_to_float(a, fa); for (int i = 0; i < dim; i++) fa[i] += b[i]; lut_from_float(a, fa); free(fa); } /* ============================================================ * LOG-UNARY RMSNORM * * Needs float for the sqrt/reciprocal, but O(dim). * Input: log-unary, Output: log-unary * ============================================================ */ void lut_rmsnorm( const LogUnaryTensor *x, const float *weight, /* norm weights stay float (tiny) */ LogUnaryTensor *out, float eps ) { int dim = x->dim; float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); lut_to_float(x, xf); float ss = 0.0f; for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; float rms = 1.0f / sqrtf(ss / dim + eps); for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i]; lut_from_float(out, xf); free(xf); } /* ============================================================ * LOG-UNARY SILU_MUL: out = SiLU(gate) * up * * O(dim), not bottleneck. Dequant, compute, requant. * ============================================================ */ void lut_silu_mul( const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out ) { int dim = gate->dim; float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); lut_to_float(gate, gf); lut_to_float(up, uf); for (int i = 0; i < dim; i++) gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; lut_from_float(out, gf); free(gf); free(uf); } /* ============================================================ * LOG-UNARY ROPE * * O(dim), dequant-compute-requant per head. * ============================================================ */ void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) { /* Dequant the relevant slice, apply RoPE, requant */ float *f = (float *)aligned_alloc(64, head_dim * sizeof(float)); /* Extract slice */ float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); lut_to_float(t, full); memcpy(f, full + start, head_dim * sizeof(float)); for (int i = 0; i < head_dim; i += 2) { float freq = 1.0f / powf(theta, (float)i / head_dim); float angle = offset * freq; float c = cosf(angle), s = sinf(angle); float v0 = f[i], v1 = f[i + 1]; f[i] = v0 * c - v1 * s; f[i + 1] = v0 * s + v1 * c; } memcpy(full + start, f, head_dim * sizeof(float)); lut_from_float(t, full); free(f); free(full); } /* ============================================================ * UTILITY: Get float slice from log-unary tensor * (for attention scores which need float softmax) * ============================================================ */ void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) { float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); lut_to_float(t, full); memcpy(out, full + start, len * sizeof(float)); free(full); } /* ============================================================ * BENCHMARK: measure matvec throughput * ============================================================ */ typedef struct { double total_and_ops; double total_popcount_ops; double wall_time_s; double elements_per_sec; double gops; /* giga-operations per second */ } BenchResult; BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) { LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias); LogUnaryTensor *x = lut_alloc(cols, x_planes, bias); LogUnaryTensor *y = lut_alloc(rows, x_planes, bias); /* Fill with random bits */ for (size_t i = 0; i < (size_t)rows * M->chunks; i++) M->sign[i] = ((uint64_t)rand() << 32) | rand(); for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++) M->planes[i] = ((uint64_t)rand() << 32) | rand(); for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f; for (size_t i = 0; i < (size_t)x->chunks; i++) x->sign[i] = ((uint64_t)rand() << 32) | rand(); for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++) x->planes[i] = ((uint64_t)rand() << 32) | rand(); x->base_scale = 1.0f; /* Warmup */ lum_matvec(M, x, y); struct timespec t0, t1; clock_gettime(CLOCK_MONOTONIC, &t0); for (int i = 0; i < iters; i++) lum_matvec(M, x, y); clock_gettime(CLOCK_MONOTONIC, &t1); double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; int chunks = M->chunks; double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; /* AND + popcount pairs */ BenchResult r; r.wall_time_s = dt / iters; r.total_and_ops = ops_per_call; r.total_popcount_ops = ops_per_call; r.elements_per_sec = (double)rows * cols * iters / dt; r.gops = ops_per_call * iters / dt / 1e9; lum_free(M); lut_free(x); lut_free(y); return r; } /* ============================================================ * ACCURACY TEST: convert float->logunary->float roundtrip * ============================================================ */ typedef struct { float max_error; float mean_error; float cosine_sim; float snr_db; } AccuracyResult; AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) { float *original = (float *)aligned_alloc(64, dim * sizeof(float)); float *recovered = (float *)aligned_alloc(64, dim * sizeof(float)); /* Random normal-ish distribution */ for (int i = 0; i < dim; i++) { float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); } LogUnaryTensor *t = lut_alloc(dim, n_planes, bias); lut_from_float(t, original); lut_to_float(t, recovered); float max_err = 0, sum_err = 0; float dot = 0, na = 0, nb = 0; for (int i = 0; i < dim; i++) { float err = fabsf(original[i] - recovered[i]); if (err > max_err) max_err = err; sum_err += err; dot += original[i] * recovered[i]; na += original[i] * original[i]; nb += recovered[i] * recovered[i]; } float noise_power = 0; for (int i = 0; i < dim; i++) { float e = original[i] - recovered[i]; noise_power += e * e; } AccuracyResult r; r.max_error = max_err; r.mean_error = sum_err / dim; r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f)); lut_free(t); free(original); free(recovered); return r; }