/*
 * unary-quantization-research / logunary_tensor.c
 * Uploaded by OpenTransformer via the upload-large-folder tool
 * (rev 19ed98b, verified)
 */
#define _POSIX_C_SOURCE 199309L
/*
* LOG-UNARY TENSOR LIBRARY
*
* Native tensor type where values are represented as:
* sign (1 bit) + log-magnitude bitplanes
*
* Plane p is set if |value| >= 2^(p - bias)
* With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B)
*
* ALL arithmetic stays in this representation:
* - matmul: AND + weighted_popcount (shift by p+q-2*bias)
* - add: bitwise merge with carry propagation
* - scale: shift planes up/down
* - negate: flip sign bits
*
* Float conversion only at boundaries (embed lookup, final logits)
*
* (c) 2026 OpenTransformers Ltd / Scott Bisset
*/
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
/* ============================================================
* LOG-UNARY TENSOR
*
* For a vector of length `dim`:
* sign: uint64[chunks] - 1 bit per element
* planes: uint64[n_planes][chunks] - 1 bit per element per plane
* chunks = (dim + 63) / 64
*
* Plane p is set if |value| >= threshold[p]
* threshold[p] = base_scale * 2^(p - bias)
*
* This is a LOG thermometer code:
* value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001)
* value=1.0 with bias=10 -> planes 0-10 set
* value=64.0 with bias=10 -> planes 0-16 set
*
* ============================================================ */
typedef struct {
uint64_t *sign; /* [chunks] sign bitmap: bit i set => element i is negative */
uint64_t *planes; /* [n_planes * chunks] contiguous; plane p occupies words [p*chunks, (p+1)*chunks) */
int dim; /* logical vector length in elements */
int chunks; /* 64-bit words per plane = (dim + 63) / 64 */
int n_planes; /* number of log-magnitude bitplanes */
int bias; /* log2 offset: threshold[p] = base * 2^(p-bias) */
float base_scale; /* per-tensor scale factor applied to every threshold */
} LogUnaryTensor;
/* 2D tensor (matrix) - row-major */
typedef struct {
uint64_t *sign; /* [rows * chunks_per_row] sign bitmap, row-major */
uint64_t *planes; /* [n_planes * rows * chunks_per_row]; plane-major, then row, then chunk */
float *row_scales; /* [rows] per-row base scales (matrix analogue of base_scale) */
int rows; /* number of matrix rows */
int cols; /* number of matrix columns (elements per row) */
int chunks; /* chunks per row = (cols+63)/64 */
int n_planes; /* number of log-magnitude bitplanes */
int bias; /* log2 offset shared by all rows */
} LogUnaryMatrix;
/* ============================================================
* ALLOCATION
* ============================================================ */
/*
 * Allocate a zeroed LogUnaryTensor of `dim` elements with `n_planes`
 * bitplanes and log2 offset `bias`.  base_scale starts at 1.0.
 *
 * Returns NULL on allocation failure (the original crashed instead).
 *
 * NOTE: aligned_alloc requires the size to be an integral multiple of the
 * alignment (C11 7.22.3.1) -- passing chunks*8 directly, as the original
 * did, is undefined behavior whenever it is not a multiple of 64.  We round
 * each request up to a 64-byte boundary.
 */
LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) {
    LogUnaryTensor *t = calloc(1, sizeof *t);
    if (!t) return NULL;
    t->dim = dim;
    t->n_planes = n_planes;
    t->bias = bias;
    t->chunks = (dim + 63) / 64;
    t->base_scale = 1.0f;
    size_t sign_bytes  = ((size_t)t->chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    size_t plane_bytes = ((size_t)n_planes * t->chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    t->sign   = aligned_alloc(64, sign_bytes);
    t->planes = aligned_alloc(64, plane_bytes);
    if (!t->sign || !t->planes) {
        free(t->sign);
        free(t->planes);
        free(t);
        return NULL;
    }
    memset(t->sign, 0, sign_bytes);
    memset(t->planes, 0, plane_bytes);
    return t;
}
/*
 * Allocate a zeroed LogUnaryMatrix of rows x cols with `n_planes` bitplanes
 * and log2 offset `bias`.  All row_scales start at 1.0.
 *
 * Returns NULL on allocation failure (the original crashed instead).
 *
 * NOTE: aligned_alloc requires the size to be an integral multiple of the
 * alignment (C11 7.22.3.1).  The original passed rows*sizeof(float), which
 * is almost never a multiple of 64 -- undefined behavior.  All requests are
 * rounded up to a 64-byte boundary here.
 */
LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) {
    LogUnaryMatrix *m = calloc(1, sizeof *m);
    if (!m) return NULL;
    m->rows = rows;
    m->cols = cols;
    m->n_planes = n_planes;
    m->bias = bias;
    m->chunks = (cols + 63) / 64;
    size_t sign_bytes  = ((size_t)rows * m->chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    size_t plane_bytes = ((size_t)n_planes * rows * m->chunks * sizeof(uint64_t) + 63) & ~(size_t)63;
    size_t scale_bytes = ((size_t)rows * sizeof(float) + 63) & ~(size_t)63;
    m->sign       = aligned_alloc(64, sign_bytes);
    m->planes     = aligned_alloc(64, plane_bytes);
    m->row_scales = aligned_alloc(64, scale_bytes);
    if (!m->sign || !m->planes || !m->row_scales) {
        free(m->sign);
        free(m->planes);
        free(m->row_scales);
        free(m);
        return NULL;
    }
    memset(m->sign, 0, sign_bytes);
    memset(m->planes, 0, plane_bytes);
    for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f;
    return m;
}
/* Release a tensor created by lut_alloc.  NULL is a safe no-op. */
void lut_free(LogUnaryTensor *t) {
    if (!t)
        return;
    free(t->planes);
    free(t->sign);
    free(t);
}
/* Release a matrix created by lum_alloc.  NULL is a safe no-op. */
void lum_free(LogUnaryMatrix *m) {
    if (!m)
        return;
    free(m->row_scales);
    free(m->planes);
    free(m->sign);
    free(m);
}
/* ============================================================
* FLOAT <-> LOG-UNARY CONVERSION
* Only used at boundaries (embedding, final output)
* ============================================================ */
/*
 * Encode a float vector into log-unary form.
 *
 * The per-tensor base_scale is chosen from the absolute maximum of x so
 * that the largest magnitude lands exactly on the top plane:
 *   base_scale * 2^(n_planes-1-bias) == absmax.
 * Plane p of element i is set iff |x[i]| >= base_scale * 2^(p-bias);
 * because thresholds grow monotonically this is a thermometer code, so
 * the plane loop stops at the first unmet threshold.
 *
 * An all-zero input leaves every bitmap clear and resets base_scale to 1.
 */
void lut_from_float(LogUnaryTensor *t, const float *x) {
    const int n      = t->dim;
    const int planes = t->n_planes;
    const int b      = t->bias;
    const int nc     = t->chunks;
    memset(t->sign, 0, (size_t)nc * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)planes * nc * sizeof(uint64_t));
    float peak = 0.0f;
    for (int i = 0; i < n; i++) {
        float a = fabsf(x[i]);
        if (a > peak) peak = a;
    }
    if (peak == 0.0f) {
        t->base_scale = 1.0f;
        return;
    }
    /* Anchor the top plane's threshold at the absmax. */
    t->base_scale = peak / ldexpf(1.0f, planes - 1 - b);
    for (int i = 0; i < n; i++) {
        const int word      = i >> 6;
        const uint64_t mask = 1ULL << (i & 63);
        if (x[i] < 0.0f)
            t->sign[word] |= mask;
        const float mag = fabsf(x[i]);
        for (int p = 0; p < planes; p++) {
            if (mag < t->base_scale * ldexpf(1.0f, p - b))
                break; /* thermometer property: all higher planes stay 0 */
            t->planes[(size_t)p * nc + word] |= mask;
        }
    }
}
/*
 * Decode a log-unary tensor back to floats.
 *
 * For each element the highest set plane `top` determines the magnitude
 * bucket [base*2^(top-bias), base*2^(top+1-bias)); the reconstruction is
 * the bucket midpoint, except on the topmost plane where the lower edge is
 * used.  Elements with no planes set decode to 0.  Every out[i] is written.
 */
void lut_to_float(const LogUnaryTensor *t, float *out) {
    const int n      = t->dim;
    const int planes = t->n_planes;
    const int b      = t->bias;
    const int nc     = t->chunks;
    for (int i = 0; i < n; i++) {
        const int word      = i >> 6;
        const uint64_t mask = 1ULL << (i & 63);
        /* Scan downward for the highest set plane. */
        int top = -1;
        for (int p = planes - 1; p >= 0; p--) {
            if (t->planes[(size_t)p * nc + word] & mask) {
                top = p;
                break;
            }
        }
        if (top < 0) {
            out[i] = 0.0f;
            continue;
        }
        float mag = t->base_scale * ldexpf(1.0f, top - b);
        if (top + 1 < planes) {
            /* Midpoint between this threshold and the next one up. */
            float upper = t->base_scale * ldexpf(1.0f, top + 1 - b);
            mag = (mag + upper) * 0.5f;
        }
        out[i] = (t->sign[word] & mask) ? -mag : mag;
    }
}
/* Convert float matrix to log-unary matrix (per-row scaling) */
/*
 * Encode a row-major float matrix into log-unary form with per-row scaling.
 * Each row's scale anchors that row's absmax on the top plane (same scheme
 * as lut_from_float); an all-zero row gets scale 1.0 and empty bitmaps.
 */
void lum_from_float(LogUnaryMatrix *m, const float *data) {
    const int nr     = m->rows;
    const int nc     = m->cols;
    const int planes = m->n_planes;
    const int b      = m->bias;
    const int cpr    = m->chunks; /* 64-bit words per row */
    memset(m->sign, 0, (size_t)nr * cpr * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)planes * nr * cpr * sizeof(uint64_t));
    for (int r = 0; r < nr; r++) {
        const float *src = data + (size_t)r * nc;
        /* Row absmax drives the per-row scale. */
        float peak = 0.0f;
        for (int j = 0; j < nc; j++) {
            float a = fabsf(src[j]);
            if (a > peak) peak = a;
        }
        if (peak == 0.0f) {
            m->row_scales[r] = 1.0f;
            continue;
        }
        m->row_scales[r] = peak / ldexpf(1.0f, planes - 1 - b);
        uint64_t *sbits = m->sign + (size_t)r * cpr;
        for (int j = 0; j < nc; j++) {
            const int word      = j >> 6;
            const uint64_t mask = 1ULL << (j & 63);
            if (src[j] < 0.0f)
                sbits[word] |= mask;
            const float mag = fabsf(src[j]);
            for (int p = 0; p < planes; p++) {
                if (mag < m->row_scales[r] * ldexpf(1.0f, p - b))
                    break; /* thermometer code: stop at first unmet threshold */
                m->planes[((size_t)p * nr + r) * cpr + word] |= mask;
            }
        }
    }
}
/* ============================================================
* LOG-UNARY MATMUL: y = M @ x
*
* Both M (matrix) and x (vector) are log-unary encoded.
*
* For each output element y[i]:
* For each weight plane p, activation plane q:
* active = M.planes[p][i] AND x.planes[q]
* same = active AND ~(M.sign[i] XOR x.sign)
* diff = active AND (M.sign[i] XOR x.sign)
* contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias)
*
* Output is a LogUnaryTensor (converted from integer accumulator)
* ============================================================ */
/*
 * Log-unary matvec: y_out = requantize(M @ x).
 *
 * For each output row i the contribution of weight plane p and activation
 * plane q over one 64-element chunk is
 *     (popcount(active & same_sign) - popcount(active & diff_sign)) * 2^(p+q)
 * relative to an implicit 2^-(M->bias + x->bias) factor that is folded in
 * once during the float conversion:
 *     y[i] = acc * row_scale[i] * x->base_scale * 2^-(base_shift).
 *
 * Fixes over the original:
 *  - `(long long)count << shift` left-shifted a possibly NEGATIVE count,
 *    which is undefined behavior; we multiply by (1LL << shift) instead
 *    (the shifted operand is now always positive).
 *  - the float scratch buffer used aligned_alloc(64, out_dim*4), UB when
 *    the size is not a multiple of the alignment (C11 7.22.3.1); plain
 *    malloc is sufficient here.
 *
 * Assumes p+q stays well below 63 (see header comment: p+q in [0,14]).
 * On OOM the function returns without touching y_out.
 */
void lum_matvec(
    const LogUnaryMatrix *M,
    const LogUnaryTensor *x,
    LogUnaryTensor *y_out /* output: log-unary encoded result */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wp = M->n_planes;
    int xp = x->n_planes;
    /* Combined bias, removed once at conversion time so the integer
     * accumulator only ever sees non-negative exponents p+q. */
    int base_shift = M->bias + x->bias;
    float *y_float = malloc((size_t)out_dim * sizeof(float));
    if (!y_float)
        return; /* OOM: best-effort, leave y_out unchanged */
#pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;
        for (int c = 0; c < chunks; c++) {
            uint64_t diff = w_sign_row[c] ^ x->sign[c]; /* signs disagree */
            uint64_t same = ~diff;                      /* signs agree    */
            for (int p = 0; p < wp; p++) {
                uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c];
                if (!w_plane)
                    continue; /* empty plane word contributes nothing */
                for (int q = 0; q < xp; q++) {
                    uint64_t active = w_plane & x->planes[(size_t)q * chunks + c];
                    int count = __builtin_popcountll(active & same)
                              - __builtin_popcountll(active & diff);
                    /* Multiply, don't shift: count may be negative and
                     * left-shifting a negative value is UB. */
                    acc += (long long)count * (1LL << (p + q));
                }
            }
        }
        /* val = acc * row_scale * x_scale * 2^(-base_shift) */
        y_float[i] = (float)acc * M->row_scales[i] * x->base_scale
                   * ldexpf(1.0f, -base_shift);
    }
    /* Requantize the float result to log-unary. */
    lut_from_float(y_out, y_float);
    free(y_float);
}
/* ============================================================
* LOG-UNARY ELEMENT-WISE ADD: z = a + b
*
* Dequant both, add as float, requant.
* This is O(dim) so not the bottleneck.
* Future: direct bitwise add with carry chains.
* ============================================================ */
/*
 * Element-wise add: out = a + b (a and b must share dim).
 * Dequantizes both operands, adds in float, requantizes.  O(dim), so not a
 * bottleneck; a direct bitwise carry-chain add remains future work.
 *
 * Uses plain malloc: the original aligned_alloc(64, dim*4) is UB when the
 * size is not a multiple of the alignment (C11 7.22.3.1).  On OOM the
 * function returns without touching `out`.
 */
void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) {
    int dim = a->dim;
    float *fa = malloc((size_t)dim * sizeof(float));
    float *fb = malloc((size_t)dim * sizeof(float));
    if (!fa || !fb) {
        free(fa);
        free(fb);
        return; /* OOM: best-effort, leave out unchanged */
    }
    lut_to_float(a, fa);
    lut_to_float(b, fb);
    for (int i = 0; i < dim; i++) fa[i] += fb[i];
    lut_from_float(out, fa);
    free(fa);
    free(fb);
}
/* In-place add: a += b (dequant a, add float b, requant) */
void lut_add_float(LogUnaryTensor *a, const float *b) {
int dim = a->dim;
float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
lut_to_float(a, fa);
for (int i = 0; i < dim; i++) fa[i] += b[i];
lut_from_float(a, fa);
free(fa);
}
/* ============================================================
* LOG-UNARY RMSNORM
*
* Needs float for the sqrt/reciprocal, but O(dim).
* Input: log-unary, Output: log-unary
* ============================================================ */
/*
 * RMSNorm on a log-unary tensor: out = x / rms(x) * weight, where
 * rms(x) = sqrt(mean(x^2) + eps).  Dequantizes, normalizes in float,
 * requantizes.  Norm weights stay float (they're tiny).
 *
 * Uses plain malloc: the original aligned_alloc(64, dim*4) is UB when the
 * size is not a multiple of the alignment (C11 7.22.3.1).  On OOM the
 * function returns without touching `out`.
 */
void lut_rmsnorm(
    const LogUnaryTensor *x,
    const float *weight, /* norm weights stay float (tiny) */
    LogUnaryTensor *out,
    float eps
) {
    int dim = x->dim;
    float *xf = malloc((size_t)dim * sizeof(float));
    if (!xf)
        return; /* OOM: best-effort, leave out unchanged */
    lut_to_float(x, xf);
    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);
    for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i];
    lut_from_float(out, xf);
    free(xf);
}
/* ============================================================
* LOG-UNARY SILU_MUL: out = SiLU(gate) * up
*
* O(dim), not bottleneck. Dequant, compute, requant.
* ============================================================ */
/*
 * Gated activation: out = SiLU(gate) * up, with SiLU(v) = v * sigmoid(v).
 * O(dim) dequant-compute-requant; not a bottleneck.
 *
 * Uses plain malloc: the original aligned_alloc(64, dim*4) is UB when the
 * size is not a multiple of the alignment (C11 7.22.3.1).  On OOM the
 * function returns without touching `out`.
 */
void lut_silu_mul(
    const LogUnaryTensor *gate,
    const LogUnaryTensor *up,
    LogUnaryTensor *out
) {
    int dim = gate->dim;
    float *gf = malloc((size_t)dim * sizeof(float));
    float *uf = malloc((size_t)dim * sizeof(float));
    if (!gf || !uf) {
        free(gf);
        free(uf);
        return; /* OOM: best-effort, leave out unchanged */
    }
    lut_to_float(gate, gf);
    lut_to_float(up, uf);
    for (int i = 0; i < dim; i++)
        gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
    lut_from_float(out, gf);
    free(gf);
    free(uf);
}
/* ============================================================
* LOG-UNARY ROPE
*
* O(dim), dequant-compute-requant per head.
* ============================================================ */
/*
 * Apply rotary position embedding to one head slice of t:
 * rotates consecutive pairs in [start, start+head_dim) by angle
 * offset / theta^(i/head_dim).  Dequantizes the whole vector, rotates the
 * slice in place, requantizes.
 *
 * Changes vs. original:
 *  - plain malloc instead of aligned_alloc(64, size): the original sizes
 *    need not be multiples of the alignment, which is UB (C11 7.22.3.1);
 *  - the separate `f` staging buffer and its two memcpys were redundant --
 *    the rotation now operates directly on full + start;
 *  - OOM returns with t unchanged instead of crashing.
 */
void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) {
    float *full = malloc((size_t)t->dim * sizeof(float));
    if (!full)
        return; /* OOM: best-effort, leave t unchanged */
    lut_to_float(t, full);
    float *f = full + start; /* head slice, rotated in place */
    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / head_dim);
        float angle = offset * freq;
        float c = cosf(angle), s = sinf(angle);
        float v0 = f[i], v1 = f[i + 1];
        f[i] = v0 * c - v1 * s;
        f[i + 1] = v0 * s + v1 * c;
    }
    lut_from_float(t, full);
    free(full);
}
/* ============================================================
* UTILITY: Get float slice from log-unary tensor
* (for attention scores which need float softmax)
* ============================================================ */
/*
 * Decode elements [start, start+len) of t into out (float).
 * Used for attention scores, which need a float softmax.
 *
 * Uses plain malloc: the original aligned_alloc(64, dim*4) is UB when the
 * size is not a multiple of the alignment (C11 7.22.3.1).  On OOM the
 * function returns without writing to out.
 */
void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) {
    float *full = malloc((size_t)t->dim * sizeof(float));
    if (!full)
        return; /* OOM: best-effort, out untouched */
    lut_to_float(t, full);
    memcpy(out, full + start, (size_t)len * sizeof(float));
    free(full);
}
/* ============================================================
* BENCHMARK: measure matvec throughput
* ============================================================ */
typedef struct {
double total_and_ops; /* AND+popcount op pairs per single matvec call */
double total_popcount_ops; /* same count as total_and_ops (one popcount per AND) */
double wall_time_s; /* mean wall time per matvec call, seconds */
double elements_per_sec; /* rows*cols throughput across all iterations */
double gops; /* giga-operations per second */
} BenchResult;
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);
/* Fill with random bits */
for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
M->sign[i] = ((uint64_t)rand() << 32) | rand();
for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
M->planes[i] = ((uint64_t)rand() << 32) | rand();
for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
for (size_t i = 0; i < (size_t)x->chunks; i++)
x->sign[i] = ((uint64_t)rand() << 32) | rand();
for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
x->planes[i] = ((uint64_t)rand() << 32) | rand();
x->base_scale = 1.0f;
/* Warmup */
lum_matvec(M, x, y);
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
for (int i = 0; i < iters; i++)
lum_matvec(M, x, y);
clock_gettime(CLOCK_MONOTONIC, &t1);
double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
int chunks = M->chunks;
double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; /* AND + popcount pairs */
BenchResult r;
r.wall_time_s = dt / iters;
r.total_and_ops = ops_per_call;
r.total_popcount_ops = ops_per_call;
r.elements_per_sec = (double)rows * cols * iters / dt;
r.gops = ops_per_call * iters / dt / 1e9;
lum_free(M); lut_free(x); lut_free(y);
return r;
}
/* ============================================================
* ACCURACY TEST: convert float->logunary->float roundtrip
* ============================================================ */
typedef struct {
float max_error; /* max |original - recovered| over the vector */
float mean_error; /* mean absolute roundtrip error */
float cosine_sim; /* cosine similarity between original and recovered */
float snr_db; /* signal-to-noise ratio of the roundtrip, in dB */
} AccuracyResult;
/*
 * Roundtrip accuracy test: draw `dim` Box-Muller standard normals, encode
 * to log-unary (n_planes, bias), decode, and report max/mean error, cosine
 * similarity and SNR of recovered vs. original.
 *
 * Fixes over the original:
 *  - `rand() + 1` is signed-overflow UB when rand() returns INT_MAX
 *    (RAND_MAX may equal INT_MAX); the +1 is now done in double;
 *  - scratch buffers use plain malloc instead of aligned_alloc with a
 *    size that need not be a multiple of the alignment (UB, C11 7.22.3.1);
 *  - allocation failure returns a zeroed result instead of crashing.
 */
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) {
    AccuracyResult r = {0};
    float *original = malloc((size_t)dim * sizeof(float));
    float *recovered = malloc((size_t)dim * sizeof(float));
    if (!original || !recovered) {
        free(original);
        free(recovered);
        return r;
    }
    /* Box-Muller: u1, u2 uniform on (0, 1], so logf(u1) is finite. */
    for (int i = 0; i < dim; i++) {
        float u1 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
        float u2 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
        original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    LogUnaryTensor *t = lut_alloc(dim, n_planes, bias);
    if (!t) {
        free(original);
        free(recovered);
        return r;
    }
    lut_from_float(t, original);
    lut_to_float(t, recovered);
    float max_err = 0, sum_err = 0;
    float dot = 0, na = 0, nb = 0;
    float noise_power = 0;
    for (int i = 0; i < dim; i++) {
        float e = original[i] - recovered[i];
        float ae = fabsf(e);
        if (ae > max_err) max_err = ae;
        sum_err += ae;
        dot += original[i] * recovered[i];
        na += original[i] * original[i];
        nb += recovered[i] * recovered[i];
        noise_power += e * e;
    }
    r.max_error = max_err;
    r.mean_error = sum_err / dim;
    r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f));
    lut_free(t);
    free(original);
    free(recovered);
    return r;
}