/* Uploaded via upload-large-folder tool — commit 19ed98b (verified). */
/*
* TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
*
* Representation:
* A value of magnitude M is stored as M consecutive 1-bits.
* The number IS the count of ones.
* Every bit has weight exactly 1.
*
* For a vector element quantized to integer range [-K, K]:
* sign: 1 bit (0=positive, 1=negative)
* magnitude: K bit positions, first |value| are 1, rest are 0
*
* Storage layout for a vector of dim D with max magnitude K:
* sign: uint64[(D+63)/64] — one sign bit per element
* unary: uint64[K * (D+63)/64] — K bitplanes across D elements
* Plane p has bit j set iff |element_j| > p
* (thermometer = true unary in bitplane form)
*
* Multiplication: w * x = popcount of ones(w) matched with ones(x)
* Since every bit = 1, the dot product is JUST COUNTING.
* No weights, no shifts, no corrections.
* sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
* = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
*
* YES this uses more memory. A 2560-dim vector with K=32 uses:
* 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
* But the MATH IS EXACT (to quantization level).
*
* (c) 2026 OpenTransformers Ltd / Scott Bisset
*/
#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
/* ============================================================
* TRUE UNARY VECTOR
* ============================================================ */
/* A quantized vector in true-unary (thermometer) form. */
typedef struct {
    uint64_t *sign;  /* [chunks] — one sign bit per element (1 = negative) */
    uint64_t *unary; /* [K * chunks] — K bitplanes; plane p has bit j set iff |elem_j| > p */
    float scale;     /* dequant factor: real_value = (sign ? -1 : +1) * count * scale */
    int dim;         /* logical element count */
    int chunks;      /* 64-bit words per bitplane = (dim+63)/64 */
    int K;           /* max magnitude = number of unary bitplanes */
} TrueUnaryVec;
/* TRUE UNARY MATRIX — row-major, with an independent scale per row. */
typedef struct {
    uint64_t *sign;  /* [rows * chunks] — row i's sign bits start at i*chunks */
    uint64_t *unary; /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */
    float *scales;   /* [rows] — per-row scale factors */
    int rows;        /* output dimension of the matvec */
    int cols;        /* input dimension (must match the vector's dim) */
    int chunks;      /* 64-bit words per row per plane = (cols+63)/64 */
    int K;           /* max magnitude per element */
} TrueUnaryMat;
/* ============================================================
* ALLOCATION
* ============================================================ */
/*
 * Allocate a zero-initialized unary vector of `dim` elements with `K`
 * magnitude bitplanes. scale starts at 1.0f.
 *
 * Returns NULL on allocation failure (any partial allocation is released).
 * NOTE: C11 aligned_alloc requires the size to be a multiple of the
 * alignment, so each byte count is rounded up to a multiple of 64 —
 * the original passed e.g. chunks*8 bytes, which is UB for chunks < 8.
 */
TrueUnaryVec* tuv_alloc(int dim, int K) {
    TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
    if (!v) return NULL;
    v->dim = dim;
    v->K = K;
    v->chunks = (dim + 63) / 64;
    v->scale = 1.0f;
    size_t sign_bytes  = (((size_t)v->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t unary_bytes = (((size_t)K * v->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    v->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    v->unary = (uint64_t *)aligned_alloc(64, unary_bytes);
    if (!v->sign || !v->unary) {
        free(v->sign);
        free(v->unary);
        free(v);
        return NULL;
    }
    memset(v->sign, 0, sign_bytes);
    memset(v->unary, 0, unary_bytes);
    return v;
}
/*
 * Allocate a zero-initialized true-unary matrix (rows x cols) with K
 * magnitude bitplanes; every per-row scale starts at 1.0f.
 *
 * Returns NULL on allocation failure (any partial allocation is released).
 * NOTE: C11 aligned_alloc requires the size to be a multiple of the
 * alignment, so each byte count is rounded up to a multiple of 64 —
 * the original passed e.g. rows*sizeof(float), which is UB unless
 * rows happens to be a multiple of 16.
 */
TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
    TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
    if (!m) return NULL;
    m->rows = rows;
    m->cols = cols;
    m->K = K;
    m->chunks = (cols + 63) / 64;
    size_t sign_bytes  = (((size_t)rows * m->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t unary_bytes = (((size_t)K * rows * m->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t scale_bytes = (((size_t)rows * sizeof(float)) + 63) & ~(size_t)63;
    m->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    m->unary = (uint64_t *)aligned_alloc(64, unary_bytes);
    m->scales = (float *)aligned_alloc(64, scale_bytes);
    if (!m->sign || !m->unary || !m->scales) {
        free(m->sign);
        free(m->unary);
        free(m->scales);
        free(m);
        return NULL;
    }
    memset(m->sign, 0, sign_bytes);
    memset(m->unary, 0, unary_bytes);
    for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
    return m;
}
/* Release a vector created by tuv_alloc(); passing NULL is a no-op. */
void tuv_free(TrueUnaryVec *v) {
    if (v == NULL)
        return;
    free(v->sign);
    free(v->unary);
    free(v);
}
/* Release a matrix created by tum_alloc(); passing NULL is a no-op. */
void tum_free(TrueUnaryMat *m) {
    if (m == NULL)
        return;
    free(m->sign);
    free(m->unary);
    free(m->scales);
    free(m);
}
/* ============================================================
* FLOAT → TRUE UNARY
*
* Quantize: integer_val = round(float_val / scale * K)
* Then store |integer_val| as that many 1-bits.
*
* For vector: single global scale = absmax / K
* For matrix: per-row scale = row_absmax / K
* ============================================================ */
/*
 * Quantize a float vector into thermometer (true unary) form.
 *
 * A single global scale = absmax / K is used; element j stores
 * round(|x_j| / scale) as that many consecutive set bitplanes plus a
 * sign bit. An all-zero input leaves the planes clear with scale 1.0f.
 */
void tuv_from_float(TrueUnaryVec *v, const float *x) {
    const int dim = v->dim;
    const int K = v->K;
    const int chunks = v->chunks;

    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
    memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));

    /* Pass 1: find the absolute maximum. */
    float peak = 0.0f;
    for (int j = 0; j < dim; j++) {
        const float a = fabsf(x[j]);
        if (a > peak)
            peak = a;
    }
    if (peak == 0.0f) {
        v->scale = 1.0f;
        return;
    }
    v->scale = peak / K;
    const float to_units = K / peak;

    /* Pass 2: sign bit plus thermometer planes 0..level-1 per element. */
    for (int j = 0; j < dim; j++) {
        const int word = j >> 6;
        const uint64_t mask = 1ULL << (j & 63);
        if (x[j] < 0.0f)
            v->sign[word] |= mask;
        int level = (int)(fabsf(x[j]) * to_units + 0.5f);
        if (level > K)
            level = K;
        for (int p = 0; p < level; p++)
            v->unary[(size_t)p * chunks + word] |= mask;
    }
}
/*
 * Dequantize a thermometer vector back to floats.
 * Magnitude of element j = number of planes with bit j set, times scale;
 * the sign plane decides the final sign.
 */
void tuv_to_float(const TrueUnaryVec *v, float *out) {
    const int chunks = v->chunks;
    for (int j = 0; j < v->dim; j++) {
        const int word = j >> 6;
        const uint64_t mask = 1ULL << (j & 63);
        /* Vertical count across planes = base-1 magnitude. */
        int count = 0;
        const uint64_t *plane = v->unary + word;
        for (int p = 0; p < v->K; p++, plane += chunks)
            count += (*plane & mask) != 0;
        const float magnitude = (float)count * v->scale;
        out[j] = (v->sign[word] & mask) ? -magnitude : magnitude;
    }
}
/*
 * Quantize a row-major float matrix into thermometer form.
 * Each row gets its own scale = row absmax / K; a zero row keeps
 * scale 1.0f with no bits set.
 */
void tum_from_float(TrueUnaryMat *m, const float *data) {
    const int rows = m->rows;
    const int cols = m->cols;
    const int K = m->K;
    const int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *src = data + (size_t)r * cols;

        /* Per-row absolute maximum. */
        float peak = 0.0f;
        for (int j = 0; j < cols; j++) {
            const float a = fabsf(src[j]);
            if (a > peak)
                peak = a;
        }
        if (peak == 0.0f) {
            m->scales[r] = 1.0f;
            continue;
        }
        m->scales[r] = peak / K;
        const float to_units = K / peak;

        uint64_t *sign_row = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            const int word = j >> 6;
            const uint64_t mask = 1ULL << (j & 63);
            if (src[j] < 0.0f)
                sign_row[word] |= mask;
            int level = (int)(fabsf(src[j]) * to_units + 0.5f);
            if (level > K)
                level = K;
            /* Thermometer: planes 0..level-1 get this element's bit. */
            for (int p = 0; p < level; p++)
                m->unary[((size_t)p * rows + r) * chunks + word] |= mask;
        }
    }
}
/* ============================================================
* TRUE UNARY MATVEC: y = M @ x
*
* THE CORE OPERATION.
*
* For each output element y[i]:
* For each pair of planes (p from weight, q from activation):
* active = w_plane_p[i] AND x_plane_q
* same = active AND ~(w_sign[i] XOR x_sign)
* diff = active AND (w_sign[i] XOR x_sign)
* acc += popcount(same) - popcount(diff)
*
* EVERY PLANE PAIR HAS WEIGHT = 1.
* No shifts. No scaling between planes. No corrections.
* The count IS the answer.
*
* y[i] = acc * w_scale[i] * x_scale
* (single float multiply at the very end)
*
* ============================================================ */
/*
 * y = M @ x in pure base-1: every (weight-plane, activation-plane) pair
 * contributes popcount(AND) with weight exactly 1; matching signs add,
 * opposite signs subtract. One float rescale per output element at the end.
 *
 * NOTE(review): assumes x->dim == M->cols (same chunk count) and
 * x->chunks == M->chunks — callers in this file guarantee it; verify
 * before reusing elsewhere.
 */
void tum_matvec(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out /* float output, requantize externally if needed */
) {
    const int rows = M->rows;
    const int chunks = M->chunks;
    const int wK = M->K;
    const int xK = x->K;
    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < rows; i++) {
        const uint64_t *sign_row = M->sign + (size_t)i * chunks;
        long long total = 0;
        for (int c = 0; c < chunks; c++) {
            /* sign_diff bit set => weight and activation disagree in sign. */
            const uint64_t sign_diff = sign_row[c] ^ x->sign[c];
            for (int p = 0; p < wK; p++) {
                const uint64_t wbits =
                    M->unary[((size_t)p * rows + i) * chunks + c];
                for (int q = 0; q < xK; q++) {
                    const uint64_t both =
                        wbits & x->unary[(size_t)q * chunks + c];
                    /* Each surviving bit counts +1 (same sign) or -1. */
                    total += __builtin_popcountll(both & ~sign_diff);
                    total -= __builtin_popcountll(both & sign_diff);
                }
            }
        }
        /* Single float rescale per output element. */
        y_out[i] = (float)total * M->scales[i] * x->scale;
    }
}
/* ============================================================
 * OPTIMIZED MATVEC: collapse the bitplanes first
 *
 * The naive kernel touches wK * xK plane pairs per chunk. But since
 * every bit has weight 1, element j's total contribution is simply
 *     w_mag[j] * x_mag[j] * sign_j
 * where w_mag / x_mag count the set planes per element and sign_j is
 * +1 when the weight and activation signs agree, -1 otherwise.
 *
 * So: vertically accumulate each operand's planes into per-element
 * counts, then take an ordinary signed integer dot product. This
 * reduces the per-row work from O(wK * xK * chunks) to
 * O((wK + xK) * chunks + cols) and produces bit-identical results.
 * ============================================================ */
/*
 * y = M @ x via vertical plane accumulation — exactly equal to
 * tum_matvec() output, but O((wK + xK) * chunks + cols) per row.
 *
 * Fixes vs original:
 *  - aligned_alloc(C11) requires size % alignment == 0; the original
 *    rounded the *element* count to 16, giving a byte size that is only
 *    a multiple of 32 with 64-byte alignment (UB). Bytes are now rounded
 *    up to a multiple of 64.
 *  - Allocation results are checked; OOM aborts with a message instead
 *    of dereferencing NULL (matters for the per-thread buffer inside the
 *    parallel region).
 */
void tum_matvec_fast(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out
) {
    const int out_dim = M->rows;
    const int cols = M->cols;
    const int chunks = M->chunks;
    const int xK = x->K;
    /* Byte size for one per-element magnitude buffer, 64-byte padded. */
    const size_t mag_bytes = (((size_t)cols * sizeof(int16_t)) + 63) & ~(size_t)63;

    /* Step 1: collapse activation planes into per-element magnitudes:
     * x_mag[j] = number of x planes with bit j set (0..xK). */
    int16_t *x_mag = (int16_t *)aligned_alloc(64, mag_bytes);
    if (!x_mag) {
        fprintf(stderr, "tum_matvec_fast: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memset(x_mag, 0, mag_bytes);
    for (int q = 0; q < xK; q++) {
        const uint64_t *xplane = x->unary + (size_t)q * chunks;
        for (int c = 0; c < chunks; c++) {
            uint64_t bits = xplane[c];
            while (bits) {
                int j = c * 64 + __builtin_ctzll(bits);
                if (j < cols) x_mag[j]++;
                bits &= bits - 1; /* clear lowest set bit */
            }
        }
    }
    /* x_mag stays unsigned: the sign of each product depends on the
     * weight row's sign bits, so it is resolved per output element. */

    /* Step 2: per output row, vertically popcount the weight planes into
     * w_mag, then y[i] = sum_j +/-(w_mag[j] * x_mag[j]), with '+' when the
     * weight and activation signs agree. */
    #pragma omp parallel
    {
        /* One scratch magnitude buffer per thread. */
        int16_t *w_mag = (int16_t *)aligned_alloc(64, mag_bytes);
        if (!w_mag) {
            fprintf(stderr, "tum_matvec_fast: out of memory\n");
            exit(EXIT_FAILURE);
        }
        #pragma omp for schedule(dynamic, 32)
        for (int i = 0; i < out_dim; i++) {
            memset(w_mag, 0, mag_bytes);
            /* Vertical popcount: count set planes per element. */
            for (int p = 0; p < M->K; p++) {
                const uint64_t *wplane =
                    M->unary + ((size_t)p * out_dim + i) * chunks;
                for (int c = 0; c < chunks; c++) {
                    uint64_t bits = wplane[c];
                    while (bits) {
                        int j = c * 64 + __builtin_ctzll(bits);
                        if (j < cols) w_mag[j]++;
                        bits &= bits - 1;
                    }
                }
            }
            /* Signed integer dot product. */
            const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
            long long acc = 0;
            for (int j = 0; j < cols; j++) {
                const int c = j / 64;
                const uint64_t bit = 1ULL << (j % 64);
                const int product = (int)w_mag[j] * (int)x_mag[j];
                acc += ((w_sign_row[c] ^ x->sign[c]) & bit) ? -product : product;
            }
            /* Single float rescale per output element. */
            y_out[i] = (float)acc * M->scales[i] * x->scale;
        }
        free(w_mag);
    }
    free(x_mag);
}
/* ============================================================
* BENCHMARK + ACCURACY
* ============================================================ */
/* Metrics collected by one tum_test() run (accuracy vs float reference
 * plus timing for both matvec kernels). */
typedef struct {
    float cosine;      /* cosine similarity of unary output vs float reference */
    float snr_db;      /* signal-to-noise ratio of the unary output, in dB */
    float max_rel_err; /* worst per-element relative error vs reference */
    double ms_naive;   /* mean wall time per tum_matvec call, milliseconds */
    double ms_fast;    /* mean wall time per tum_matvec_fast call, milliseconds */
    double gops_naive; /* effective GOPS of the naive kernel (2*rows*cols ops) */
    double gops_fast;  /* effective GOPS of the fast kernel */
} TestResult;
/* Draw one standard-normal sample via Box–Muller.
 * Uses `(float)rand() + 1.0f` instead of the original `rand() + 1`:
 * when rand() returns RAND_MAX == INT_MAX (typical), `rand() + 1` is
 * signed-integer overflow — undefined behavior. */
static float tum_test_gauss(void) {
    float u1 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    float u2 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/*
 * Build a random rows x cols problem, quantize to true unary with the
 * given plane counts (wK weights, xK activations), and measure:
 *  - accuracy of the naive kernel vs a float reference (cosine, SNR,
 *    max relative error),
 *  - per-call wall time and effective GOPS for both kernels,
 *  - agreement between the naive and fast kernels (warns if they differ).
 * Deterministic: reseeds rand() with 42 so every config sees the same data.
 */
TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
    TestResult r = {0};
    srand(42);
    /* Random float matrix and vector (standard normal). */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc((size_t)cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_naive = (float *)malloc((size_t)rows * sizeof(float));
    float *y_fast = (float *)malloc((size_t)rows * sizeof(float));
    if (!Mf || !xf || !y_ref || !y_naive || !y_fast) {
        fprintf(stderr, "tum_test: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < (size_t)rows * cols; i++)
        Mf[i] = tum_test_gauss();
    for (int i = 0; i < cols; i++)
        xf[i] = tum_test_gauss();
    /* Float reference: y_ref = Mf @ xf. */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
    /* Convert to true unary. */
    TrueUnaryMat *M = tum_alloc(rows, cols, wK);
    TrueUnaryVec *x = tuv_alloc(cols, xK);
    tum_from_float(M, Mf);
    tuv_from_float(x, xf);
    /* Naive matvec: one warmup call, then timed iterations. */
    struct timespec t0, t1;
    tum_matvec(M, x, y_naive); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec(M, x, y_naive);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
    /* Fast matvec, same protocol. */
    tum_matvec_fast(M, x, y_fast); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec_fast(M, x, y_fast);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
    /* Accuracy vs float reference. */
    float dot = 0, na = 0, nb = 0, max_re = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_naive[i];
        na += y_ref[i] * y_ref[i];
        nb += y_naive[i] * y_naive[i];
        float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
        if (re > max_re) max_re = re;
    }
    r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_naive[i];
        noise += e * e;
    }
    r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
    r.max_rel_err = max_re;
    /* The fast kernel must reproduce the naive kernel exactly
     * (both compute the same integer sum; only float rescale follows). */
    float fast_err = 0;
    for (int i = 0; i < rows; i++) {
        float e = fabsf(y_naive[i] - y_fast[i]);
        if (e > fast_err) fast_err = e;
    }
    if (fast_err > 0.01f)
        printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err);
    /* GOPS = 2*rows*cols ops per call / (ms per call * 1e6).
     * (The original multiplied and divided by iters — same value.) */
    double ops = 2.0 * rows * cols;
    r.gops_naive = ops / (r.ms_naive * 1e6);
    r.gops_fast = ops / (r.ms_fast * 1e6);
    tum_free(M); tuv_free(x);
    free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
    return r;
}
/* ============================================================
* MAIN: sweep K values, show accuracy + speed tradeoff
* ============================================================ */
/* Box–Muller sample for the demo section below.
 * Uses `(float)rand() + 1.0f` instead of `rand() + 1`, which is
 * signed-integer overflow (UB) when rand() returns RAND_MAX == INT_MAX. */
static float demo_gauss(void) {
    float u1 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    float u2 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/* Sweep (wK, xK) plane-count configs on a fixed-size matrix, printing the
 * accuracy/speed/memory tradeoff, then dump sample outputs for one config. */
int main(void) {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");
    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560). */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);
    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");
    struct { int wK; int xK; } configs[] = {
        {8, 4},
        {8, 8},
        {16, 8},
        {16, 16},
        {32, 8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);
    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        /* Large plane counts are slow in the naive kernel: fewer iters. */
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;
        TestResult r = tum_test(rows, cols, wK, xK, iters);
        /* Memory footprint of this layer's quantized weights. */
        size_t sign_bytes = (size_t)rows * ((cols + 63) / 64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols + 63) / 64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;
        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }
    /* Show first 10 values for the wK=32, xK=16 case. */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc((size_t)sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc((size_t)sr * sizeof(float));
        if (!Mf || !xf || !y_ref || !y_unary) {
            fprintf(stderr, "main: out of memory\n");
            return 1;
        }
        for (size_t i = 0; i < (size_t)sr * sc; i++)
            Mf[i] = demo_gauss();
        for (int i = 0; i < sc; i++)
            xf[i] = demo_gauss();
        /* Float reference for the sample rows. */
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];
        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);
        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);
        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }
    printf("\n=== DONE ===\n");
    return 0;
}