/*
 * unary-quantization-research / proper_unary.c
 * (Repository page residue: "OpenTransformer's picture";
 *  "Add files using upload-large-folder tool"; 19ed98b verified)
 */
/*
* PROPER UNARY — ONE QUANTUM, NO SCALES
*
* Every single bit in the entire system has weight = 1 quantum.
* The quantum is set ONCE for the whole model.
* There are NO per-vector scales. NO per-row scales.
*
* The number 5.0 with quantum=0.1 is stored as 50 ones.
* The number 5.0 with quantum=0.01 is stored as 500 ones.
* More precision = more bits. That's the tradeoff.
*
* ADDITION = CONCATENATION. Always. No exceptions.
* Because every bit everywhere means the same thing.
*
* MATMUL: y[i] = sum_j W[i][j] * x[j]
* = sum over all (w_slot, x_slot) pairs:
* popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
* - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
* = quantum² * integer_count
*
* Output quantum = input_quantum² (magnitude grows)
* Or we pick output quantum = input_quantum and accept
* that the integer count includes the scaling.
*
* (c) 2026 OpenTransformers Ltd / Scott Bisset
*/
#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
/* ============================================================
* PROPER UNARY VECTOR
* Every bit = 1 quantum. No local scale.
* ============================================================ */
/*
 * Unary-coded vector stored as bit-planes.
 * Bit (i % 64) of word (i / 64) in a plane belongs to element i.
 * An element's magnitude is the number of in-use slots whose bit is set
 * (that is how uv_to_float decodes it); the sign lives in one shared plane.
 */
typedef struct {
uint64_t *sign; /* [chunks] — bit set when the element is negative */
uint64_t *slots; /* cap * chunks words allocated; slot s starts at s * chunks, first n_slots slots are in use */
int dim; /* number of elements */
int chunks; /* 64-bit words per plane: (dim + 63) / 64 */
int n_slots; /* slots currently in use */
int cap; /* max slots allocated */
} UVec;
/*
 * Proper unary matrix — same quantum as vectors.
 * Each row is a bit-plane row of `chunks` words covering `cols` columns.
 * Slot planes are stored slot-major: the word for (slot s, row r, chunk c)
 * is slots[(s * rows + r) * chunks + c].
 */
typedef struct {
uint64_t *sign; /* [rows * chunks] — row r starts at r * chunks; bit set when the weight is negative */
uint64_t *slots; /* [K * rows * chunks] */
int rows, cols, chunks, K; /* chunks = (cols + 63) / 64; K = slots per weight */
} UMat;
/*
 * Global system quantum.
 * NOTE(review): no function visible in this file takes a USystem; they all
 * receive the quantum as a plain float parameter.
 */
typedef struct {
float quantum; /* every bit = this much */
/* quantum² is the matmul output unit */
} USystem;
/* ============================================================
* ALLOC
* ============================================================ */
/*
 * Allocate a unary vector with `dim` elements and storage for `cap` slots.
 *
 * The sign plane and all slot planes are zero-filled and n_slots starts at 0.
 * Both buffers are 64-byte aligned. The byte counts passed to aligned_alloc()
 * are rounded up to a multiple of 64 (and never 0), because C11 requires the
 * size to be an integral multiple of the alignment and some C libraries
 * return NULL otherwise.
 *
 * Returns NULL when dim or cap is negative, when the size computation would
 * overflow size_t, or when an allocation fails.
 * The result is released with free() on sign, slots and the struct itself.
 */
UVec* uv_new(int dim, int cap) {
    if (dim < 0 || cap < 0)
        return NULL;

    /* Words per bit-plane, computed without risking int overflow in dim + 63. */
    size_t chunks = (size_t)dim / 64 + ((dim % 64) != 0);
    size_t plane_bytes = chunks * sizeof(uint64_t);
    if (cap > 0 && plane_bytes > (SIZE_MAX - 63) / (size_t)cap)
        return NULL;

    size_t sign_bytes = (plane_bytes + 63) / 64 * 64;
    size_t slot_bytes = ((size_t)cap * plane_bytes + 63) / 64 * 64;
    if (sign_bytes == 0) sign_bytes = 64;
    if (slot_bytes == 0) slot_bytes = 64;

    UVec *v = (UVec *)calloc(1, sizeof(UVec));
    if (!v)
        return NULL;
    v->dim = dim;
    v->chunks = (int)chunks;
    v->n_slots = 0;
    v->cap = cap;

    v->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    v->slots = (uint64_t *)aligned_alloc(64, slot_bytes);
    if (!v->sign || !v->slots) {
        free(v->sign);
        free(v->slots);
        free(v);
        return NULL;
    }

    memset(v->sign, 0, sign_bytes);
    memset(v->slots, 0, slot_bytes);
    return v;
}
/*
 * Allocate a zero-filled unary matrix of `rows` x `cols` weights with `K`
 * slot planes per weight.
 *
 * Both buffers are 64-byte aligned. The byte counts passed to aligned_alloc()
 * are rounded up to a multiple of 64 (and never 0), because C11 requires the
 * size to be an integral multiple of the alignment and some C libraries
 * return NULL otherwise.
 *
 * Returns NULL when any argument is negative, when the size computation would
 * overflow size_t, or when an allocation fails.
 * The result is released with free() on sign, slots and the struct itself.
 */
UMat* um_new(int rows, int cols, int K) {
    if (rows < 0 || cols < 0 || K < 0)
        return NULL;

    /* Words per row plane, computed without risking int overflow in cols + 63. */
    size_t chunks = (size_t)cols / 64 + ((cols % 64) != 0);
    size_t row_bytes = chunks * sizeof(uint64_t);
    if (rows > 0 && row_bytes > (SIZE_MAX - 63) / (size_t)rows)
        return NULL;
    size_t plane_bytes = (size_t)rows * row_bytes;      /* one full sign/slot plane */
    if (K > 0 && plane_bytes > (SIZE_MAX - 63) / (size_t)K)
        return NULL;

    size_t sign_bytes = (plane_bytes + 63) / 64 * 64;
    size_t slot_bytes = ((size_t)K * plane_bytes + 63) / 64 * 64;
    if (sign_bytes == 0) sign_bytes = 64;
    if (slot_bytes == 0) slot_bytes = 64;

    UMat *m = (UMat *)calloc(1, sizeof(UMat));
    if (!m)
        return NULL;
    m->rows = rows;
    m->cols = cols;
    m->K = K;
    m->chunks = (int)chunks;

    m->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    m->slots = (uint64_t *)aligned_alloc(64, slot_bytes);
    if (!m->sign || !m->slots) {
        free(m->sign);
        free(m->slots);
        free(m);
        return NULL;
    }

    memset(m->sign, 0, sign_bytes);
    memset(m->slots, 0, slot_bytes);
    return m;
}
/* Release a vector's sign plane, slot planes and the struct itself; NULL is ignored. */
void uv_free(UVec *v) {
    if (v == NULL)
        return;
    free(v->sign);
    free(v->slots);
    free(v);
}
/* Release a matrix's sign plane, slot planes and the struct itself; NULL is ignored. */
void um_free(UMat *m) {
    if (m == NULL)
        return;
    free(m->sign);
    free(m->slots);
    free(m);
}
/* ============================================================
* QUANTIZE: float → proper unary
*
* Given global quantum q:
* magnitude = round(|value| / q)
* That many slots get bit set.
*
* NO per-vector absmax. NO local scale.
* Magnitudes above K quanta are clipped to K (|value| saturates at K * q).
* ============================================================ */
/*
 * Quantize `x` (v->dim floats) into `v` using `K` slots and the given quantum.
 *
 * Each element gets magnitude round(|x[i]| / quantum), saturated at K, encoded
 * by setting its bit in slots 0 .. magnitude-1. The sign bit is set for
 * x[i] < 0. Previous contents of the sign plane and the first K slots are
 * cleared and v->n_slots becomes K.
 *
 * K is clamped to [0, v->cap] (with a warning on stderr when it exceeds the
 * capacity) so the slot buffer is never overrun. The magnitude is clamped in
 * floating point before the integer conversion, so huge or infinite ratios
 * saturate at K and NaN encodes as 0 instead of invoking undefined behavior.
 */
void uv_from_float(UVec *v, const float *x, int K, float quantum) {
    int dim = v->dim, chunks = v->chunks;

    if (K < 0)
        K = 0;
    if (K > v->cap) {
        fprintf(stderr, "uv_from_float: K=%d exceeds capacity %d, clamping\n", K, v->cap);
        K = v->cap;
    }

    v->n_slots = K;
    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);
        if (x[i] < 0.0f) v->sign[c] |= bit;

        /* Round to nearest; compare as float first so the cast is always in range. */
        float scaled = fabsf(x[i]) * inv_q + 0.5f;
        int mag = 0;
        if (scaled >= (float)K)
            mag = K;                /* clip */
        else if (scaled >= 1.0f)
            mag = (int)scaled;
        /* NaN fails both comparisons and stays 0. */

        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}
/*
 * Decode a unary vector into floats.
 *
 * For every element the number of in-use slots with its bit set is counted;
 * out[i] is that count times `quantum`, negated when the element's sign bit
 * is set. `out` must hold v->dim floats.
 */
void uv_to_float(const UVec *v, float *out, float quantum) {
    const int words = v->chunks;
    const int total = v->dim;

    for (int elem = 0; elem < total; elem++) {
        const int word = elem / 64;
        const unsigned shift = (unsigned)(elem % 64);

        /* Tally this element's bit across every slot plane in use. */
        int count = 0;
        for (int plane = 0; plane < v->n_slots; plane++) {
            uint64_t w = v->slots[(size_t)plane * words + word];
            count += (int)((w >> shift) & 1u);
        }

        float value = (float)count * quantum;
        if ((v->sign[word] >> shift) & 1u)
            value = -value;
        out[elem] = value;
    }
}
/*
 * Quantize a row-major rows x cols float matrix into `m`.
 *
 * Each weight gets magnitude round(|w| / quantum), saturated at m->K, encoded
 * by setting its bit in slot planes 0 .. magnitude-1 of its row. The sign bit
 * is set for w < 0. All sign and slot planes are cleared first.
 *
 * The magnitude is clamped in floating point before the integer conversion,
 * so huge or infinite ratios saturate at K and NaN encodes as 0 instead of
 * invoking undefined behavior in the float-to-int cast.
 */
void um_from_float(UMat *m, const float *data, float quantum) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;
    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    float inv_q = 1.0f / quantum;
    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        uint64_t *rs = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* Round to nearest; compare as float first so the cast is always in range. */
            float scaled = fabsf(row[j]) * inv_q + 0.5f;
            int mag = 0;
            if (scaled >= (float)K)
                mag = K;            /* clip */
            else if (scaled >= 1.0f)
                mag = (int)scaled;
            /* NaN fails both comparisons and stays 0. */

            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}
/* ============================================================
* CONCATENATION = ADDITION
*
* Since every bit everywhere = same quantum,
* appending slots IS adding magnitudes. Period.
*
* Sign handling: for elements where signs differ,
* cancel bits from existing slots.
* ============================================================ */
/*
 * Add `src` into `dst` in place by appending slot planes.
 *
 * Each non-empty source slot is processed in order:
 *   - bits whose sign agrees with dst are appended in a new dst slot;
 *   - bits whose sign differs cancel one set bit of the same element, searching
 *     dst's slots from the most recent downwards;
 *   - a differing bit that finds nothing to cancel means src outweighs dst for
 *     that element: dst's sign is flipped and the bit is appended instead.
 * A new slot is only kept (n_slots incremented) when it received any bit.
 *
 * Source slots with no bits set are skipped up front: they contribute nothing,
 * so they must not trip the capacity check when dst is already full.
 *
 * If the two vectors do not have the same number of words per plane, a message
 * is printed and dst is left untouched. If dst runs out of slots, "OVERFLOW"
 * is printed and the function returns with the earlier source slots already
 * applied. dst and src must be distinct objects.
 */
void uv_concat(UVec *dst, const UVec *src) {
    int chunks = dst->chunks;
    if (src->chunks != chunks) {
        printf("MISMATCH: dst has %d chunks, src has %d\n", chunks, src->chunks);
        return;
    }

    for (int s = 0; s < src->n_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

        /* Nothing to add or cancel from an empty plane. */
        int src_has_bits = 0;
        for (int c = 0; c < chunks && !src_has_bits; c++)
            if (src_slot[c]) src_has_bits = 1;
        if (!src_has_bits)
            continue;

        if (dst->n_slots >= dst->cap) {
            printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
            return;
        }

        uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks;
        for (int c = 0; c < chunks; c++) {
            uint64_t sb = src_slot[c];
            uint64_t disagree = dst->sign[c] ^ src->sign[c];
            uint64_t agree = ~disagree;

            /* Same sign: straight append */
            uint64_t add = sb & agree;

            /* Different sign: cancel from existing, newest slot first */
            uint64_t cancel = sb & disagree;
            for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
                uint64_t *ds = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *ds & cancel;
                *ds &= ~overlap;
                cancel &= ~overlap;
            }

            /* Leftover cancel = src magnitude exceeds dst, flip sign */
            if (cancel) {
                dst->sign[c] ^= cancel;
                add |= cancel;
            }
            new_slot[c] = add;
        }

        /* Keep the new slot only if it received any bit */
        int any = 0;
        for (int c = 0; c < chunks && !any; c++)
            if (new_slot[c]) any = 1;
        if (any) dst->n_slots++;
    }
}
/* ============================================================
* MATMUL: y = M @ x
*
* Output unit = quantum² (one quantum from weight × one from activation)
* The integer accumulator directly gives the value in units of quantum².
*
* To keep everything in the same quantum system:
* y_float[i] = acc * quantum²
* Then requantize to unary with the SAME global quantum.
* y_mag[i] = acc * quantum² / quantum = acc * quantum
*
* ============================================================ */
/*
 * y = M @ x on unary operands, re-quantized to the same global quantum.
 *
 * For every output row the signed count
 *     sum over (weight slot p, activation slot q) of
 *         popcount(w_p & x_q & same_sign) - popcount(w_p & x_q & diff_sign)
 * is accumulated as an integer. That count is in units of quantum^2, so
 * count * quantum is the result expressed in quanta; it is rounded to the
 * nearest integer, saturated at K_out and written into y as unary bits.
 *
 * M       : weights (rows x cols); x must have the same words-per-plane.
 * y       : output vector; needs dim >= M->rows. Its sign plane and first
 *           K_out slots are overwritten and n_slots is set to K_out.
 * K_out   : number of output slots; clamped to [0, y->cap] (warning on stderr).
 * quantum : the global quantum shared by M, x and y.
 *
 * On a shape mismatch or allocation failure a message goes to stderr; in the
 * mismatch case y is left untouched, in the allocation case y is left zeroed.
 */
void uv_matmul(
    const UMat *M, const UVec *x,
    UVec *y, int K_out, float quantum
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    if (x->chunks != chunks || y->dim < out_dim) {
        fprintf(stderr, "uv_matmul: shape mismatch (M %dx%d, x dim %d, y dim %d)\n",
                M->rows, M->cols, x->dim, y->dim);
        return;
    }
    if (K_out < 0)
        K_out = 0;
    if (K_out > y->cap) {
        fprintf(stderr, "uv_matmul: K_out=%d exceeds capacity %d, clamping\n", K_out, y->cap);
        K_out = y->cap;
    }

    y->n_slots = K_out;
    memset(y->sign, 0, (size_t)y->chunks * sizeof(uint64_t));
    memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));
    if (out_dim <= 0)
        return;

    /* Per-row magnitude and sign, filled in parallel and encoded serially below
     * (several rows share one output word, so the bit writes are not done in
     * the parallel loop). */
    int *acc = (int *)malloc((size_t)out_dim * sizeof *acc);
    uint8_t *neg = (uint8_t *)calloc((size_t)out_dim, 1);
    if (!acc || !neg) {
        fprintf(stderr, "uv_matmul: out of memory\n");
        free(acc);
        free(neg);
        return;
    }

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long a = 0;
        for (int c = 0; c < chunks; c++) {
            uint64_t diff = w_sign_row[c] ^ x->sign[c];
            uint64_t same = ~diff;
            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
                if (!wp)
                    continue;       /* empty weight word contributes nothing */
                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    a += __builtin_popcountll(active & same)
                       - __builtin_popcountll(active & diff);
                }
            }
        }

        /* a counts quantum^2 units; a * quantum is the value in quanta. */
        float val = (float)a * quantum;

        /* Round to nearest; compare as float first so the cast is always in range. */
        float scaled = fabsf(val) + 0.5f;
        int mag = 0;
        if (scaled >= (float)K_out)
            mag = K_out;
        else if (scaled >= 1.0f)
            mag = (int)scaled;

        acc[i] = mag;
        neg[i] = (val < 0.0f) ? 1 : 0;
    }

    /* Encode the integer magnitudes as unary bits */
    for (int i = 0; i < out_dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);
        if (neg[i]) y->sign[c] |= bit;
        for (int s = 0; s < acc[i]; s++)
            y->slots[(size_t)s * y->chunks + c] |= bit;
    }

    free(acc);
    free(neg);
}
/* ============================================================
* RMSNORM — resets slot count, keeps same quantum
* ============================================================ */
/*
 * RMS-normalize `x` and re-encode the result into `out` with K_out slots.
 *
 * x is decoded to floats, each element is multiplied by
 * weight[i] / sqrt(mean(x^2) + eps), and the result is quantized with the
 * same quantum via uv_from_float (which resets out's slot count).
 *
 * weight must hold x->dim floats. out may not have more elements than x,
 * since only x->dim normalized values exist to encode. On that mismatch or on
 * allocation failure a message goes to stderr and out is left untouched.
 *
 * The scratch buffer comes from malloc: it needs no special alignment, and
 * aligned_alloc would require the byte count to be a multiple of the alignment.
 */
void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
    int dim = x->dim;
    if (out->dim > dim) {
        fprintf(stderr, "uv_rmsnorm: out dim %d exceeds input dim %d\n", out->dim, dim);
        return;
    }

    float *xf = (float *)malloc((dim > 0 ? (size_t)dim : 1) * sizeof *xf);
    if (!xf) {
        fprintf(stderr, "uv_rmsnorm: out of memory\n");
        return;
    }

    uv_to_float(x, xf, quantum);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);    /* reciprocal RMS */
    for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];

    uv_from_float(out, xf, K_out, quantum);
    free(xf);
}
/* ============================================================
* TESTS
* ============================================================ */
/* Print `label`, then `n` floats each formatted as "%6.2f ", then a newline. */
static void concat_test_print_row(const char *label, const float *row, int n) {
    printf("%s", label);
    for (int k = 0; k < n; k++)
        printf("%6.2f ", row[k]);
    printf("\n");
}

/*
 * Demo: encode two 8-element vectors with quantum 0.25, concatenate B into A
 * and print the decoded result next to the expected element-wise sum.
 * The largest input, 7.0, needs 28 slots at this quantum, so K = 32 suffices.
 */
void test_concat_correct() {
    printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

    const float quantum = 0.25f; /* every bit = 0.25 */
    const int dim = 8;
    const int K = 32;

    float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
    float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
    float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};

    UVec *a = uv_new(dim, 128);
    UVec *b = uv_new(dim, 128);
    uv_from_float(a, a_vals, K, quantum);
    uv_from_float(b, b_vals, K, quantum);

    /* Round-trip both inputs to show the quantized values */
    float a_rec[8], b_rec[8];
    uv_to_float(a, a_rec, quantum);
    uv_to_float(b, b_rec, quantum);

    printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum);
    concat_test_print_row("A original: ", a_vals, 8);
    concat_test_print_row("A unary: ", a_rec, 8);
    concat_test_print_row("B original: ", b_vals, 8);
    concat_test_print_row("B unary: ", b_rec, 8);
    printf("\n");

    printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots);
    uv_concat(a, b);
    printf("After concat: %d slots\n\n", a->n_slots);

    float result[8];
    uv_to_float(a, result, quantum);

    concat_test_print_row("Expected A+B: ", expect, 8);
    concat_test_print_row("Concat A+B: ", result, 8);
    printf("Error: ");
    for (int k = 0; k < 8; k++)
        printf("%6.2f ", expect[k] - result[k]);
    printf("\n");

    uv_free(a);
    uv_free(b);
}
/*
 * Demo: start from a 4-element vector (quantum 0.1, 64 slots) and concatenate
 * the same delta five times, printing the decoded accumulator, its slot count,
 * the float reference and the largest absolute deviation after each step.
 */
void test_chain_concat() {
    printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

    const float quantum = 0.1f;
    const int dim = 4;
    const int K = 64;

    float vals[] = {1.0, -2.0, 3.0, -0.5};
    float expected[] = {1.0, -2.0, 3.0, -0.5};   /* float reference, updated per step */
    float tmp[4];

    UVec *acc = uv_new(dim, 512);
    uv_from_float(acc, vals, K, quantum);

    printf("Start: ");
    uv_to_float(acc, tmp, quantum);
    for (int k = 0; k < 4; k++)
        printf("%6.2f ", tmp[k]);
    printf(" (%d slots)\n", acc->n_slots);

    int step = 0;
    while (step < 5) {
        float add_vals[] = {0.5, 0.3, -1.0, 0.7};

        /* Unary path: quantize the delta and concatenate it */
        UVec *delta = uv_new(dim, K);
        uv_from_float(delta, add_vals, K, quantum);
        uv_concat(acc, delta);

        /* Float path */
        for (int k = 0; k < 4; k++)
            expected[k] += add_vals[k];

        uv_to_float(acc, tmp, quantum);
        printf(" +[0.5,0.3,-1.0,0.7] = ");
        for (int k = 0; k < 4; k++)
            printf("%6.2f ", tmp[k]);
        printf(" (%d slots) expect:", acc->n_slots);
        for (int k = 0; k < 4; k++)
            printf("%6.2f ", expected[k]);

        /* Largest absolute deviation from the reference */
        float max_err = 0;
        for (int k = 0; k < 4; k++) {
            float e = fabsf(expected[k] - tmp[k]);
            max_err = (e > max_err) ? e : max_err;
        }
        printf(" err=%.2f\n", max_err);

        uv_free(delta);
        step++;
    }

    uv_free(acc);
}
/*
 * Demo: 512x256 random matrix times a random vector (seed 42).
 * The quantum is chosen so the largest |input| maps to wK slots; the unary
 * result is timed and compared to a float reference via cosine similarity
 * and SNR, and the first ten outputs are listed.
 */
void test_matmul() {
    printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

    const int rows = 512, cols = 256;
    const int wK = 32, xK = 32;
    const size_t n_weights = (size_t)rows * cols;

    srand(42);
    float *Mf = (float *)malloc(n_weights * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));

    /* Small values so magnitudes fit in K slots; matrix is drawn before the vector */
    for (size_t k = 0; k < n_weights; k++)
        Mf[k] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    for (int j = 0; j < cols; j++)
        xf[j] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;

    /* Float reference product */
    for (int r = 0; r < rows; r++) {
        const float *w_row = Mf + (size_t)r * cols;
        for (int j = 0; j < cols; j++)
            y_ref[r] += w_row[j] * xf[j];
    }

    /* Largest magnitude across both inputs decides the quantum */
    float data_max = 0;
    for (size_t k = 0; k < n_weights; k++) {
        float a = fabsf(Mf[k]);
        data_max = (a > data_max) ? a : data_max;
    }
    for (int j = 0; j < cols; j++) {
        float a = fabsf(xf[j]);
        data_max = (a > data_max) ? a : data_max;
    }
    float quantum = data_max / wK;

    printf("Data range: [-%.2f, %.2f]\n", data_max, data_max);
    printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK);
    printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK);

    UMat *M = um_new(rows, cols, wK);
    UVec *x = uv_new(cols, xK);
    um_from_float(M, Mf, quantum);
    uv_from_float(x, xf, xK, quantum);

    /* Output slot count sized from the reference range, capped at 4096 */
    float ymax = 0;
    for (int r = 0; r < rows; r++) {
        float a = fabsf(y_ref[r]);
        ymax = (a > ymax) ? a : ymax;
    }
    int K_out = (int)(ymax / quantum + 1);
    if (K_out > 4096)
        K_out = 4096;
    printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out);

    UVec *y = uv_new(rows, K_out);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    uv_matmul(M, x, y, K_out, quantum);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

    float *yf = (float *)malloc(rows * sizeof(float));
    uv_to_float(y, yf, quantum);

    /* Similarity metrics against the reference */
    float dot = 0, na = 0, nb = 0, noise = 0;
    for (int r = 0; r < rows; r++) {
        dot += y_ref[r] * yf[r];
        na += y_ref[r] * y_ref[r];
        nb += yf[r] * yf[r];
        float e = y_ref[r] - yf[r];
        noise += e * e;
    }
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf("\nCosine: %.6f\n", cosine);
    printf("SNR: %.1f dB\n", snr);
    printf("Time: %.1f ms\n", ms);

    printf("\nFirst 10 values:\n");
    printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
    for (int r = 0; r < 10; r++)
        printf("%10.4f %10.4f %10.4f\n", y_ref[r], yf[r], y_ref[r] - yf[r]);

    um_free(M);
    uv_free(x);
    uv_free(y);
    free(Mf);
    free(xf);
    free(y_ref);
    free(yf);
}
/*
 * Demo: a 1024-element random "embedding" (seed 123) receives ten random
 * residual updates. The float reference adds them directly; the unary vector
 * absorbs them through uv_concat. After each step the slot count, cosine
 * similarity and maximum absolute error against the reference are printed.
 */
void test_residual_chain() {
    printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

    const float quantum = 0.05f;
    const int dim = 1024;
    const int K = 128; /* fits values up to 6.4 */
    const int n_steps = 10;

    srand(123);
    float *embed = (float *)malloc(dim * sizeof(float));
    for (int k = 0; k < dim; k++)
        embed[k] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

    /* Float reference starts as a copy of the embedding */
    float *ref = (float *)malloc(dim * sizeof(float));
    memcpy(ref, embed, dim * sizeof(float));

    /* Unary accumulator with room for the initial K slots plus K per step */
    int total_cap = K + 10 * K;
    UVec *residual = uv_new(dim, total_cap);
    uv_from_float(residual, embed, K, quantum);

    /* Scratch buffers reused by every step */
    float *delta = (float *)malloc(dim * sizeof(float));
    float *rec = (float *)malloc(dim * sizeof(float));
    UVec *d = uv_new(dim, K);

    printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim);
    printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

    for (int step = 1; step <= n_steps; step++) {
        for (int k = 0; k < dim; k++)
            delta[k] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

        /* Float reference */
        for (int k = 0; k < dim; k++)
            ref[k] += delta[k];

        /* Unary: quantize the delta (fully resets d) and concatenate */
        uv_from_float(d, delta, K, quantum);
        uv_concat(residual, d);

        /* Compare decoded accumulator with the reference */
        uv_to_float(residual, rec, quantum);
        float dot = 0, na = 0, nb = 0, max_err = 0;
        for (int k = 0; k < dim; k++) {
            dot += ref[k] * rec[k];
            na += ref[k] * ref[k];
            nb += rec[k] * rec[k];
            float e = fabsf(ref[k] - rec[k]);
            max_err = (e > max_err) ? e : max_err;
        }
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
        printf("%6d %6d %8.6f %8.4f\n", step, residual->n_slots, cosine, max_err);
    }

    uv_free(d);
    free(delta);
    free(rec);
    uv_free(residual);
    free(embed);
    free(ref);
}
int main() {
printf("================================================\n");
printf(" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n");
printf(" Every bit = 1 quantum. Concat = Add.\n");
printf("================================================\n\n");
test_concat_correct();
test_chain_concat();
test_matmul();
test_residual_chain();
printf("\n=== DONE ===\n");
return 0;
}