unary-quantization-research / concat_unary.c
OpenTransformer's picture
Add files using upload-large-folder tool
19ed98b verified
/*
* CONCATENATIVE UNARY ENGINE
*
* In base-1, the value IS the count of ones.
* Addition = concatenation of bitstreams.
* Multiplication = AND + count.
*
* REPRESENTATION:
* Each element of a vector has:
* - A sign bit (positive/negative)
* - A magnitude = number of 1-bits across K "slots"
*
* But crucially, when we ADD two unary vectors (residual connection),
* we DON'T dequantize-add-requantize. We CONCATENATE the slots.
*
* If vector A has K_a slots and vector B has K_b slots,
* A + B has K_a + K_b slots. The magnitude of element j is
* just the total count of 1-bits at position j across ALL slots.
*
* This means the residual stream GROWS through the network:
* After embed: K_0 slots
* After layer 1: K_0 + K_attn + K_mlp slots
* After layer L: K_0 + L*(K_attn + K_mlp) slots
*
* No information is ever destroyed by requantization.
*
* MATMUL:
* y = W @ x where W has K_w slots and x has K_x slots.
* For each output element y[i]:
* For each slot pair (p from W, q from x):
* count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
* - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
* Output gets K_out = some fixed number of slots (requantized)
* because matmul output magnitude is in a different scale.
*
* SAME-SIGN ADD (residual):
* Just append slots. Zero compute.
* For different signs: need cancellation.
* In practice residual connections are same-sign-dominant,
* so we track sign separately and concat magnitudes,
* deferring cancellation to the next norm.
*
* (c) 2026 OpenTransformers Ltd / Scott Bisset
*/
#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
/* ============================================================
* GROWABLE UNARY VECTOR
*
* The key data structure. Slots can be appended (concat = add).
* Each slot is a bitplane of dim bits packed into uint64 chunks.
*
* sign: uint64[chunks] — per-element sign
* slots: uint64[n_slots * chunks] — each slot is chunks uint64s
* n_slots: current number of slots (grows via concat)
* max_slots: allocated capacity
*
* For element j:
* magnitude = number of slots where bit j is set
* value = sign * magnitude * scale
*
* ============================================================ */
typedef struct {
uint64_t *sign;
uint64_t *slots; /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
float scale; /* per-vector scale factor */
int dim;
int chunks; /* (dim+63)/64 */
int n_slots; /* current slot count */
int max_slots; /* allocated capacity */
} GrowVec;
/* Fixed-size unary matrix (weights don't grow) */
typedef struct {
uint64_t *sign; /* [rows * chunks] */
uint64_t *slots; /* [K * rows * chunks] */
float *scales; /* [rows] per-row scale */
int rows, cols, chunks, K;
} FixedMat;
/* ============================================================
* ALLOCATION
* ============================================================ */
GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec));
v->dim = dim;
v->chunks = (dim + 63) / 64;
v->n_slots = 0;
v->max_slots = max_slots;
v->scale = 1.0f;
v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t));
memset(v->sign, 0, v->chunks * sizeof(uint64_t));
memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
return v;
}
void gv_free(GrowVec *v) {
if (v) { free(v->sign); free(v->slots); free(v); }
}
FixedMat* fm_alloc(int rows, int cols, int K) {
FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat));
m->rows = rows; m->cols = cols; m->K = K;
m->chunks = (cols + 63) / 64;
m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
return m;
}
void fm_free(FixedMat *m) {
if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); }
}
/* ============================================================
* FLOAT → UNARY CONVERSION (only at boundaries)
* ============================================================ */
void gv_from_float(GrowVec *v, const float *x, int K) {
int dim = v->dim, chunks = v->chunks;
v->n_slots = K;
memset(v->sign, 0, chunks * sizeof(uint64_t));
memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));
float amax = 0.0f;
for (int i = 0; i < dim; i++) {
float a = fabsf(x[i]);
if (a > amax) amax = a;
}
if (amax == 0.0f) { v->scale = 1.0f; return; }
v->scale = amax / K;
float inv = K / amax;
for (int i = 0; i < dim; i++) {
int c = i / 64;
uint64_t bit = 1ULL << (i % 64);
if (x[i] < 0.0f) v->sign[c] |= bit;
int mag = (int)(fabsf(x[i]) * inv + 0.5f);
if (mag > K) mag = K;
for (int s = 0; s < mag; s++)
v->slots[(size_t)s * chunks + c] |= bit;
}
}
void gv_to_float(const GrowVec *v, float *out) {
int dim = v->dim, chunks = v->chunks;
for (int i = 0; i < dim; i++) {
int c = i / 64;
uint64_t bit = 1ULL << (i % 64);
int mag = 0;
for (int s = 0; s < v->n_slots; s++) {
if (v->slots[(size_t)s * chunks + c] & bit)
mag++;
}
float val = (float)mag * v->scale;
out[i] = (v->sign[c] & bit) ? -val : val;
}
}
void fm_from_float(FixedMat *m, const float *data) {
int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;
memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));
for (int r = 0; r < rows; r++) {
const float *row = data + (size_t)r * cols;
float amax = 0.0f;
for (int j = 0; j < cols; j++) {
float a = fabsf(row[j]);
if (a > amax) amax = a;
}
if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
m->scales[r] = amax / K;
float inv = K / amax;
uint64_t *rs = m->sign + (size_t)r * chunks;
for (int j = 0; j < cols; j++) {
int c = j / 64;
uint64_t bit = 1ULL << (j % 64);
if (row[j] < 0.0f) rs[c] |= bit;
int mag = (int)(fabsf(row[j]) * inv + 0.5f);
if (mag > K) mag = K;
for (int s = 0; s < mag; s++)
m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
}
}
}
/* ============================================================
* CONCATENATION = ADDITION
*
* gv_concat(dst, src):
* Appends src's slots to dst.
* Same-sign: just append.
* Different-sign: cancel bits (remove from both).
*
* For efficiency with residual connections where scales differ:
* We track a "slot_scales" or use a single scale with normalization.
*
* SIMPLE VERSION: assumes same scale (works after norm).
* ============================================================ */
/* Simple concat: append src slots to dst. Handles sign cancellation. */
void gv_concat(GrowVec *dst, const GrowVec *src) {
int chunks = dst->chunks;
/* For each source slot, process element-wise:
* Where signs agree: copy bit to new dst slot
* Where signs differ: cancel - find a dst slot with that bit set and clear it
*
* Optimization: for most transformer residuals, signs mostly agree.
* So we do the simple thing: compute per-element sign agreement,
* then for agreeing elements just append, for disagreeing elements cancel.
*/
/* Sign agreement mask */
/* agree[c] = ~(dst_sign[c] ^ src_sign[c]) — bits where signs match */
for (int s = 0; s < src->n_slots; s++) {
const uint64_t *src_slot = src->slots + (size_t)s * chunks;
/* Split into agree and disagree portions */
int new_slot = dst->n_slots;
if (new_slot >= dst->max_slots) {
/* Out of room — would need realloc in production */
printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
return;
}
uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;
for (int c = 0; c < chunks; c++) {
uint64_t src_bits = src_slot[c];
uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
uint64_t disagree = dst->sign[c] ^ src->sign[c];
/* Same sign: just append to new slot */
uint64_t to_add = src_bits & agree;
/* Different sign: cancel from existing dst slots */
uint64_t to_cancel = src_bits & disagree;
/* Cancel by walking backwards through dst slots */
for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
uint64_t overlap = *dslot & to_cancel;
*dslot &= ~overlap; /* clear cancelled bits in dst */
to_cancel &= ~overlap; /* mark as cancelled */
}
/* Any remaining to_cancel means src > dst for those elements
* — flip the sign and add to new slot */
if (to_cancel) {
dst->sign[c] ^= to_cancel; /* flip sign for these elements */
to_add |= to_cancel;
}
dst_new[c] = to_add;
}
/* Only increment if new slot is non-empty */
int non_empty = 0;
for (int c = 0; c < chunks && !non_empty; c++)
if (dst_new[c]) non_empty = 1;
if (non_empty)
dst->n_slots++;
}
}
/* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */
void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
int chunks = dst->chunks;
int src_slots = src->n_slots;
if (dst->n_slots + src_slots > dst->max_slots) {
printf("WARNING: GrowVec overflow\n");
src_slots = dst->max_slots - dst->n_slots;
}
/* Just memcpy the slots — handles same-sign correctly,
* defers opposite-sign cancellation to next norm */
memcpy(dst->slots + (size_t)dst->n_slots * chunks,
src->slots,
(size_t)src_slots * chunks * sizeof(uint64_t));
dst->n_slots += src_slots;
}
/* ============================================================
* MATMUL: y = M @ x
*
* M is fixed (K_w slots), x is growable (n_slots slots).
* Output is a NEW GrowVec with K_out slots.
*
* Core: for each output element i, accumulate:
* acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
* - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
*
* Then quantize acc to K_out unary slots.
* ============================================================ */
void gv_matmul(
const FixedMat *M,
const GrowVec *x,
GrowVec *y, /* output — gets filled with K_out slots */
int K_out /* how many output slots */
) {
int out_dim = M->rows;
int chunks = M->chunks;
int wK = M->K;
int xK = x->n_slots;
float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));
#pragma omp parallel for schedule(dynamic, 32)
for (int i = 0; i < out_dim; i++) {
const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
long long acc = 0;
for (int c = 0; c < chunks; c++) {
uint64_t ws = w_sign_row[c];
uint64_t xs = x->sign[c];
uint64_t same = ~(ws ^ xs);
uint64_t diff = ws ^ xs;
for (int p = 0; p < wK; p++) {
uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
for (int q = 0; q < xK; q++) {
uint64_t xq = x->slots[(size_t)q * chunks + c];
uint64_t active = wp & xq;
acc += __builtin_popcountll(active & same)
- __builtin_popcountll(active & diff);
}
}
}
y_float[i] = (float)acc * M->scales[i] * x->scale;
}
/* Quantize to K_out slots */
gv_from_float(y, y_float, K_out);
free(y_float);
}
/* ============================================================
* NORM: GrowVec → GrowVec with controlled slot count
*
* RMSNorm dequantizes (counting), normalizes (float),
* then requantizes to a fixed K.
* This is where slot count gets reset.
* ============================================================ */
void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
int dim = x->dim;
float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
gv_to_float(x, xf);
float ss = 0.0f;
for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
float rms = 1.0f / sqrtf(ss / dim + eps);
for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];
gv_from_float(out, xf, K_out);
free(xf);
}
/* ============================================================
* SILU_MUL: out = SiLU(gate) * up
* Dequant, compute, requant. O(dim).
* ============================================================ */
void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
int dim = gate->dim;
float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
gv_to_float(gate, gf);
gv_to_float(up, uf);
for (int i = 0; i < dim; i++)
gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];
gv_from_float(out, gf, K_out);
free(gf); free(uf);
}
/* ============================================================
* TEST: demonstrate growing residual stream
* ============================================================ */
void test_concat_add() {
printf("=== CONCATENATION = ADDITION TEST ===\n\n");
int dim = 16;
/* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */
float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};
GrowVec *a = gv_alloc(dim, 8, 64);
GrowVec *b = gv_alloc(dim, 8, 64);
gv_from_float(a, a_vals, 8);
gv_from_float(b, b_vals, 8);
printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
float af[16], bf[16];
gv_to_float(a, af);
printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n");
printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n");
printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
gv_to_float(b, bf);
printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n");
printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n");
/* Concatenate (= add) */
printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);
/* Need same scale for concat to work correctly */
/* In a real network, both come from norm so they have comparable scale */
/* For this test, use fast concat (no cancellation) */
gv_concat(a, b);
printf(" -> %d):\n", a->n_slots);
float result[16], ref[16];
gv_to_float(a, result);
for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i];
/* NOTE: concat addition only works correctly when scales match.
* When scales differ, we'd need to adjust. In a transformer,
* the norm before each sublayer ensures comparable scales. */
printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n");
printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n");
gv_free(a); gv_free(b);
}
void test_growing_residual() {
printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
printf("Simulating 6 transformer layers with concat residuals\n\n");
int dim = 2560;
int K_embed = 16; /* initial embedding quantization */
int K_sublayer = 8; /* each sublayer output */
int n_layers = 6;
/* Create random embedding */
float *embed = (float *)malloc(dim * sizeof(float));
srand(42);
for (int i = 0; i < dim; i++) {
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}
/* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */
int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;
GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
gv_from_float(residual, embed, K_embed);
printf("After embedding: %d slots (%.1f KB)\n",
residual->n_slots,
(float)residual->n_slots * residual->chunks * 8 / 1024);
for (int l = 0; l < n_layers; l++) {
/* Simulate attention output */
GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer);
float *fake_attn = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < dim; i++) {
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
}
gv_from_float(attn_out, fake_attn, K_sublayer);
/* Scale must match for concat to work — in real net, norm handles this */
attn_out->scale = residual->scale;
/* RESIDUAL ADD = CONCATENATION */
gv_concat_fast(residual, attn_out);
/* Simulate MLP output */
GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer);
float *fake_mlp = (float *)malloc(dim * sizeof(float));
for (int i = 0; i < dim; i++) {
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
}
gv_from_float(mlp_out, fake_mlp, K_sublayer);
mlp_out->scale = residual->scale;
/* RESIDUAL ADD = CONCATENATION */
gv_concat_fast(residual, mlp_out);
printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
l + 1, residual->n_slots,
(float)residual->n_slots * residual->chunks * 8 / 1024,
K_sublayer, K_sublayer);
gv_free(attn_out); gv_free(mlp_out);
free(fake_attn); free(fake_mlp);
}
printf("\nResidual grew from %d to %d slots through %d layers\n",
K_embed, residual->n_slots, n_layers);
printf("Information accumulated, never lost to requantization\n");
gv_free(residual);
free(embed);
}
void test_matmul_accuracy() {
printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");
int rows = 512, cols = 2560;
int wK = 32;
printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");
srand(42);
float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
float *xf = (float *)malloc(cols * sizeof(float));
float *y_ref = (float *)calloc(rows, sizeof(float));
for (size_t i = 0; i < (size_t)rows * cols; i++) {
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}
for (int i = 0; i < cols; i++) {
float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
FixedMat *M = fm_alloc(rows, cols, wK);
fm_from_float(M, Mf);
/* Test with different x slot counts (simulating growing residual) */
int x_slots[] = {8, 16, 32, 48, 64, 96};
for (int t = 0; t < 6; t++) {
int xK = x_slots[t];
GrowVec *x = gv_alloc(cols, xK, xK);
GrowVec *y = gv_alloc(rows, xK, xK);
gv_from_float(x, xf, xK);
struct timespec t0, t1;
float *yf = (float *)malloc(rows * sizeof(float));
clock_gettime(CLOCK_MONOTONIC, &t0);
gv_matmul(M, x, y, xK);
clock_gettime(CLOCK_MONOTONIC, &t1);
double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;
gv_to_float(y, yf);
float dot = 0, na = 0, nb = 0, noise = 0;
for (int i = 0; i < rows; i++) {
dot += y_ref[i] * yf[i];
na += y_ref[i] * y_ref[i];
nb += yf[i] * yf[i];
float e = y_ref[i] - yf[i];
noise += e * e;
}
float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
float snr = 10.0f * log10f(na / (noise + 1e-10f));
printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);
gv_free(x); gv_free(y); free(yf);
}
fm_free(M);
free(Mf); free(xf); free(y_ref);
}
int main() {
printf("========================================\n");
printf(" CONCATENATIVE UNARY ENGINE TESTS\n");
printf(" Addition = Concatenation\n");
printf(" Value = Count of Ones\n");
printf("========================================\n");
test_concat_add();
test_growing_residual();
test_matmul_accuracy();
printf("\n=== ALL TESTS DONE ===\n");
return 0;
}