/*
 * Hosting-page residue captured together with this file (not part of the program):
 *   Add files using upload-large-folder tool
 *   19ed98b verified 14 days ago
 *   22 kB
 */

	/*
	* CONCATENATIVE UNARY ENGINE
	*
	* In base-1, the value IS the count of ones.
	* Addition = concatenation of bitstreams.
	* Multiplication = AND + count.
	*
	* REPRESENTATION:
	* Each element of a vector has:
	* - A sign bit (positive/negative)
	* - A magnitude = number of 1-bits across K "slots"
	*
	* But crucially, when we ADD two unary vectors (residual connection),
	* we DON'T dequantize-add-requantize. We CONCATENATE the slots.
	*
	* If vector A has K_a slots and vector B has K_b slots,
	* A + B has K_a + K_b slots. The magnitude of element j is
	* just the total count of 1-bits at position j across ALL slots.
	*
	* This means the residual stream GROWS through the network:
	* After embed: K_0 slots
	* After layer 1: K_0 + K_attn + K_mlp slots
	* After layer L: K_0 + L*(K_attn + K_mlp) slots
	*
	* No information is ever destroyed by requantization.
	*
	* MATMUL:
	* y = W @ x where W has K_w slots and x has K_x slots.
	* For each output element y[i]:
	* For each slot pair (p from W, q from x):
	* count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
	* - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
	* Output gets K_out = some fixed number of slots (requantized)
	* because matmul output magnitude is in a different scale.
	*
	* SAME-SIGN ADD (residual):
	* Just append slots. Zero compute.
	* For different signs: need cancellation.
	* In practice residual connections are same-sign-dominant,
	* so we track sign separately and concat magnitudes,
	* deferring cancellation to the next norm.
	*
	* (c) 2026 OpenTransformers Ltd / Scott Bisset
	*/

	#define _POSIX_C_SOURCE 199309L
	#include <immintrin.h>
	#include <omp.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <stdio.h>
	#include <time.h>

	/* ============================================================
	* GROWABLE UNARY VECTOR
	*
	* The key data structure. Slots can be appended (concat = add).
	* Each slot is a bitplane of dim bits packed into uint64 chunks.
	*
	* sign: uint64[chunks] — per-element sign
	* slots: uint64[n_slots * chunks] — each slot is chunks uint64s
	* n_slots: current number of slots (grows via concat)
	* max_slots: allocated capacity
	*
	* For element j:
	* magnitude = number of slots where bit j is set
	* value = sign * magnitude * scale
	*
	* ============================================================ */
	typedef struct {
	    uint64_t *sign;   /* [chunks] per-element sign bitmap; a set bit marks a negative element */
	    uint64_t *slots;  /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
	    float scale;      /* per-vector scale factor */
	    int dim;          /* number of elements */
	    int chunks;       /* (dim+63)/64 */
	    int n_slots;      /* current slot count */
	    int max_slots;    /* allocated capacity */
	} GrowVec;

	/* Fixed-size unary matrix (weights don't grow) */
	typedef struct {
	    uint64_t *sign;   /* [rows * chunks] per-element sign bitmap, one row after another */
	    uint64_t *slots;  /* [K * rows * chunks] slot-major: slot s, row r starts at (s*rows + r)*chunks */
	    float *scales;    /* [rows] per-row scale */
	    int rows, cols, chunks, K;
	} FixedMat;

	/* ============================================================
	* ALLOCATION
	* ============================================================ */
	GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
	GrowVec v = (GrowVec )calloc(1, sizeof(GrowVec));
	v->dim = dim;
	v->chunks = (dim + 63) / 64;
	v->n_slots = 0;
	v->max_slots = max_slots;
	v->scale = 1.0f;
	v->sign = (uint64_t )aligned_alloc(64, v->chunks sizeof(uint64_t));
	v->slots = (uint64_t )aligned_alloc(64, (size_t)max_slots v->chunks * sizeof(uint64_t));
	memset(v->sign, 0, v->chunks * sizeof(uint64_t));
	memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
	return v;
	}

	/* Release a GrowVec together with its sign and slot buffers. NULL is ignored. */
	void gv_free(GrowVec *v) {
	    if (v == NULL)
	        return;

	    free(v->sign);
	    free(v->slots);
	    free(v);
	}

	FixedMat* fm_alloc(int rows, int cols, int K) {
	FixedMat m = (FixedMat )calloc(1, sizeof(FixedMat));
	m->rows = rows; m->cols = cols; m->K = K;
	m->chunks = (cols + 63) / 64;
	m->sign = (uint64_t )aligned_alloc(64, (size_t)rows m->chunks * sizeof(uint64_t));
	m->slots = (uint64_t )aligned_alloc(64, (size_t)K rows * m->chunks * sizeof(uint64_t));
	m->scales = (float )aligned_alloc(64, rows sizeof(float));
	memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
	memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
	return m;
	}

	/* Release a FixedMat together with its sign, slot and scale buffers. NULL is ignored. */
	void fm_free(FixedMat *m) {
	    if (m == NULL)
	        return;

	    free(m->sign);
	    free(m->slots);
	    free(m->scales);
	    free(m);
	}

	/* ============================================================
	* FLOAT → UNARY CONVERSION (only at boundaries)
	* ============================================================ */
	/*
	 * Quantize v->dim floats from `x` into `v` using K unary slots.
	 *
	 * The scale is max|x| / K. Element i gets magnitude round(|x[i]| / scale),
	 * clamped to K, stored as set bits in slots 0..mag-1; its sign bit is set
	 * when x[i] < 0. Any previous contents of the first K slots are discarded
	 * and n_slots becomes K.
	 *
	 * K is clamped to v->max_slots (with a warning) so the slot buffer is never
	 * overrun; K <= 0 yields an empty vector with scale 1.0f.
	 * An all-zero input also yields scale 1.0f.
	 *
	 * NOTE(review): non-finite inputs (NaN/Inf) are not handled here.
	 */
	void gv_from_float(GrowVec *v, const float *x, int K) {
	    int dim = v->dim, chunks = v->chunks;

	    if (K > v->max_slots) {
	        fprintf(stderr, "WARNING: gv_from_float K=%d exceeds capacity %d, clamping\n",
	                K, v->max_slots);
	        K = v->max_slots;
	    }
	    if (K < 0) K = 0;

	    v->n_slots = K;
	    v->scale = 1.0f;
	    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
	    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));
	    if (K == 0) return;

	    float amax = 0.0f;
	    for (int i = 0; i < dim; i++) {
	        float a = fabsf(x[i]);
	        if (a > amax) amax = a;
	    }
	    if (amax == 0.0f) return;   /* all zeros: keep scale 1.0f, no bits set */

	    v->scale = amax / K;
	    float inv = K / amax;

	    for (int i = 0; i < dim; i++) {
	        int c = i / 64;
	        uint64_t bit = 1ULL << (i % 64);

	        if (x[i] < 0.0f) v->sign[c] |= bit;

	        int mag = (int)(fabsf(x[i]) * inv + 0.5f);
	        if (mag > K) mag = K;
	        for (int s = 0; s < mag; s++)
	            v->slots[(size_t)s * chunks + c] |= bit;
	    }
	}

	void gv_to_float(const GrowVec v, float out) {
	int dim = v->dim, chunks = v->chunks;

	for (int i = 0; i < dim; i++) {
	int c = i / 64;
	uint64_t bit = 1ULL << (i % 64);

	int mag = 0;
	for (int s = 0; s < v->n_slots; s++) {
	if (v->slots[(size_t)s * chunks + c] & bit)
	mag++;
	}

	float val = (float)mag * v->scale;
	out[i] = (v->sign[c] & bit) ? -val : val;
	}
	}

	/*
	 * Quantize a row-major rows x cols float matrix `data` into `m`.
	 *
	 * Each row is quantized independently: scales[r] = max|row| / K, and
	 * element (r, j) gets magnitude round(|value| / scales[r]) clamped to K,
	 * stored as set bits in slots 0..mag-1 of that row. A sign bit is set for
	 * negative entries. An all-zero row gets scale 1.0f and no bits.
	 */
	void fm_from_float(FixedMat *m, const float *data) {
	    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

	    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
	    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

	    for (int r = 0; r < rows; r++) {
	        const float *row = data + (size_t)r * cols;
	        float amax = 0.0f;
	        for (int j = 0; j < cols; j++) {
	            float a = fabsf(row[j]);
	            if (a > amax) amax = a;
	        }
	        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
	        m->scales[r] = amax / K;
	        float inv = K / amax;

	        uint64_t *rs = m->sign + (size_t)r * chunks;
	        for (int j = 0; j < cols; j++) {
	            int c = j / 64;
	            uint64_t bit = 1ULL << (j % 64);
	            if (row[j] < 0.0f) rs[c] |= bit;

	            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
	            if (mag > K) mag = K;
	            /* slot-major layout: slot s of row r lives at (s*rows + r)*chunks */
	            for (int s = 0; s < mag; s++)
	                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
	        }
	    }
	}

	/* ============================================================
	* CONCATENATION = ADDITION
	*
	* gv_concat(dst, src):
	* Appends src's slots to dst.
	* Same-sign: just append.
	* Different-sign: cancel bits (remove from both).
	*
	* For efficiency with residual connections where scales differ:
	* We track a "slot_scales" or use a single scale with normalization.
	*
	* SIMPLE VERSION: assumes same scale (works after norm).
	* ============================================================ */

	/*
	 * Simple concat: append src slots to dst, with sign cancellation.
	 *
	 * Each source slot is processed in turn:
	 *   - where the dst and src sign bits agree, the source bit is written to a
	 *     new slot at the end of dst;
	 *   - where they differ, the source bit cancels one set bit in an existing
	 *     dst slot (searched from the newest slot backwards);
	 *   - if no dst bit is left to cancel, the dst sign is flipped for that
	 *     element and the bit is written to the new slot instead.
	 * The new slot only counts (n_slots++) if at least one bit was written.
	 *
	 * The scales of dst and src are not consulted: the result is only a true
	 * sum when both vectors use the same scale.
	 *
	 * If dst runs out of slots, a warning is printed and the remaining source
	 * slots are dropped. Vectors with different chunk counts are rejected.
	 */
	void gv_concat(GrowVec *dst, const GrowVec *src) {
	    int chunks = dst->chunks;

	    if (src->chunks != chunks) {
	        fprintf(stderr, "WARNING: gv_concat dimension mismatch (%d vs %d chunks)\n",
	                chunks, src->chunks);
	        return;
	    }

	    for (int s = 0; s < src->n_slots; s++) {
	        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

	        int new_slot = dst->n_slots;
	        if (new_slot >= dst->max_slots) {
	            /* Out of room — would need realloc in production */
	            printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
	            return;
	        }
	        uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;

	        uint64_t any_added = 0;
	        for (int c = 0; c < chunks; c++) {
	            uint64_t src_bits = src_slot[c];
	            uint64_t disagree = dst->sign[c] ^ src->sign[c];

	            /* Same sign: goes straight into the new slot */
	            uint64_t to_add = src_bits & ~disagree;

	            /* Different sign: cancel against existing dst slots, newest first */
	            uint64_t to_cancel = src_bits & disagree;
	            for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
	                uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
	                uint64_t overlap = *dslot & to_cancel;
	                *dslot &= ~overlap;     /* clear cancelled bits in dst */
	                to_cancel &= ~overlap;  /* mark as cancelled */
	            }

	            /* Anything still uncancelled means dst had no magnitude left for
	             * those elements: flip their sign and add the bit to the new slot. */
	            if (to_cancel) {
	                dst->sign[c] ^= to_cancel;
	                to_add |= to_cancel;
	            }

	            dst_new[c] = to_add;
	            any_added |= to_add;
	        }

	        /* Only keep the new slot if it is non-empty */
	        if (any_added)
	            dst->n_slots++;
	    }
	}

	/*
	 * Fast concat for the SAME SCALE, SAME SIGN pattern.
	 *
	 * Copies src's slots verbatim onto the end of dst. The sign bitmap of src is
	 * not read, so every appended bit takes the sign dst already has for that
	 * element; the scales are not consulted either.
	 *
	 * If dst lacks room, a warning is printed and only as many slots as fit are
	 * copied. Vectors with different chunk counts are rejected.
	 */
	void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
	    int chunks = dst->chunks;
	    int src_slots = src->n_slots;

	    if (src->chunks != chunks) {
	        fprintf(stderr, "WARNING: gv_concat_fast dimension mismatch (%d vs %d chunks)\n",
	                chunks, src->chunks);
	        return;
	    }

	    if (dst->n_slots + src_slots > dst->max_slots) {
	        printf("WARNING: GrowVec overflow\n");
	        src_slots = dst->max_slots - dst->n_slots;
	    }
	    if (src_slots <= 0)
	        return;

	    memcpy(dst->slots + (size_t)dst->n_slots * chunks,
	           src->slots,
	           (size_t)src_slots * chunks * sizeof(uint64_t));
	    dst->n_slots += src_slots;
	}

	/* ============================================================
	* MATMUL: y = M @ x
	*
	* M is fixed (K_w slots), x is growable (n_slots slots).
	* Output is a NEW GrowVec with K_out slots.
	*
	* Core: for each output element i, accumulate:
	* acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
	* - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
	*
	* Then quantize acc to K_out unary slots.
	* ============================================================ */
	/*
	 * y = M @ x, computed on the bit planes.
	 *
	 * For each output row i, every (weight slot, input slot) pair contributes
	 * popcount(w & x & same_sign) - popcount(w & x & diff_sign). The integer
	 * total is scaled by M->scales[i] * x->scale, and the resulting float
	 * vector is quantized into y with K_out slots via gv_from_float().
	 *
	 * The call is rejected (warning, y untouched) when x does not match M's
	 * column chunk count, y->dim != M->rows, or the scratch buffer cannot be
	 * allocated. K_out is clamped to y->max_slots.
	 */
	void gv_matmul(
	    const FixedMat *M,
	    const GrowVec *x,
	    GrowVec *y,    /* output - gets filled with K_out slots */
	    int K_out      /* how many output slots */
	) {
	    int out_dim = M->rows;
	    int chunks = M->chunks;
	    int wK = M->K;
	    int xK = x->n_slots;

	    if (out_dim <= 0 || x->chunks != chunks || y->dim != out_dim) {
	        fprintf(stderr, "WARNING: gv_matmul shape mismatch\n");
	        return;
	    }
	    if (K_out > y->max_slots)
	        K_out = y->max_slots;   /* never quantize past y's slot capacity */

	    /* aligned_alloc needs a size that is a multiple of the alignment */
	    size_t bytes = ((size_t)out_dim * sizeof(float) + 63) / 64 * 64;
	    float *y_float = aligned_alloc(64, bytes);
	    if (!y_float) {
	        fprintf(stderr, "WARNING: gv_matmul out of memory\n");
	        return;
	    }

	    #pragma omp parallel for schedule(dynamic, 32)
	    for (int i = 0; i < out_dim; i++) {
	        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
	        long long acc = 0;

	        for (int c = 0; c < chunks; c++) {
	            uint64_t diff = w_sign_row[c] ^ x->sign[c];
	            uint64_t same = ~diff;

	            for (int p = 0; p < wK; p++) {
	                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];

	                for (int q = 0; q < xK; q++) {
	                    uint64_t xq = x->slots[(size_t)q * chunks + c];
	                    uint64_t active = wp & xq;
	                    acc += __builtin_popcountll(active & same)
	                         - __builtin_popcountll(active & diff);
	                }
	            }
	        }

	        y_float[i] = (float)acc * M->scales[i] * x->scale;
	    }

	    /* Quantize to K_out slots */
	    gv_from_float(y, y_float, K_out);
	    free(y_float);
	}

	/* ============================================================
	* NORM: GrowVec → GrowVec with controlled slot count
	*
	* RMSNorm dequantizes (counting), normalizes (float),
	* then requantizes to a fixed K.
	* This is where slot count gets reset.
	* ============================================================ */
	/*
	 * RMSNorm: out = quantize( x / sqrt(mean(x^2) + eps) * weight, K_out ).
	 *
	 * x is dequantized with gv_to_float(), normalized in float, multiplied
	 * element-wise by `weight` (x->dim floats), and requantized into `out`
	 * with gv_from_float().
	 *
	 * The call is rejected (warning, out untouched) when the dimensions of x and
	 * out differ or the scratch buffer cannot be allocated.
	 */
	void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
	    int dim = x->dim;
	    if (dim <= 0 || out->dim != dim) {
	        fprintf(stderr, "WARNING: gv_rmsnorm dimension mismatch\n");
	        return;
	    }

	    /* aligned_alloc needs a size that is a multiple of the alignment */
	    size_t bytes = ((size_t)dim * sizeof(float) + 63) / 64 * 64;
	    float *xf = aligned_alloc(64, bytes);
	    if (!xf) {
	        fprintf(stderr, "WARNING: gv_rmsnorm out of memory\n");
	        return;
	    }
	    gv_to_float(x, xf);

	    float ss = 0.0f;
	    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
	    float inv_rms = 1.0f / sqrtf(ss / dim + eps);
	    for (int i = 0; i < dim; i++) xf[i] *= inv_rms * weight[i];

	    gv_from_float(out, xf, K_out);
	    free(xf);
	}

	/* ============================================================
	* SILU_MUL: out = SiLU(gate) * up
	* Dequant, compute, requant. O(dim).
	* ============================================================ */
	/*
	 * out = quantize( SiLU(gate) * up, K_out ), where SiLU(g) = g / (1 + exp(-g)).
	 *
	 * Both inputs are dequantized with gv_to_float(), combined element-wise in
	 * float, and requantized into `out` with gv_from_float().
	 *
	 * The call is rejected (warning, out untouched) when gate, up and out do not
	 * share the same dimension or a scratch buffer cannot be allocated.
	 */
	void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
	    int dim = gate->dim;
	    if (dim <= 0 || up->dim != dim || out->dim != dim) {
	        fprintf(stderr, "WARNING: gv_silu_mul dimension mismatch\n");
	        return;
	    }

	    /* aligned_alloc needs a size that is a multiple of the alignment */
	    size_t bytes = ((size_t)dim * sizeof(float) + 63) / 64 * 64;
	    float *gf = aligned_alloc(64, bytes);
	    float *uf = aligned_alloc(64, bytes);
	    if (!gf || !uf) {
	        fprintf(stderr, "WARNING: gv_silu_mul out of memory\n");
	        free(gf);
	        free(uf);
	        return;
	    }
	    gv_to_float(gate, gf);
	    gv_to_float(up, uf);

	    for (int i = 0; i < dim; i++)
	        gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];

	    gv_from_float(out, gf, K_out);
	    free(gf);
	    free(uf);
	}

	/* ============================================================
	* TEST: demonstrate growing residual stream
	* ============================================================ */
	/*
	 * Demo: adding two small integer vectors by slot concatenation.
	 *
	 * Concatenation only equals addition when both operands share one scale.
	 * gv_from_float() sets scale = max|x| / K, so A (max 7) is quantized with
	 * K=7 and B (max 5) with K=5: both end up at scale 1.0 and the integer
	 * inputs are represented exactly. gv_concat() (the variant that handles
	 * opposite signs) is used because A and B disagree in sign on several
	 * elements.
	 */
	void test_concat_add(void) {
	    printf("=== CONCATENATION = ADDITION TEST ===\n\n");

	    enum { DIM = 16, SHOWN = 8 };

	    float a_vals[DIM] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
	    float b_vals[DIM] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};
	    const int K_a = 7;   /* == max|a_vals| -> scale 1.0 */
	    const int K_b = 5;   /* == max|b_vals| -> scale 1.0 */

	    GrowVec *a = gv_alloc(DIM, K_a, 64);
	    GrowVec *b = gv_alloc(DIM, K_b, 64);
	    if (!a || !b) {
	        printf("allocation failed\n");
	        gv_free(a);
	        gv_free(b);
	        return;
	    }
	    gv_from_float(a, a_vals, K_a);
	    gv_from_float(b, b_vals, K_b);

	    float af[DIM], bf[DIM];

	    printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
	    gv_to_float(a, af);
	    printf(" Original: "); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", a_vals[i]); printf("\n");
	    printf(" Recovered:"); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", af[i]); printf("\n");

	    printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
	    gv_to_float(b, bf);
	    printf(" Original: "); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", b_vals[i]); printf("\n");
	    printf(" Recovered:"); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", bf[i]); printf("\n");

	    /* Concatenate (= add) */
	    printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);
	    gv_concat(a, b);
	    printf(" -> %d):\n", a->n_slots);

	    float result[DIM], ref[DIM];
	    gv_to_float(a, result);

	    float max_err = 0.0f;
	    for (int i = 0; i < DIM; i++) {
	        ref[i] = a_vals[i] + b_vals[i];
	        float err = fabsf(ref[i] - result[i]);
	        if (err > max_err) max_err = err;
	    }

	    printf(" Float A+B: "); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", ref[i]); printf("\n");
	    printf(" Concat A+B: "); for (int i = 0; i < SHOWN; i++) printf("%6.2f ", result[i]); printf("\n");
	    printf(" Max abs error over %d elements: %.3f\n", (int)DIM, max_err);

	    gv_free(a);
	    gv_free(b);
	}

	/*
	 * Fill dst[0..n-1] with Box-Muller samples multiplied by `gain`.
	 * The uniform draws are formed in double so that rand() == RAND_MAX cannot
	 * overflow a signed int (rand() + 1 would).
	 */
	static void tgr_fill_gaussian(float *dst, int n, float gain) {
	    for (int i = 0; i < n; i++) {
	        float u1 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
	        float u2 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
	        dst[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * gain;
	    }
	}

	/*
	 * Demo: the residual stream's slot count grows as sublayer outputs are
	 * appended with gv_concat_fast(). Only slot growth and memory use are
	 * reported; the sublayer outputs are random and their scale is simply
	 * overwritten with the residual's scale, so the numeric sum is not checked.
	 */
	void test_growing_residual(void) {
	    printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
	    printf("Simulating 6 transformer layers with concat residuals\n\n");

	    int dim = 2560;
	    int K_embed = 16;    /* initial embedding quantization */
	    int K_sublayer = 8;  /* each sublayer output */
	    int n_layers = 6;

	    /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer), plus slack */
	    int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;

	    float *scratch = malloc((size_t)dim * sizeof *scratch);
	    GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
	    GrowVec *sublayer = gv_alloc(dim, K_sublayer, K_sublayer);
	    if (!scratch || !residual || !sublayer) {
	        printf("allocation failed\n");
	        goto done;
	    }

	    /* Create random embedding */
	    srand(42);
	    tgr_fill_gaussian(scratch, dim, 1.0f);
	    gv_from_float(residual, scratch, K_embed);

	    printf("After embedding: %d slots (%.1f KB)\n",
	           residual->n_slots,
	           (float)residual->n_slots * residual->chunks * 8 / 1024);

	    for (int l = 0; l < n_layers; l++) {
	        /* Two sublayers per layer: simulated attention output, then simulated MLP output */
	        for (int part = 0; part < 2; part++) {
	            tgr_fill_gaussian(scratch, dim, 0.1f);
	            gv_from_float(sublayer, scratch, K_sublayer);
	            /* Scale must match for concat to work — in real net, norm handles this */
	            sublayer->scale = residual->scale;

	            /* RESIDUAL ADD = CONCATENATION */
	            gv_concat_fast(residual, sublayer);
	        }

	        printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
	               l + 1, residual->n_slots,
	               (float)residual->n_slots * residual->chunks * 8 / 1024,
	               K_sublayer, K_sublayer);
	    }

	    printf("\nResidual grew from %d to %d slots through %d layers\n",
	           K_embed, residual->n_slots, n_layers);
	    printf("Information accumulated, never lost to requantization\n");

	done:
	    gv_free(sublayer);
	    gv_free(residual);
	    free(scratch);
	}

	/*
	 * Fill dst[0..n-1] with Box-Muller samples.
	 * The uniform draws are formed in double so that rand() == RAND_MAX cannot
	 * overflow a signed int (rand() + 1 would).
	 */
	static void tma_fill_gaussian(float *dst, size_t n) {
	    for (size_t i = 0; i < n; i++) {
	        float u1 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
	        float u2 = (float)(((double)rand() + 1.0) / ((double)RAND_MAX + 1.0));
	        dst[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
	    }
	}

	/*
	 * Benchmark/accuracy demo: compares gv_matmul() against a float reference
	 * for a random 512x2560 matrix while the input vector is quantized with an
	 * increasing number of slots. Prints cosine similarity, SNR (dB) and the
	 * wall-clock time of each matmul.
	 */
	void test_matmul_accuracy(void) {
	    printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");

	    int rows = 512, cols = 2560;
	    int wK = 32;

	    printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
	    printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");

	    /* x slot counts to try (simulating a growing residual) */
	    static const int x_slots[] = {8, 16, 32, 48, 64, 96};
	    const int n_cases = (int)(sizeof x_slots / sizeof x_slots[0]);

	    float *Mf = malloc((size_t)rows * cols * sizeof *Mf);
	    float *xf = malloc((size_t)cols * sizeof *xf);
	    float *y_ref = calloc((size_t)rows, sizeof *y_ref);
	    float *yf = malloc((size_t)rows * sizeof *yf);
	    FixedMat *M = fm_alloc(rows, cols, wK);
	    if (!Mf || !xf || !y_ref || !yf || !M) {
	        printf("allocation failed\n");
	        goto done;
	    }

	    srand(42);
	    tma_fill_gaussian(Mf, (size_t)rows * cols);
	    tma_fill_gaussian(xf, (size_t)cols);

	    /* Float reference y_ref = Mf @ xf */
	    for (int i = 0; i < rows; i++)
	        for (int j = 0; j < cols; j++)
	            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

	    fm_from_float(M, Mf);

	    for (int t = 0; t < n_cases; t++) {
	        int xK = x_slots[t];
	        GrowVec *x = gv_alloc(cols, xK, xK);
	        GrowVec *y = gv_alloc(rows, xK, xK);
	        if (!x || !y) {
	            printf("allocation failed\n");
	            gv_free(x);
	            gv_free(y);
	            break;
	        }
	        gv_from_float(x, xf, xK);

	        struct timespec t0, t1;
	        clock_gettime(CLOCK_MONOTONIC, &t0);
	        gv_matmul(M, x, y, xK);
	        clock_gettime(CLOCK_MONOTONIC, &t1);
	        double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

	        gv_to_float(y, yf);

	        float dot = 0, na = 0, nb = 0, noise = 0;
	        for (int i = 0; i < rows; i++) {
	            dot += y_ref[i] * yf[i];
	            na += y_ref[i] * y_ref[i];
	            nb += yf[i] * yf[i];
	            float e = y_ref[i] - yf[i];
	            noise += e * e;
	        }
	        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
	        float snr = 10.0f * log10f(na / (noise + 1e-10f));

	        printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);

	        gv_free(x);
	        gv_free(y);
	    }

	done:
	    fm_free(M);
	    free(yf);
	    free(y_ref);
	    free(xf);
	    free(Mf);
	}

	int main() {
	printf("========================================\n");
	printf(" CONCATENATIVE UNARY ENGINE TESTS\n");
	printf(" Addition = Concatenation\n");
	printf(" Value = Count of Ones\n");
	printf("========================================\n");

	test_concat_add();
	test_growing_residual();
	test_matmul_accuracy();

	printf("\n=== ALL TESTS DONE ===\n");
	return 0;
	}