/*
 * Add files using upload-large-folder tool
 *
 * 19ed98b verified 14 days ago
 *
 * 18.9 kB
 */

	/*
	* PROPER UNARY — ONE QUANTUM, NO SCALES
	*
	* Every single bit in the entire system has weight = 1 quantum.
	* The quantum is set ONCE for the whole model.
	* There are NO per-vector scales. NO per-row scales.
	*
	* The number 5.0 with quantum=0.1 is stored as 50 ones.
	* The number 5.0 with quantum=0.01 is stored as 500 ones.
	* More precision = more bits. That's the tradeoff.
	*
	* ADDITION = CONCATENATION. Always. No exceptions.
	* Because every bit everywhere means the same thing.
	*
	* MATMUL: y[i] = sum_j W[i][j] * x[j]
	* = sum over all (w_slot, x_slot) pairs:
	* popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
	* - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
	* = quantum² * integer_count
	*
	* Output quantum = input_quantum² (magnitude grows)
	* Or we pick output quantum = input_quantum and accept
	* that the integer count includes the scaling.
	*
	* (c) 2026 OpenTransformers Ltd / Scott Bisset
	*/

	#define _POSIX_C_SOURCE 199309L
	#include <immintrin.h>
	#include <omp.h>
	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <math.h>
	#include <stdio.h>
	#include <time.h>

	/* ============================================================
	* PROPER UNARY VECTOR
	* Every bit = 1 quantum. No local scale.
	* ============================================================ */
	/*
	 * Sign-magnitude unary vector.
	 *
	 * Elements are packed 64 per uint64_t word ("chunk"). Element i lives at
	 * bit (i % 64) of chunk (i / 64). Its magnitude is the number of slots in
	 * which that bit is set; its sign is the matching bit in `sign`.
	 */
	typedef struct {
		uint64_t *sign;  /* [chunks] bit set => element is negative */
		uint64_t *slots; /* [cap * chunks] allocated; slot s starts at slots + s * chunks,
		                    and only the first n_slots slots are in use */
		int dim;         /* number of elements */
		int chunks;      /* 64-bit words per slot: ceil(dim / 64) */
		int n_slots;     /* slots currently in use */
		int cap;         /* max slots allocated */
	} UVec;

	/*
	 * Proper unary matrix — same quantum as vectors.
	 *
	 * Each row is packed like a UVec: column j is bit (j % 64) of chunk
	 * (j / 64). Storage is slot-major: the words for slot s of row r start at
	 * slots + (s * rows + r) * chunks.
	 */
	typedef struct {
		uint64_t *sign;  /* [rows * chunks] bit set => weight is negative */
		uint64_t *slots; /* [K * rows * chunks] */
		int rows, cols, chunks, K;
	} UMat;

	/* System-wide quantum shared by every vector and matrix */
	typedef struct {
		/* Value carried by one set bit; a matmul product is in units of quantum² */
		float quantum;
	} USystem;

	/* ============================================================
	* ALLOC
	* ============================================================ */
	UVec* uv_new(int dim, int cap) {
	UVec v = (UVec )calloc(1, sizeof(UVec));
	v->dim = dim;
	v->chunks = (dim + 63) / 64;
	v->n_slots = 0;
	v->cap = cap;
	v->sign = (uint64_t )aligned_alloc(64, v->chunks sizeof(uint64_t));
	v->slots = (uint64_t )aligned_alloc(64, (size_t)cap v->chunks * sizeof(uint64_t));
	memset(v->sign, 0, v->chunks * sizeof(uint64_t));
	memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t));
	return v;
	}

	UMat* um_new(int rows, int cols, int K) {
	UMat m = (UMat )calloc(1, sizeof(UMat));
	m->rows = rows; m->cols = cols; m->K = K;
	m->chunks = (cols + 63) / 64;
	m->sign = (uint64_t )aligned_alloc(64, (size_t)rows m->chunks * sizeof(uint64_t));
	m->slots = (uint64_t )aligned_alloc(64, (size_t)K rows * m->chunks * sizeof(uint64_t));
	memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
	memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
	return m;
	}

	/* Release a vector together with its sign and slot arrays; NULL is ignored. */
	void uv_free(UVec *v) {
		if (v == NULL)
			return;
		free(v->sign);
		free(v->slots);
		free(v);
	}
	/* Release a matrix together with its sign and slot arrays; NULL is ignored. */
	void um_free(UMat *m) {
		if (m == NULL)
			return;
		free(m->sign);
		free(m->slots);
		free(m);
	}

	/* ============================================================
	* QUANTIZE: float → proper unary
	*
	* Given global quantum q:
	* magnitude = round(|value| / q)
	* That many slots get bit set.
	*
	* NO per-vector absmax. NO local scale.
	* Values that exceed K are clipped.
	* ============================================================ */
	/*
	 * Quantize v->dim floats into `v` using the global quantum.
	 *
	 * v       - destination; its previous contents in the first K slots are discarded
	 * x       - v->dim input values
	 * K       - number of slots to use; limited to the range [0, v->cap] so the
	 *           slot buffer is never overrun
	 * quantum - value of one set bit
	 *
	 * Element i gets round(|x[i]| / quantum) bits, clipped to K, in slots
	 * 0..mag-1. Its sign bit is set when x[i] < 0. v->n_slots becomes K.
	 */
	void uv_from_float(UVec *v, const float *x, int K, float quantum) {
		int dim = v->dim, chunks = v->chunks;

		/* Never touch more slots than were allocated */
		if (K > v->cap) K = v->cap;
		if (K < 0) K = 0;
		v->n_slots = K;

		memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
		memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

		float inv_q = 1.0f / quantum;
		for (int i = 0; i < dim; i++) {
			int c = i / 64;
			uint64_t bit = 1ULL << (i % 64);

			if (x[i] < 0.0f) v->sign[c] |= bit;

			/* Clip while still in float: converting an out-of-range, infinite
			 * or NaN float to int is undefined behaviour. */
			float scaled = fabsf(x[i]) * inv_q + 0.5f;
			int mag;
			if (scaled >= (float)K)
				mag = K;           /* clip */
			else if (scaled > 0.0f)
				mag = (int)scaled; /* round to nearest */
			else
				mag = 0;           /* non-positive or NaN */

			for (int s = 0; s < mag; s++)
				v->slots[(size_t)s * chunks + c] |= bit;
		}
	}

	void uv_to_float(const UVec v, float out, float quantum) {
	int dim = v->dim, chunks = v->chunks;

	for (int i = 0; i < dim; i++) {
	int c = i / 64;
	uint64_t bit = 1ULL << (i % 64);

	int mag = 0;
	for (int s = 0; s < v->n_slots; s++)
	if (v->slots[(size_t)s * chunks + c] & bit)
	mag++;

	out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum;
	}
	}

	/*
	 * Quantize a row-major float matrix into `m` using the global quantum.
	 *
	 * m       - destination; its previous contents are discarded
	 * data    - m->rows * m->cols input values, row-major
	 * quantum - value of one set bit
	 *
	 * Weight (r, j) gets round(|w| / quantum) bits, clipped to m->K, in slots
	 * 0..mag-1. Its sign bit is set when the weight is negative.
	 */
	void um_from_float(UMat *m, const float *data, float quantum) {
		int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

		memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
		memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

		float inv_q = 1.0f / quantum;
		for (int r = 0; r < rows; r++) {
			const float *row = data + (size_t)r * cols;
			uint64_t *rs = m->sign + (size_t)r * chunks;

			for (int j = 0; j < cols; j++) {
				int c = j / 64;
				uint64_t bit = 1ULL << (j % 64);
				if (row[j] < 0.0f) rs[c] |= bit;

				/* Clip while still in float: converting an out-of-range,
				 * infinite or NaN float to int is undefined behaviour. */
				float scaled = fabsf(row[j]) * inv_q + 0.5f;
				int mag;
				if (scaled >= (float)K)
					mag = K;           /* clip */
				else if (scaled > 0.0f)
					mag = (int)scaled; /* round to nearest */
				else
					mag = 0;           /* non-positive or NaN */

				for (int s = 0; s < mag; s++)
					m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
			}
		}
	}

	/* ============================================================
	* CONCATENATION = ADDITION
	*
	* Since every bit everywhere = same quantum,
	* appending slots IS adding magnitudes. Period.
	*
	* Sign handling: for elements where signs differ,
	* cancel bits from existing slots.
	* ============================================================ */
	/*
	 * dst += src, in place, by appending src's slots to dst.
	 *
	 * For each non-empty source slot:
	 *   - bits whose element has the same sign in dst and src are appended
	 *     in a new dst slot;
	 *   - bits whose signs differ each cancel one existing dst bit, searching
	 *     from the newest dst slot downwards;
	 *   - a differing-sign bit that finds nothing to cancel means src has
	 *     outgrown dst for that element: the dst sign is flipped and the bit
	 *     is appended.
	 * A new slot is only committed (n_slots++) when it ends up non-empty.
	 *
	 * If dst runs out of capacity, "OVERFLOW" is printed and the function
	 * returns with the slots processed so far already applied. Vectors whose
	 * chunk counts differ are rejected with a message on stderr.
	 */
	void uv_concat(UVec *dst, const UVec *src) {
		int chunks = dst->chunks;

		/* Both sign arrays are indexed with dst's chunk count below */
		if (src->chunks != chunks) {
			fprintf(stderr, "uv_concat: chunk mismatch (dst=%d, src=%d)\n", chunks, src->chunks);
			return;
		}

		for (int s = 0; s < src->n_slots; s++) {
			const uint64_t *src_slot = src->slots + (size_t)s * chunks;

			/* An all-zero source slot adds nothing; skip it so it cannot
			 * trigger a spurious overflow on a full destination. */
			int src_any = 0;
			for (int c = 0; c < chunks && !src_any; c++)
				if (src_slot[c]) src_any = 1;
			if (!src_any)
				continue;

			if (dst->n_slots >= dst->cap) {
				printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
				return;
			}

			uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks;
			int any = 0;

			for (int c = 0; c < chunks; c++) {
				uint64_t sb = src_slot[c];
				uint64_t disagree = dst->sign[c] ^ src->sign[c];
				uint64_t agree = ~disagree;

				/* Same sign: straight append */
				uint64_t add = sb & agree;

				/* Different sign: cancel from existing */
				uint64_t cancel = sb & disagree;
				for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
					uint64_t *ds = dst->slots + (size_t)d * chunks + c;
					uint64_t overlap = *ds & cancel;
					*ds &= ~overlap;
					cancel &= ~overlap;
				}
				/* Leftover cancel = src magnitude exceeds dst, flip sign */
				if (cancel) {
					dst->sign[c] ^= cancel;
					add |= cancel;
				}

				new_slot[c] = add;
				if (add) any = 1;
			}

			/* Only keep the slot if it has any bits */
			if (any) dst->n_slots++;
		}
	}

	/* ============================================================
	* MATMUL: y = M @ x
	*
	* Output unit = quantum² (one quantum from weight × one from activation)
	* The integer accumulator directly gives the value in units of quantum².
	*
	* To keep everything in the same quantum system:
	* y_float[i] = acc * quantum²
	* Then requantize to unary with the SAME global quantum.
	* y_mag[i] = acc * quantum² / quantum = acc * quantum
	*
	* ============================================================ */
	/*
	 * y = M @ x, with every operand expressed in the same global quantum.
	 *
	 * M       - weight matrix (rows x cols)
	 * x       - input vector; must have the same chunk count as M
	 * y       - output vector; needs dim >= M->rows, and its previous contents
	 *           in the first K_out slots are discarded
	 * K_out   - slots to use for the output; limited to the range [0, y->cap]
	 * quantum - value of one set bit
	 *
	 * For each row the signed integer dot product is accumulated by
	 * popcounting every (weight slot, input slot) pair. That count is in
	 * units of quantum²; multiplying by quantum gives the magnitude in
	 * quanta, which is rounded, clipped to K_out and written as unary bits.
	 *
	 * On a shape mismatch or allocation failure a message is written to
	 * stderr and y is left untouched.
	 */
	void uv_matmul(
		const UMat *M, const UVec *x,
		UVec *y, int K_out, float quantum
	) {
		int out_dim = M->rows;
		int chunks = M->chunks;
		int wK = M->K;
		int xK = x->n_slots;

		/* x is indexed with M's chunk count; y must have a bit for every row */
		if (x->chunks != chunks || y->dim < out_dim) {
			fprintf(stderr, "uv_matmul: shape mismatch (M chunks=%d, x chunks=%d, rows=%d, y dim=%d)\n",
			        chunks, x->chunks, out_dim, y->dim);
			return;
		}

		/* Never touch more output slots than were allocated */
		if (K_out > y->cap) K_out = y->cap;
		if (K_out < 0) K_out = 0;

		/* Per-row magnitude (in quanta) and sign flag; at least one element
		 * so a zero-row matrix is not mistaken for an allocation failure */
		size_t n_out = out_dim > 0 ? (size_t)out_dim : 1;
		int *acc = malloc(n_out * sizeof *acc);
		uint8_t *neg = calloc(n_out, 1);
		if (acc == NULL || neg == NULL) {
			fprintf(stderr, "uv_matmul: out of memory\n");
			free(acc);
			free(neg);
			return;
		}

		y->n_slots = K_out;
		memset(y->sign, 0, (size_t)y->chunks * sizeof(uint64_t));
		memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));

		/* Compute integer dot products */
		#pragma omp parallel for schedule(dynamic, 32)
		for (int i = 0; i < out_dim; i++) {
			const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
			long long a = 0;

			for (int c = 0; c < chunks; c++) {
				uint64_t diff = w_sign_row[c] ^ x->sign[c];
				uint64_t same = ~diff;

				for (int p = 0; p < wK; p++) {
					uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
					for (int q = 0; q < xK; q++) {
						uint64_t xq = x->slots[(size_t)q * chunks + c];
						uint64_t active = wp & xq;
						a += __builtin_popcountll(active & same)
						   - __builtin_popcountll(active & diff);
					}
				}
			}

			/* a counts quantum² units; a * quantum is the magnitude in quanta */
			float val = (float)a * quantum;

			/* Clip while still in float: converting an out-of-range or NaN
			 * float to int is undefined behaviour. */
			float scaled = fabsf(val) + 0.5f;
			int mag;
			if (scaled >= (float)K_out)
				mag = K_out;
			else if (scaled > 0.0f)
				mag = (int)scaled;
			else
				mag = 0;

			acc[i] = mag;
			neg[i] = (val < 0.0f) ? 1 : 0;
		}

		/* Write each row's sign and magnitude as unary bits */
		for (int i = 0; i < out_dim; i++) {
			int c = i / 64;
			uint64_t bit = 1ULL << (i % 64);
			if (neg[i]) y->sign[c] |= bit;
			for (int s = 0; s < acc[i]; s++)
				y->slots[(size_t)s * y->chunks + c] |= bit;
		}

		free(acc);
		free(neg);
	}

	/* ============================================================
	* RMSNORM — resets slot count, keeps same quantum
	* ============================================================ */
	/*
	 * RMS-normalize `x`, scale by `weight`, and re-quantize into `out`.
	 *
	 * x       - input vector
	 * weight  - x->dim per-element gains
	 * out     - destination; must not have more elements than x
	 * K_out   - slot count passed to uv_from_float() for the result
	 * quantum - value of one set bit (same for input and output)
	 * eps     - added to the mean square before the square root
	 *
	 * Computes out[i] = x[i] * weight[i] / sqrt(mean(x²) + eps) through a
	 * temporary float buffer. On a size mismatch or allocation failure a
	 * message is written to stderr and `out` is left untouched.
	 */
	void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
		int dim = x->dim;

		/* uv_from_float() reads out->dim values from the scratch buffer */
		if (out->dim > dim) {
			fprintf(stderr, "uv_rmsnorm: out dim %d exceeds input dim %d\n", out->dim, dim);
			return;
		}

		float *xf = malloc((size_t)(dim > 0 ? dim : 1) * sizeof *xf);
		if (xf == NULL) {
			fprintf(stderr, "uv_rmsnorm: out of memory\n");
			return;
		}
		uv_to_float(x, xf, quantum);

		float ss = 0.0f;
		for (int i = 0; i < dim; i++)
			ss += xf[i] * xf[i];

		float inv_rms = 1.0f / sqrtf(ss / dim + eps);
		for (int i = 0; i < dim; i++)
			xf[i] *= inv_rms * weight[i];

		uv_from_float(out, xf, K_out, quantum);
		free(xf);
	}

	/* ============================================================
	* TESTS
	* ============================================================ */

	/* Print `label`, then n values as "%6.2f ", then `tail`. */
	static void concat_print_row(const char *label, const float *vals, int n, const char *tail) {
		printf("%s", label);
		for (int i = 0; i < n; i++)
			printf("%6.2f ", vals[i]);
		printf("%s", tail);
	}

	/*
	 * Demo: quantize two 8-element vectors with quantum 0.25, add them with
	 * uv_concat(), and print the decoded sum next to the expected float sum.
	 */
	void test_concat_correct() {
		const float quantum = 0.25f; /* every bit = 0.25 */
		const int dim = 8;
		const int K = 32;            /* 7.0 / 0.25 = 28 bits, so 32 slots suffice */

		/* Magnitudes of A in quanta: [12, 8, 20, 4, 0, 16, 8, 28] */
		float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
		float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
		float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};
		float a_rec[8], b_rec[8], result[8], diff[8];

		printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

		UVec *a = uv_new(dim, 128);
		UVec *b = uv_new(dim, 128);

		uv_from_float(a, a_vals, K, quantum);
		uv_from_float(b, b_vals, K, quantum);

		/* Round-trip both inputs to show what quantization kept */
		uv_to_float(a, a_rec, quantum);
		uv_to_float(b, b_rec, quantum);

		printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum);
		concat_print_row("A original: ", a_vals, dim, "\n");
		concat_print_row("A unary: ", a_rec, dim, "\n");
		concat_print_row("B original: ", b_vals, dim, "\n");
		concat_print_row("B unary: ", b_rec, dim, "\n\n");

		printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots);
		uv_concat(a, b);
		printf("After concat: %d slots\n\n", a->n_slots);

		uv_to_float(a, result, quantum);
		for (int i = 0; i < dim; i++)
			diff[i] = expect[i] - result[i];

		concat_print_row("Expected A+B: ", expect, dim, "\n");
		concat_print_row("Concat A+B: ", result, dim, "\n");
		concat_print_row("Error: ", diff, dim, "\n");

		uv_free(a);
		uv_free(b);
	}

	/*
	 * Demo: start from a 4-element vector and add the same delta five times
	 * with uv_concat(), printing the decoded value, the slot count, the float
	 * expectation and the largest absolute error after every step.
	 */
	void test_chain_concat() {
		const float quantum = 0.1f;
		const int dim = 4;
		const int K = 64;
		const int n_steps = 5;

		float vals[] = {1.0, -2.0, 3.0, -0.5};
		float expected[] = {1.0, -2.0, 3.0, -0.5};
		float tmp[4];

		printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

		UVec *acc = uv_new(dim, 512);
		uv_from_float(acc, vals, K, quantum);

		printf("Start: ");
		uv_to_float(acc, tmp, quantum);
		for (int i = 0; i < dim; i++)
			printf("%6.2f ", tmp[i]);
		printf(" (%d slots)\n", acc->n_slots);

		int step = 0;
		while (step < n_steps) {
			float add_vals[] = {0.5, 0.3, -1.0, 0.7};

			/* Unary side: quantize the delta and append it */
			UVec *delta = uv_new(dim, K);
			uv_from_float(delta, add_vals, K, quantum);
			uv_concat(acc, delta);

			/* Float side: running reference sum */
			for (int i = 0; i < dim; i++)
				expected[i] += add_vals[i];

			uv_to_float(acc, tmp, quantum);

			printf(" +[0.5,0.3,-1.0,0.7] = ");
			for (int i = 0; i < dim; i++)
				printf("%6.2f ", tmp[i]);
			printf(" (%d slots) expect:", acc->n_slots);
			for (int i = 0; i < dim; i++)
				printf("%6.2f ", expected[i]);

			/* Largest absolute deviation from the reference */
			float max_err = 0;
			for (int i = 0; i < dim; i++) {
				float e = fabsf(expected[i] - tmp[i]);
				max_err = (e > max_err) ? e : max_err;
			}
			printf(" err=%.2f\n", max_err);

			uv_free(delta);
			step++;
		}

		uv_free(acc);
	}

	/*
	 * Demo/benchmark: multiply a random 512x256 matrix by a random vector in
	 * float and in unary, then print cosine similarity, SNR, wall time and
	 * the first ten outputs side by side.
	 *
	 * The quantum is chosen so the largest input magnitude uses exactly wK
	 * slots. If any allocation fails, a message is written to stderr and the
	 * test is skipped.
	 */
	void test_matmul() {
		printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

		int rows = 512, cols = 256;
		int wK = 32, xK = 32;

		/* Everything that needs releasing, so one cleanup path can free it */
		float *Mf = NULL, *xf = NULL, *y_ref = NULL, *yf = NULL;
		UMat *M = NULL;
		UVec *x = NULL, *y = NULL;

		srand(42);
		Mf = malloc((size_t)rows * cols * sizeof *Mf);
		xf = malloc((size_t)cols * sizeof *xf);
		y_ref = calloc((size_t)rows, sizeof *y_ref);
		yf = malloc((size_t)rows * sizeof *yf);
		if (!Mf || !xf || !y_ref || !yf) {
			fprintf(stderr, "test_matmul: out of memory\n");
			goto done;
		}

		/* Small values so magnitudes fit in K slots */
		for (size_t i = 0; i < (size_t)rows * cols; i++)
			Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
		for (int i = 0; i < cols; i++)
			xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
		for (int i = 0; i < rows; i++)
			for (int j = 0; j < cols; j++)
				y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

		/* Find quantum that fits the data range */
		float data_max = 0;
		for (size_t i = 0; i < (size_t)rows * cols; i++) {
			float a = fabsf(Mf[i]);
			if (a > data_max) data_max = a;
		}
		for (int i = 0; i < cols; i++) {
			float a = fabsf(xf[i]);
			if (a > data_max) data_max = a;
		}
		float quantum = data_max / wK;

		printf("Data range: [-%.2f, %.2f]\n", data_max, data_max);
		printf("Quantum: %.4f (K=%d gives range [-%dq, %dq])\n", quantum, wK, wK, wK);
		printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK);

		M = um_new(rows, cols, wK);
		x = uv_new(cols, xK);
		if (!M || !x) {
			fprintf(stderr, "test_matmul: out of memory\n");
			goto done;
		}

		um_from_float(M, Mf, quantum);
		uv_from_float(x, xf, xK, quantum);

		/* Output needs enough K for the matmul result range */
		float ymax = 0;
		for (int i = 0; i < rows; i++) {
			float a = fabsf(y_ref[i]);
			if (a > ymax) ymax = a;
		}
		int K_out = (int)(ymax / quantum + 1);
		if (K_out > 4096) K_out = 4096;
		printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out);

		y = uv_new(rows, K_out);
		if (!y) {
			fprintf(stderr, "test_matmul: out of memory\n");
			goto done;
		}

		struct timespec t0, t1;
		clock_gettime(CLOCK_MONOTONIC, &t0);
		uv_matmul(M, x, y, K_out, quantum);
		clock_gettime(CLOCK_MONOTONIC, &t1);
		double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

		uv_to_float(y, yf, quantum);

		/* Cosine similarity and signal-to-noise ratio against the float reference */
		float dot = 0, na = 0, nb = 0, noise = 0;
		for (int i = 0; i < rows; i++) {
			dot += y_ref[i] * yf[i];
			na += y_ref[i] * y_ref[i];
			nb += yf[i] * yf[i];
			float e = y_ref[i] - yf[i];
			noise += e * e;
		}
		float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
		float snr = 10.0f * log10f(na / (noise + 1e-10f));

		printf("\nCosine: %.6f\n", cosine);
		printf("SNR: %.1f dB\n", snr);
		printf("Time: %.1f ms\n", ms);

		printf("\nFirst 10 values:\n");
		printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
		for (int i = 0; i < 10; i++)
			printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]);

	done:
		um_free(M); uv_free(x); uv_free(y);
		free(Mf); free(xf); free(y_ref); free(yf);
	}

	/*
	 * Demo: simulate a residual stream. A random 1024-element embedding is
	 * quantized once, then ten random deltas are added with uv_concat() while
	 * a float copy accumulates the same deltas. After every step the slot
	 * count, the cosine similarity and the largest absolute error against the
	 * float reference are printed.
	 *
	 * If any allocation fails, a message is written to stderr and the test is
	 * skipped.
	 */
	void test_residual_chain() {
		printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

		float quantum = 0.05f;
		int dim = 1024;
		int K = 128; /* fits values up to 6.4 */
		int total_cap = K + 10 * K; /* room for 10 concat operations */

		srand(123);

		/* Scratch buffers are allocated once and reused by every step */
		float *embed = malloc((size_t)dim * sizeof *embed);
		float *ref = malloc((size_t)dim * sizeof *ref);
		float *delta = malloc((size_t)dim * sizeof *delta);
		float *rec = malloc((size_t)dim * sizeof *rec);
		UVec *residual = uv_new(dim, total_cap);

		if (embed && ref && delta && rec && residual) {
			for (int i = 0; i < dim; i++)
				embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

			/* Float reference: accumulate residuals */
			memcpy(ref, embed, (size_t)dim * sizeof *ref);

			/* Unary: grow via concat */
			uv_from_float(residual, embed, K, quantum);

			printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim);
			printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

			for (int step = 0; step < 10; step++) {
				for (int i = 0; i < dim; i++)
					delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

				/* Float reference */
				for (int i = 0; i < dim; i++) ref[i] += delta[i];

				/* Unary: concat */
				UVec *d = uv_new(dim, K);
				if (!d) {
					fprintf(stderr, "test_residual_chain: out of memory\n");
					break;
				}
				uv_from_float(d, delta, K, quantum);
				uv_concat(residual, d);
				uv_free(d);

				/* Compare */
				uv_to_float(residual, rec, quantum);

				float dot = 0, na = 0, nb = 0, max_err = 0;
				for (int i = 0; i < dim; i++) {
					dot += ref[i] * rec[i];
					na += ref[i] * ref[i];
					nb += rec[i] * rec[i];
					float e = fabsf(ref[i] - rec[i]);
					if (e > max_err) max_err = e;
				}
				float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

				printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err);
			}
		} else {
			fprintf(stderr, "test_residual_chain: out of memory\n");
		}

		uv_free(residual);
		free(embed); free(ref); free(delta); free(rec);
	}

	/* Entry point: print the banner, run the four demos in order, print the footer. */
	int main() {
		static const char *const banner[] = {
			"================================================\n",
			" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n",
			" Every bit = 1 quantum. Concat = Add.\n",
			"================================================\n\n",
		};
		const size_t n_lines = sizeof banner / sizeof banner[0];

		for (size_t i = 0; i < n_lines; i++)
			fputs(banner[i], stdout);

		test_concat_correct();
		test_chain_concat();
		test_matmul();
		test_residual_chain();

		fputs("\n=== DONE ===\n", stdout);
		return 0;
	}