/* Uploaded via upload-large-folder tool — commit 19ed98b (verified). */
/*
* TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
*
* Representation:
* A value of magnitude M is stored as M consecutive 1-bits.
* The number IS the count of ones.
* Every bit has weight exactly 1.
*
* For a vector element quantized to integer range [-K, K]:
* sign: 1 bit (0=positive, 1=negative)
* magnitude: K bit positions, first |value| are 1, rest are 0
*
* Storage layout for a vector of dim D with max magnitude K:
* sign: uint64[(D+63)/64] — one sign bit per element
* unary: uint64[K * (D+63)/64] — K bitplanes across D elements
* Plane p has bit j set iff |element_j| > p
* (thermometer = true unary in bitplane form)
*
* Multiplication: w * x = popcount of ones(w) matched with ones(x)
* Since every bit = 1, the dot product is JUST COUNTING.
* No weights, no shifts, no corrections.
* sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
* = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
*
* YES this uses more memory. A 2560-dim vector with K=32 uses:
* 32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
* But the MATH IS EXACT (to quantization level).
*
* (c) 2026 OpenTransformers Ltd / Scott Bisset
*/
#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>
/* ============================================================
* TRUE UNARY VECTOR
* ============================================================ */
/* A quantized vector in true-unary (thermometer) form. */
typedef struct {
    uint64_t *sign;  /* [chunks] — one sign bit per element (1 = negative) */
    uint64_t *unary; /* [K * chunks] — K bitplanes; plane p has bit j set iff |elem_j| > p */
    float scale;     /* dequant factor: real_value = (sign ? -1 : +1) * count * scale */
    int dim;         /* logical element count */
    int chunks;      /* 64-bit words per bitplane = (dim+63)/64 */
    int K;           /* max magnitude = number of unary bitplanes */
} TrueUnaryVec;
/* TRUE UNARY MATRIX — row-major, with an independent scale per row. */
typedef struct {
    uint64_t *sign;  /* [rows * chunks] — row i's sign bits start at i*chunks */
    uint64_t *unary; /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */
    float *scales;   /* [rows] — per-row scale factors */
    int rows;        /* output dimension of the matvec */
    int cols;        /* input dimension (must match the vector's dim) */
    int chunks;      /* 64-bit words per row per plane = (cols+63)/64 */
    int K;           /* max magnitude per element */
} TrueUnaryMat;
/* ============================================================
* ALLOCATION
* ============================================================ */
/*
 * Allocate a zero-initialized unary vector of `dim` elements with `K`
 * magnitude bitplanes. scale starts at 1.0f.
 *
 * Returns NULL on allocation failure (any partial allocation is released).
 * NOTE: C11 aligned_alloc requires the size to be a multiple of the
 * alignment, so each byte count is rounded up to a multiple of 64 —
 * the original passed e.g. chunks*8 bytes, which is UB for chunks < 8.
 */
TrueUnaryVec* tuv_alloc(int dim, int K) {
    TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
    if (!v) return NULL;
    v->dim = dim;
    v->K = K;
    v->chunks = (dim + 63) / 64;
    v->scale = 1.0f;
    size_t sign_bytes  = (((size_t)v->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t unary_bytes = (((size_t)K * v->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    v->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    v->unary = (uint64_t *)aligned_alloc(64, unary_bytes);
    if (!v->sign || !v->unary) {
        free(v->sign);
        free(v->unary);
        free(v);
        return NULL;
    }
    memset(v->sign, 0, sign_bytes);
    memset(v->unary, 0, unary_bytes);
    return v;
}
/*
 * Allocate a zero-initialized true-unary matrix (rows x cols) with K
 * magnitude bitplanes; every per-row scale starts at 1.0f.
 *
 * Returns NULL on allocation failure (any partial allocation is released).
 * NOTE: C11 aligned_alloc requires the size to be a multiple of the
 * alignment, so each byte count is rounded up to a multiple of 64 —
 * the original passed e.g. rows*sizeof(float), which is UB unless
 * rows happens to be a multiple of 16.
 */
TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
    TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
    if (!m) return NULL;
    m->rows = rows;
    m->cols = cols;
    m->K = K;
    m->chunks = (cols + 63) / 64;
    size_t sign_bytes  = (((size_t)rows * m->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t unary_bytes = (((size_t)K * rows * m->chunks * sizeof(uint64_t)) + 63) & ~(size_t)63;
    size_t scale_bytes = (((size_t)rows * sizeof(float)) + 63) & ~(size_t)63;
    m->sign = (uint64_t *)aligned_alloc(64, sign_bytes);
    m->unary = (uint64_t *)aligned_alloc(64, unary_bytes);
    m->scales = (float *)aligned_alloc(64, scale_bytes);
    if (!m->sign || !m->unary || !m->scales) {
        free(m->sign);
        free(m->unary);
        free(m->scales);
        free(m);
        return NULL;
    }
    memset(m->sign, 0, sign_bytes);
    memset(m->unary, 0, unary_bytes);
    for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
    return m;
}
/* Release a vector created by tuv_alloc(); passing NULL is a no-op. */
void tuv_free(TrueUnaryVec *v) {
    if (v == NULL)
        return;
    free(v->sign);
    free(v->unary);
    free(v);
}
/* Release a matrix created by tum_alloc(); passing NULL is a no-op. */
void tum_free(TrueUnaryMat *m) {
    if (m == NULL)
        return;
    free(m->sign);
    free(m->unary);
    free(m->scales);
    free(m);
}
/* ============================================================
* FLOAT → TRUE UNARY
*
* Quantize: integer_val = round(float_val / scale * K)
* Then store |integer_val| as that many 1-bits.
*
* For vector: single global scale = absmax / K
* For matrix: per-row scale = row_absmax / K
* ============================================================ */
/*
 * Quantize a float vector into thermometer (true unary) form.
 *
 * A single global scale = absmax / K is used; element j stores
 * round(|x_j| / scale) as that many consecutive set bitplanes plus a
 * sign bit. An all-zero input leaves the planes clear with scale 1.0f.
 */
void tuv_from_float(TrueUnaryVec *v, const float *x) {
    const int dim = v->dim;
    const int K = v->K;
    const int chunks = v->chunks;

    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
    memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));

    /* Pass 1: find the absolute maximum. */
    float peak = 0.0f;
    for (int j = 0; j < dim; j++) {
        const float a = fabsf(x[j]);
        if (a > peak)
            peak = a;
    }
    if (peak == 0.0f) {
        v->scale = 1.0f;
        return;
    }
    v->scale = peak / K;
    const float to_units = K / peak;

    /* Pass 2: sign bit plus thermometer planes 0..level-1 per element. */
    for (int j = 0; j < dim; j++) {
        const int word = j >> 6;
        const uint64_t mask = 1ULL << (j & 63);
        if (x[j] < 0.0f)
            v->sign[word] |= mask;
        int level = (int)(fabsf(x[j]) * to_units + 0.5f);
        if (level > K)
            level = K;
        for (int p = 0; p < level; p++)
            v->unary[(size_t)p * chunks + word] |= mask;
    }
}
/*
 * Dequantize a thermometer vector back to floats.
 * Magnitude of element j = number of planes with bit j set, times scale;
 * the sign plane decides the final sign.
 */
void tuv_to_float(const TrueUnaryVec *v, float *out) {
    const int chunks = v->chunks;
    for (int j = 0; j < v->dim; j++) {
        const int word = j >> 6;
        const uint64_t mask = 1ULL << (j & 63);
        /* Vertical count across planes = base-1 magnitude. */
        int count = 0;
        const uint64_t *plane = v->unary + word;
        for (int p = 0; p < v->K; p++, plane += chunks)
            count += (*plane & mask) != 0;
        const float magnitude = (float)count * v->scale;
        out[j] = (v->sign[word] & mask) ? -magnitude : magnitude;
    }
}
/*
 * Quantize a row-major float matrix into thermometer form.
 * Each row gets its own scale = row absmax / K; a zero row keeps
 * scale 1.0f with no bits set.
 */
void tum_from_float(TrueUnaryMat *m, const float *data) {
    const int rows = m->rows;
    const int cols = m->cols;
    const int K = m->K;
    const int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *src = data + (size_t)r * cols;

        /* Per-row absolute maximum. */
        float peak = 0.0f;
        for (int j = 0; j < cols; j++) {
            const float a = fabsf(src[j]);
            if (a > peak)
                peak = a;
        }
        if (peak == 0.0f) {
            m->scales[r] = 1.0f;
            continue;
        }
        m->scales[r] = peak / K;
        const float to_units = K / peak;

        uint64_t *sign_row = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            const int word = j >> 6;
            const uint64_t mask = 1ULL << (j & 63);
            if (src[j] < 0.0f)
                sign_row[word] |= mask;
            int level = (int)(fabsf(src[j]) * to_units + 0.5f);
            if (level > K)
                level = K;
            /* Thermometer: planes 0..level-1 get this element's bit. */
            for (int p = 0; p < level; p++)
                m->unary[((size_t)p * rows + r) * chunks + word] |= mask;
        }
    }
}
/* ============================================================
* TRUE UNARY MATVEC: y = M @ x
*
* THE CORE OPERATION.
*
* For each output element y[i]:
* For each pair of planes (p from weight, q from activation):
* active = w_plane_p[i] AND x_plane_q
* same = active AND ~(w_sign[i] XOR x_sign)
* diff = active AND (w_sign[i] XOR x_sign)
* acc += popcount(same) - popcount(diff)
*
* EVERY PLANE PAIR HAS WEIGHT = 1.
* No shifts. No scaling between planes. No corrections.
* The count IS the answer.
*
* y[i] = acc * w_scale[i] * x_scale
* (single float multiply at the very end)
*
* ============================================================ */
/*
 * y = M @ x in pure base-1: every (weight-plane, activation-plane) pair
 * contributes popcount(AND) with weight exactly 1; matching signs add,
 * opposite signs subtract. One float rescale per output element at the end.
 *
 * NOTE(review): assumes x->dim == M->cols (same chunk count) and
 * x->chunks == M->chunks — callers in this file guarantee it; verify
 * before reusing elsewhere.
 */
void tum_matvec(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out /* float output, requantize externally if needed */
) {
    const int rows = M->rows;
    const int chunks = M->chunks;
    const int wK = M->K;
    const int xK = x->K;
    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < rows; i++) {
        const uint64_t *sign_row = M->sign + (size_t)i * chunks;
        long long total = 0;
        for (int c = 0; c < chunks; c++) {
            /* sign_diff bit set => weight and activation disagree in sign. */
            const uint64_t sign_diff = sign_row[c] ^ x->sign[c];
            for (int p = 0; p < wK; p++) {
                const uint64_t wbits =
                    M->unary[((size_t)p * rows + i) * chunks + c];
                for (int q = 0; q < xK; q++) {
                    const uint64_t both =
                        wbits & x->unary[(size_t)q * chunks + c];
                    /* Each surviving bit counts +1 (same sign) or -1. */
                    total += __builtin_popcountll(both & ~sign_diff);
                    total -= __builtin_popcountll(both & sign_diff);
                }
            }
        }
        /* Single float rescale per output element. */
        y_out[i] = (float)total * M->scales[i] * x->scale;
    }
}
/* ============================================================
 * OPTIMIZED MATVEC: collapse the bitplanes first
 *
 * The naive kernel touches wK * xK plane pairs per chunk. But since
 * every bit has weight 1, element j's total contribution is simply
 *     w_mag[j] * x_mag[j] * sign_j
 * where w_mag / x_mag count the set planes per element and sign_j is
 * +1 when the weight and activation signs agree, -1 otherwise.
 *
 * So: vertically accumulate each operand's planes into per-element
 * counts, then take an ordinary signed integer dot product. This
 * reduces the per-row work from O(wK * xK * chunks) to
 * O((wK + xK) * chunks + cols) and produces bit-identical results.
 * ============================================================ */
/*
 * y = M @ x via vertical plane accumulation — exactly equal to
 * tum_matvec() output, but O((wK + xK) * chunks + cols) per row.
 *
 * Fixes vs original:
 *  - aligned_alloc(C11) requires size % alignment == 0; the original
 *    rounded the *element* count to 16, giving a byte size that is only
 *    a multiple of 32 with 64-byte alignment (UB). Bytes are now rounded
 *    up to a multiple of 64.
 *  - Allocation results are checked; OOM aborts with a message instead
 *    of dereferencing NULL (matters for the per-thread buffer inside the
 *    parallel region).
 */
void tum_matvec_fast(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out
) {
    const int out_dim = M->rows;
    const int cols = M->cols;
    const int chunks = M->chunks;
    const int xK = x->K;
    /* Byte size for one per-element magnitude buffer, 64-byte padded. */
    const size_t mag_bytes = (((size_t)cols * sizeof(int16_t)) + 63) & ~(size_t)63;

    /* Step 1: collapse activation planes into per-element magnitudes:
     * x_mag[j] = number of x planes with bit j set (0..xK). */
    int16_t *x_mag = (int16_t *)aligned_alloc(64, mag_bytes);
    if (!x_mag) {
        fprintf(stderr, "tum_matvec_fast: out of memory\n");
        exit(EXIT_FAILURE);
    }
    memset(x_mag, 0, mag_bytes);
    for (int q = 0; q < xK; q++) {
        const uint64_t *xplane = x->unary + (size_t)q * chunks;
        for (int c = 0; c < chunks; c++) {
            uint64_t bits = xplane[c];
            while (bits) {
                int j = c * 64 + __builtin_ctzll(bits);
                if (j < cols) x_mag[j]++;
                bits &= bits - 1; /* clear lowest set bit */
            }
        }
    }
    /* x_mag stays unsigned: the sign of each product depends on the
     * weight row's sign bits, so it is resolved per output element. */

    /* Step 2: per output row, vertically popcount the weight planes into
     * w_mag, then y[i] = sum_j +/-(w_mag[j] * x_mag[j]), with '+' when the
     * weight and activation signs agree. */
    #pragma omp parallel
    {
        /* One scratch magnitude buffer per thread. */
        int16_t *w_mag = (int16_t *)aligned_alloc(64, mag_bytes);
        if (!w_mag) {
            fprintf(stderr, "tum_matvec_fast: out of memory\n");
            exit(EXIT_FAILURE);
        }
        #pragma omp for schedule(dynamic, 32)
        for (int i = 0; i < out_dim; i++) {
            memset(w_mag, 0, mag_bytes);
            /* Vertical popcount: count set planes per element. */
            for (int p = 0; p < M->K; p++) {
                const uint64_t *wplane =
                    M->unary + ((size_t)p * out_dim + i) * chunks;
                for (int c = 0; c < chunks; c++) {
                    uint64_t bits = wplane[c];
                    while (bits) {
                        int j = c * 64 + __builtin_ctzll(bits);
                        if (j < cols) w_mag[j]++;
                        bits &= bits - 1;
                    }
                }
            }
            /* Signed integer dot product. */
            const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
            long long acc = 0;
            for (int j = 0; j < cols; j++) {
                const int c = j / 64;
                const uint64_t bit = 1ULL << (j % 64);
                const int product = (int)w_mag[j] * (int)x_mag[j];
                acc += ((w_sign_row[c] ^ x->sign[c]) & bit) ? -product : product;
            }
            /* Single float rescale per output element. */
            y_out[i] = (float)acc * M->scales[i] * x->scale;
        }
        free(w_mag);
    }
    free(x_mag);
}
/* ============================================================
* BENCHMARK + ACCURACY
* ============================================================ */
/* Metrics collected by one tum_test() run (accuracy vs float reference
 * plus timing for both matvec kernels). */
typedef struct {
    float cosine;      /* cosine similarity of unary output vs float reference */
    float snr_db;      /* signal-to-noise ratio of the unary output, in dB */
    float max_rel_err; /* worst per-element relative error vs reference */
    double ms_naive;   /* mean wall time per tum_matvec call, milliseconds */
    double ms_fast;    /* mean wall time per tum_matvec_fast call, milliseconds */
    double gops_naive; /* effective GOPS of the naive kernel (2*rows*cols ops) */
    double gops_fast;  /* effective GOPS of the fast kernel */
} TestResult;
/* Draw one standard-normal sample via Box–Muller.
 * Uses `(float)rand() + 1.0f` instead of the original `rand() + 1`:
 * when rand() returns RAND_MAX == INT_MAX (typical), `rand() + 1` is
 * signed-integer overflow — undefined behavior. */
static float tum_test_gauss(void) {
    float u1 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    float u2 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/*
 * Build a random rows x cols problem, quantize to true unary with the
 * given plane counts (wK weights, xK activations), and measure:
 *  - accuracy of the naive kernel vs a float reference (cosine, SNR,
 *    max relative error),
 *  - per-call wall time and effective GOPS for both kernels,
 *  - agreement between the naive and fast kernels (warns if they differ).
 * Deterministic: reseeds rand() with 42 so every config sees the same data.
 */
TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
    TestResult r = {0};
    srand(42);
    /* Random float matrix and vector (standard normal). */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc((size_t)cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_naive = (float *)malloc((size_t)rows * sizeof(float));
    float *y_fast = (float *)malloc((size_t)rows * sizeof(float));
    if (!Mf || !xf || !y_ref || !y_naive || !y_fast) {
        fprintf(stderr, "tum_test: out of memory\n");
        exit(EXIT_FAILURE);
    }
    for (size_t i = 0; i < (size_t)rows * cols; i++)
        Mf[i] = tum_test_gauss();
    for (int i = 0; i < cols; i++)
        xf[i] = tum_test_gauss();
    /* Float reference: y_ref = Mf @ xf. */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];
    /* Convert to true unary. */
    TrueUnaryMat *M = tum_alloc(rows, cols, wK);
    TrueUnaryVec *x = tuv_alloc(cols, xK);
    tum_from_float(M, Mf);
    tuv_from_float(x, xf);
    /* Naive matvec: one warmup call, then timed iterations. */
    struct timespec t0, t1;
    tum_matvec(M, x, y_naive); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec(M, x, y_naive);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
    /* Fast matvec, same protocol. */
    tum_matvec_fast(M, x, y_fast); /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec_fast(M, x, y_fast);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;
    /* Accuracy vs float reference. */
    float dot = 0, na = 0, nb = 0, max_re = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_naive[i];
        na += y_ref[i] * y_ref[i];
        nb += y_naive[i] * y_naive[i];
        float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
        if (re > max_re) max_re = re;
    }
    r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_naive[i];
        noise += e * e;
    }
    r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
    r.max_rel_err = max_re;
    /* The fast kernel must reproduce the naive kernel exactly
     * (both compute the same integer sum; only float rescale follows). */
    float fast_err = 0;
    for (int i = 0; i < rows; i++) {
        float e = fabsf(y_naive[i] - y_fast[i]);
        if (e > fast_err) fast_err = e;
    }
    if (fast_err > 0.01f)
        printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err);
    /* GOPS = 2*rows*cols ops per call / (ms per call * 1e6).
     * (The original multiplied and divided by iters — same value.) */
    double ops = 2.0 * rows * cols;
    r.gops_naive = ops / (r.ms_naive * 1e6);
    r.gops_fast = ops / (r.ms_fast * 1e6);
    tum_free(M); tuv_free(x);
    free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
    return r;
}
/* ============================================================
* MAIN: sweep K values, show accuracy + speed tradeoff
* ============================================================ */
/* Box–Muller sample for the demo section below.
 * Uses `(float)rand() + 1.0f` instead of `rand() + 1`, which is
 * signed-integer overflow (UB) when rand() returns RAND_MAX == INT_MAX. */
static float demo_gauss(void) {
    float u1 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    float u2 = ((float)rand() + 1.0f) / ((float)RAND_MAX + 1.0f);
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/* Sweep (wK, xK) plane-count configs on a fixed-size matrix, printing the
 * accuracy/speed/memory tradeoff, then dump sample outputs for one config. */
int main(void) {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");
    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560). */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);
    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");
    struct { int wK; int xK; } configs[] = {
        {8, 4},
        {8, 8},
        {16, 8},
        {16, 16},
        {32, 8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);
    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        /* Large plane counts are slow in the naive kernel: fewer iters. */
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;
        TestResult r = tum_test(rows, cols, wK, xK, iters);
        /* Memory footprint of this layer's quantized weights. */
        size_t sign_bytes = (size_t)rows * ((cols + 63) / 64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols + 63) / 64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;
        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }
    /* Show first 10 values for the wK=32, xK=16 case. */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc((size_t)sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc((size_t)sr * sizeof(float));
        if (!Mf || !xf || !y_ref || !y_unary) {
            fprintf(stderr, "main: out of memory\n");
            return 1;
        }
        for (size_t i = 0; i < (size_t)sr * sc; i++)
            Mf[i] = demo_gauss();
        for (int i = 0; i < sc; i++)
            xf[i] = demo_gauss();
        /* Float reference for the sample rows. */
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];
        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);
        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);
        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }
    printf("\n=== DONE ===\n");
    return 0;
}