Add files using upload-large-folder tool

19ed98b verified 14 days ago

6.19 kB

	/*
	* Log-Unary Tensor Tests
	* Benchmarks accuracy and speed of native base-1 log-encoded tensors
	*/
	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>
	#include <time.h>

	/*
	 * Forward declarations from the log-unary tensor library.
	 *
	 * LogUnaryTensor and LogUnaryMatrix are opaque here (incomplete struct
	 * types), so they can only be handled through pointers: every function
	 * that takes or returns one does so by pointer, and float buffers are
	 * likewise passed by pointer.
	 *
	 * NOTE(review): BenchResult and AccuracyResult are returned by value, so
	 * their field order and types must match the library's own definitions --
	 * confirm against the library header.
	 */
	typedef struct LogUnaryTensor LogUnaryTensor;
	typedef struct LogUnaryMatrix LogUnaryMatrix;

	/* Timing/throughput figures reported by lum_bench_matvec(). */
	typedef struct { double total_and_ops, total_popcount_ops, wall_time_s, elements_per_sec, gops; } BenchResult;

	/* Error metrics reported by lut_accuracy_test(). */
	typedef struct { float max_error, mean_error, cosine_sim, snr_db; } AccuracyResult;

	/* Allocation / release of tensors (dim elements) and matrices (rows x cols). */
	extern LogUnaryTensor *lut_alloc(int dim, int n_planes, int bias);
	extern LogUnaryMatrix *lum_alloc(int rows, int cols, int n_planes, int bias);
	extern void lut_free(LogUnaryTensor *t);
	extern void lum_free(LogUnaryMatrix *m);

	/* Conversion between float buffers and the log-unary representation. */
	extern void lut_from_float(LogUnaryTensor *t, const float *x);
	extern void lut_to_float(const LogUnaryTensor *t, float *out);
	extern void lum_from_float(LogUnaryMatrix *m, const float *data);

	/* Operations on encoded data; results are written through the last tensor pointer. */
	extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
	extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
	extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);

	/* Self-contained library benchmarks / accuracy checks. */
	extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
	extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);

	/*
	 * Draw one pseudo-random sample via the Box-Muller transform, consuming
	 * two consecutive rand() values (6.2832f approximates 2*pi).
	 */
	static float matvec_test_gaussian(void) {
		/*
		 * Widen before adding 1: rand() may return RAND_MAX, and RAND_MAX may
		 * equal INT_MAX, in which case "rand() + 1" in int is signed overflow.
		 * The +1 keeps u1 strictly positive so logf() never sees 0.
		 */
		float u1 = (float)((long long)rand() + 1) / (RAND_MAX + 1.0f);
		float u2 = (float)((long long)rand() + 1) / (RAND_MAX + 1.0f);
		return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
	}

	/*
	 * Test matvec correctness against a float reference.
	 *
	 * Fills a rows x cols matrix and a cols-element vector with pseudo-random
	 * values (fixed seed 42, so the data is reproducible), computes y = M * x
	 * in plain float arithmetic, runs the same product through the log-unary
	 * library, and prints cosine similarity, SNR, max absolute error and the
	 * first few output values.
	 *
	 * rows, cols  matrix dimensions; both must be positive
	 * planes      plane count passed to lum_alloc()/lut_alloc()
	 * bias        bias passed to lum_alloc()/lut_alloc()
	 *
	 * On invalid dimensions or allocation failure a message is written to
	 * stderr and the test is skipped; nothing is leaked.
	 */
	static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
		printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

		if (rows <= 0 || cols <= 0) {
			fprintf(stderr, " skipped: rows and cols must be positive\n");
			return;
		}

		/* Do all size arithmetic in size_t so rows * cols cannot overflow int. */
		const size_t n_rows = (size_t)rows;
		const size_t n_cols = (size_t)cols;
		const size_t n_elems = n_rows * n_cols;

		/* Random float matrix and vector, plus reference and library outputs */
		float *M_float = malloc(n_elems * sizeof *M_float);
		float *x_float = malloc(n_cols * sizeof *x_float);
		float *y_ref = calloc(n_rows, sizeof *y_ref); /* zeroed: used as accumulator */
		float *y_lut = malloc(n_rows * sizeof *y_lut);
		LogUnaryMatrix *M = NULL;
		LogUnaryTensor *x = NULL;
		LogUnaryTensor *y = NULL;

		if (!M_float || !x_float || !y_ref || !y_lut) {
			fprintf(stderr, " skipped: out of memory for float buffers\n");
			goto cleanup;
		}

		/* Fixed seed: every invocation sees the same data for a given shape. */
		srand(42);
		for (size_t i = 0; i < n_elems; i++) {
			M_float[i] = matvec_test_gaussian();
		}
		for (size_t j = 0; j < n_cols; j++) {
			x_float[j] = matvec_test_gaussian();
		}

		/* Float reference matmul (row-major M) */
		for (size_t i = 0; i < n_rows; i++) {
			for (size_t j = 0; j < n_cols; j++) {
				y_ref[i] += M_float[i * n_cols + j] * x_float[j];
			}
		}

		/* Log-unary matmul */
		M = lum_alloc(rows, cols, planes, bias);
		x = lut_alloc(cols, planes, bias);
		y = lut_alloc(rows, planes, bias);
		if (!M || !x || !y) {
			fprintf(stderr, " skipped: log-unary allocation failed\n");
			goto cleanup;
		}

		lum_from_float(M, M_float);
		lut_from_float(x, x_float);
		lum_matvec(M, x, y);
		lut_to_float(y, y_lut);

		/* Compare: accumulate dot product, squared norms, noise energy, max error */
		float dot = 0, na = 0, nb = 0, noise = 0, max_err = 0;
		for (size_t i = 0; i < n_rows; i++) {
			float diff = y_ref[i] - y_lut[i];
			float err = fabsf(diff);

			dot += y_ref[i] * y_lut[i];
			na += y_ref[i] * y_ref[i];
			nb += y_lut[i] * y_lut[i];
			noise += diff * diff;
			if (err > max_err) {
				max_err = err;
			}
		}

		/* Small epsilons guard against division by zero for all-zero outputs. */
		float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
		float snr = 10.0f * log10f(na / (noise + 1e-10f));

		printf(" Cosine similarity: %.6f\n", cosine);
		printf(" SNR: %.1f dB\n", snr);
		/*
		 * NOTE(review): the value printed as "ref max" is sqrt(na / rows),
		 * i.e. the RMS of the reference output, not its maximum. The label is
		 * left unchanged to keep the output format stable.
		 */
		printf(" Max abs error: %.4f (ref max: %.4f)\n", max_err, sqrtf(na / rows));

		/* Show first few values */
		printf(" First 5 values:\n");
		for (int i = 0; i < 5 && i < rows; i++) {
			printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
		}

	cleanup:
		/* The library's free functions are not known to accept NULL, so guard them. */
		if (M) lum_free(M);
		if (x) lut_free(x);
		if (y) lut_free(y);
		free(M_float);
		free(x_float);
		free(y_ref);
		free(y_lut);
	}

	/*
	 * Test driver: prints roundtrip accuracy for several plane counts, runs
	 * the matvec correctness test on three shapes, then benchmarks matvec
	 * speed for a table of layer-shaped configurations. Always returns 0.
	 */
	int main(void) {
		/*
		 * Time-based seed for anything that calls rand() before the matvec
		 * correctness tests, which reseed with a fixed value themselves.
		 */
		srand((unsigned)time(NULL));

		printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

		/* 1. Roundtrip accuracy at different plane counts */
		printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
		printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
		for (int np = 4; np <= 12; np += 2) {
			int bias = np / 2;
			AccuracyResult r = lut_accuracy_test(4096, np, bias);
			printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
			       np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
		}

		/* 2. Matvec correctness */
		test_matvec_correctness(64, 256, 7, 3);
		test_matvec_correctness(256, 1024, 7, 3);
		test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

		/* 3. Speed benchmarks - various configurations */
		/* NOTE(review): the "16 threads" label is hard-coded; nothing here sets a thread count. */
		printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
		printf("%10s %6s %6s %6s %10s %10s %10s\n",
		       "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

		/* rows x cols shape, weight planes (wp), activation planes (xp), bias, label */
		struct { int rows; int cols; int wp; int xp; int bias; const char *label; } configs[] = {
			/* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
			{4096, 2560, 7, 4, 3, "q_proj"},
			{4096, 2560, 7, 7, 3, "q_proj_7x7"},
			{1024, 2560, 7, 4, 3, "k_proj"},
			/* Qwen3-4B MLP: inter=9728 */
			{9728, 2560, 7, 4, 3, "gate_proj"},
			{2560, 9728, 7, 4, 3, "down_proj"},
			/* Different plane counts */
			{4096, 2560, 4, 4, 2, "4x4"},
			{4096, 2560, 8, 8, 4, "8x8"},
			{4096, 2560, 10, 6, 3, "10x6"},
		};
		int n_configs = (int)(sizeof configs / sizeof configs[0]);

		for (int c = 0; c < n_configs; c++) {
			int iters = 3;
			BenchResult r = lum_bench_matvec(
				configs[c].rows, configs[c].cols,
				configs[c].wp, configs[c].xp, configs[c].bias, iters);
			/*
			 * NOTE(review): wall_time_s is printed under "ms/call"; this
			 * presumes the library reports per-call time -- confirm.
			 */
			printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
			       configs[c].rows, configs[c].cols,
			       configs[c].wp, configs[c].xp, configs[c].bias,
			       r.wall_time_s * 1000,
			       r.elements_per_sec / 1e6,
			       r.gops,
			       configs[c].label);
		}

		printf("\n=== DONE ===\n");
		return 0;
	}