/*
 * Source: unary-quantization-research / test_logunary.c
 * Uploaded by OpenTransformer -- "Add files using upload-large-folder tool"
 * Commit 19ed98b (verified)
 */
/*
* Log-Unary Tensor Tests
* Benchmarks accuracy and speed of native base-1 log-encoded tensors
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
/*
 * Library interface, redeclared locally instead of being pulled in from a
 * header.
 * NOTE(review): nothing in this file checks that these declarations match the
 * library's own definitions (struct layout, prototypes) -- confirm against
 * the library source whenever either side changes.
 */

/* Opaque handles: this file only ever holds pointers to them. */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

/* Returned by value from lum_bench_matvec(). */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;        /* main() multiplies by 1000 to print ms */
    double elements_per_sec;   /* main() divides by 1e6 to print Melem/s */
    double gops;
} BenchResult;

/* Returned by value from lut_accuracy_test(). */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;

/* Allocation / release of tensors and matrices. */
LogUnaryTensor *lut_alloc(int dim, int n_planes, int bias);
LogUnaryMatrix *lum_alloc(int rows, int cols, int n_planes, int bias);
void lut_free(LogUnaryTensor *t);
void lum_free(LogUnaryMatrix *m);

/* Conversion between float buffers and the library's representation. */
void lut_from_float(LogUnaryTensor *t, const float *x);
void lut_to_float(const LogUnaryTensor *t, float *out);
void lum_from_float(LogUnaryMatrix *m, const float *data);

/* Operations on encoded data. */
void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);

/* Built-in benchmark and accuracy helpers. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);
/*
 * Draw one pseudo-random sample using the Box-Muller transform on two
 * consecutive rand() values.
 *
 * The "+ 1" keeps the uniform variate strictly positive so logf() never sees
 * zero. It is performed in double because rand() + 1 in int overflows
 * (undefined behaviour) when rand() returns RAND_MAX and RAND_MAX == INT_MAX.
 * 6.2832f is the 2*pi approximation used throughout this test.
 */
static float gaussian_sample(void) {
    float u1 = (float)((double)rand() + 1.0) / (RAND_MAX + 1.0f);
    float u2 = (float)((double)rand() + 1.0) / (RAND_MAX + 1.0f);
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/*
 * Test matvec correctness against a float reference.
 *
 * Fills a rows x cols matrix and a cols-long vector with random samples from
 * a fixed seed (42), computes y = M * x in plain float arithmetic, runs the
 * same product through lum_from_float / lut_from_float / lum_matvec /
 * lut_to_float, and prints cosine similarity, SNR, the largest absolute
 * error, and the first few output pairs.
 *
 * rows, cols  matrix shape; non-positive values are reported and skipped
 * planes      passed as n_planes to every lum_alloc / lut_alloc call
 * bias        passed as bias to every lum_alloc / lut_alloc call
 *
 * Nothing is returned; results go to stdout and allocation failures to
 * stderr. Everything allocated here is released before returning.
 */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    if (rows <= 0 || cols <= 0) {
        printf(" Skipped: rows and cols must be positive\n");
        return;
    }

    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;

    /* Float-side buffers; y_ref must start at zero because it is accumulated into. */
    float *M_float = malloc(n_rows * n_cols * sizeof *M_float);
    float *x_float = malloc(n_cols * sizeof *x_float);
    float *y_ref = calloc(n_rows, sizeof *y_ref);
    float *y_lut = malloc(n_rows * sizeof *y_lut);
    LogUnaryMatrix *M = NULL;
    LogUnaryTensor *x = NULL;
    LogUnaryTensor *y = NULL;

    if (!M_float || !x_float || !y_ref || !y_lut) {
        fprintf(stderr, " Out of memory allocating float buffers\n");
        goto cleanup;
    }

    /* Fixed seed so every run compares the same data. */
    srand(42);
    for (size_t i = 0; i < n_rows * n_cols; i++) {
        M_float[i] = gaussian_sample();
    }
    for (size_t j = 0; j < n_cols; j++) {
        x_float[j] = gaussian_sample();
    }

    /* Float reference matmul */
    for (size_t i = 0; i < n_rows; i++) {
        for (size_t j = 0; j < n_cols; j++) {
            y_ref[i] += M_float[i * n_cols + j] * x_float[j];
        }
    }

    /* Log-unary matmul */
    M = lum_alloc(rows, cols, planes, bias);
    x = lut_alloc(cols, planes, bias);
    y = lut_alloc(rows, planes, bias);
    /* NOTE(review): assumes the allocators report failure by returning NULL -- confirm. */
    if (!M || !x || !y) {
        fprintf(stderr, " Log-unary allocation failed\n");
        goto cleanup;
    }
    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare */
    {
        float dot = 0, na = 0, nb = 0, noise = 0, max_err = 0;
        for (size_t i = 0; i < n_rows; i++) {
            const float ref = y_ref[i];
            const float got = y_lut[i];
            const float diff = ref - got;
            dot += ref * got;
            na += ref * ref;
            nb += got * got;
            noise += diff * diff;
            if (fabsf(diff) > max_err) {
                max_err = fabsf(diff);
            }
        }

        /* The 1e-10f terms keep both ratios finite for all-zero vectors. */
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
        float snr = 10.0f * log10f(na / (noise + 1e-10f));

        printf(" Cosine similarity: %.6f\n", cosine);
        printf(" SNR: %.1f dB\n", snr);
        /* sqrt(mean(ref^2)) is the RMS of the reference, so label it as such. */
        printf(" Max abs error: %.4f (ref RMS: %.4f)\n", max_err, sqrtf(na / (float)rows));

        /* Show first few values */
        printf(" First 5 values:\n");
        for (size_t i = 0; i < 5 && i < n_rows; i++) {
            printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
        }
    }

cleanup:
    /* Library free functions are only called on non-NULL handles, since
     * whether they accept NULL is not visible from this file. */
    if (M) {
        lum_free(M);
    }
    if (x) {
        lut_free(x);
    }
    if (y) {
        lut_free(y);
    }
    free(M_float);
    free(x_float);
    free(y_ref);
    free(y_lut);
}
/*
 * Test driver: prints a roundtrip-accuracy table, runs the matvec
 * correctness check on three shapes, then prints a timing table for a set of
 * matrix shapes / plane counts. Always returns 0.
 */
int main() {
    /* Time-based seed for whatever draws from rand() before the correctness
     * tests, which reseed with a fixed value themselves. */
    srand(time(NULL));
    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy for even plane counts 4..12, bias = planes / 2 */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    int planes = 4;
    while (planes <= 12) {
        const int half = planes / 2;
        const AccuracyResult acc = lut_accuracy_test(4096, planes, half);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               planes, half, acc.cosine_sim, acc.mean_error, acc.max_error, acc.snr_db);
        planes += 2;
    }

    /* 2. Matvec correctness: 7 planes, bias 3, three shapes */
    const int shapes[][2] = {
        {64, 256},
        {256, 1024},
        {512, 2560},   /* Qwen3-4B hidden dim */
    };
    const int n_shapes = sizeof(shapes) / sizeof(shapes[0]);
    for (int s = 0; s < n_shapes; s++) {
        test_matvec_correctness(shapes[s][0], shapes[s][1], 7, 3);
    }

    /* 3. Speed benchmarks - various configurations.
     * NOTE(review): the thread count in the heading is a literal; nothing in
     * this function sets or queries it. */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    struct bench_case {
        int rows;
        int cols;
        int wp;             /* passed as w_planes */
        int xp;             /* passed as x_planes */
        int bias;
        const char *label;  /* printed in brackets at the end of the row */
    };
    const struct bench_case cases[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    const struct bench_case *const last = cases + sizeof(cases) / sizeof(cases[0]);

    for (const struct bench_case *bc = cases; bc != last; ++bc) {
        const int iters = 3;
        const BenchResult timing =
            lum_bench_matvec(bc->rows, bc->cols, bc->wp, bc->xp, bc->bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
               bc->rows, bc->cols,
               bc->wp, bc->xp, bc->bias,
               timing.wall_time_s * 1000,
               timing.elements_per_sec / 1e6,
               timing.gops,
               bc->label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}