/*
 * Log-Unary Tensor Tests
 * Benchmarks accuracy and speed of native base-1 log-encoded tensors
 */

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Forward declarations from library */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

typedef struct {
    double total_and_ops, total_popcount_ops, wall_time_s, elements_per_sec, gops;
} BenchResult;

typedef struct {
    float max_error, mean_error, cosine_sim, snr_db;
} AccuracyResult;

extern LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias);
extern LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias);
extern void lut_free(LogUnaryTensor *t);
extern void lum_free(LogUnaryMatrix *m);
extern void lut_from_float(LogUnaryTensor *t, const float *x);
extern void lut_to_float(const LogUnaryTensor *t, float *out);
extern void lum_from_float(LogUnaryMatrix *m, const float *data);
extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);
extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);

/* Approximation of 2*pi used by the Box-Muller transform below. */
static const float TWO_PI_APPROX = 6.2832f;

/*
 * Draw one uniform sample in (0, 1] from rand().
 *
 * The +1 keeps the result strictly positive so logf() never sees zero.
 * The addition is done in double because rand() + 1 overflows int
 * (undefined behaviour) when rand() returns RAND_MAX and RAND_MAX == INT_MAX.
 */
static float uniform_positive(void)
{
    return (float)((double)rand() + 1.0) / (RAND_MAX + 1.0f);
}

/*
 * Draw one sample using the cosine branch of the Box-Muller transform.
 * Consumes exactly two rand() values, in the order u1 then u2.
 */
static float gaussian_sample(void)
{
    float u1 = uniform_positive();
    float u2 = uniform_positive();

    return sqrtf(-2.0f * logf(u1)) * cosf(TWO_PI_APPROX * u2);
}

/*
 * Test matvec correctness against float reference.
 *
 * Fills a rows x cols matrix and a cols-long vector with samples from
 * gaussian_sample() (seeded with 42), computes y = M * x in float, runs the
 * same product through lum_matvec(), and prints cosine similarity, SNR in dB,
 * the maximum absolute error and the first few output values.
 *
 * rows, cols: matrix dimensions; must both be positive.
 * planes, bias: passed unchanged to lum_alloc() / lut_alloc().
 *
 * On allocation failure a message is written to stderr and the test is
 * skipped; everything that was allocated is released before returning.
 */
static void test_matvec_correctness(int rows, int cols, int planes, int bias)
{
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n",
           rows, cols, planes, bias);

    /* Negative sizes would wrap to huge values once cast to size_t. */
    if (rows <= 0 || cols <= 0) {
        fprintf(stderr, "test_matvec_correctness: invalid size %dx%d\n", rows, cols);
        return;
    }

    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;
    const size_t n_elems = n_rows * n_cols;

    /* Random float matrix and vector */
    float *M_float = malloc(n_elems * sizeof *M_float);
    float *x_float = malloc(n_cols * sizeof *x_float);
    float *y_ref = calloc(n_rows, sizeof *y_ref); /* zeroed: used as accumulator */
    float *y_lut = malloc(n_rows * sizeof *y_lut);

    /* Library objects are created later but declared here so the cleanup
     * path can always inspect them. */
    LogUnaryMatrix *M = NULL;
    LogUnaryTensor *x = NULL;
    LogUnaryTensor *y = NULL;

    if (!M_float || !x_float || !y_ref || !y_lut) {
        fprintf(stderr, "test_matvec_correctness: out of memory for float buffers\n");
        goto cleanup;
    }

    /* Fixed seed so the generated inputs are reproducible between runs. */
    srand(42);
    for (size_t i = 0; i < n_elems; i++) {
        M_float[i] = gaussian_sample();
    }
    for (size_t j = 0; j < n_cols; j++) {
        x_float[j] = gaussian_sample();
    }

    /* Float reference matmul */
    for (size_t i = 0; i < n_rows; i++) {
        const float *row = M_float + i * n_cols;
        for (size_t j = 0; j < n_cols; j++) {
            y_ref[i] += row[j] * x_float[j];
        }
    }

    /* Log-unary matmul */
    M = lum_alloc(rows, cols, planes, bias);
    x = lut_alloc(cols, planes, bias);
    y = lut_alloc(rows, planes, bias);
    if (!M || !x || !y) {
        fprintf(stderr, "test_matvec_correctness: log-unary allocation failed\n");
        goto cleanup;
    }

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare */
    float dot = 0, na = 0, nb = 0, max_err = 0;
    for (size_t i = 0; i < n_rows; i++) {
        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];

        float err = fabsf(y_ref[i] - y_lut[i]);
        if (err > max_err) {
            max_err = err;
        }
    }
    /* The small epsilon guards against division by zero for all-zero outputs. */
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);

    float noise = 0;
    for (size_t i = 0; i < n_rows; i++) {
        float e = y_ref[i] - y_lut[i];
        noise += e * e;
    }
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf(" Cosine similarity: %.6f\n", cosine);
    printf(" SNR: %.1f dB\n", snr);
    /* sqrtf(na / rows) is the RMS of the reference output, not its maximum. */
    printf(" Max abs error: %.4f (ref RMS: %.4f)\n", max_err, sqrtf(na / rows));

    /* Show first few values */
    printf(" First 5 values:\n");
    for (size_t i = 0; i < 5 && i < n_rows; i++) {
        printf(" ref=%.4f lut=%.4f err=%.4f\n",
               y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
    }

cleanup:
    /* Whether the library free functions accept NULL is not visible here,
     * so only pass them objects that were actually created. */
    if (M) {
        lum_free(M);
    }
    if (x) {
        lut_free(x);
    }
    if (y) {
        lut_free(y);
    }
    free(M_float);
    free(x_float);
    free(y_ref);
    free(y_lut);
}

/*
 * Test driver: prints a roundtrip accuracy table, runs the matvec
 * correctness checks, then prints a table of matvec speed benchmarks.
 * Always returns 0.
 */
int main(void)
{
    srand((unsigned)time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n",
           "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;
        AccuracyResult acc = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, acc.cosine_sim, acc.mean_error, acc.max_error, acc.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    struct bench_config {
        int rows;
        int cols;
        int wp;
        int xp;
        int bias;
        const char *label;
    };
    static const struct bench_config configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    const size_t n_configs = sizeof configs / sizeof configs[0];
    const int iters = 3;

    for (size_t c = 0; c < n_configs; c++) {
        const struct bench_config *cfg = &configs[c];
        BenchResult r = lum_bench_matvec(cfg->rows, cfg->cols, cfg->wp,
                                         cfg->xp, cfg->bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
               cfg->rows, cfg->cols, cfg->wp, cfg->xp, cfg->bias,
               r.wall_time_s * 1000, r.elements_per_sec / 1e6, r.gops,
               cfg->label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}