/*
* Log-Unary Tensor Tests
* Benchmarks accuracy and speed of native base-1 log-encoded tensors
*/
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
/*
 * Forward declarations for the log-unary tensor library.
 * Nothing below is defined in this file; the definitions are expected to be
 * supplied at link time.
 */

/* Opaque handles: only pointers to these are used here. */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

/*
 * Returned by value from lum_bench_matvec(), so the field order and types
 * must stay in step with the library's own definition.
 * NOTE(review): units are presumed from the field names - confirm against
 * the library.
 */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;
    double elements_per_sec;
    double gops;
} BenchResult;

/*
 * Returned by value from lut_accuracy_test(); same layout caveat as
 * BenchResult.
 */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;

/* Allocation and release of tensors and matrices. */
LogUnaryTensor *lut_alloc(int dim, int n_planes, int bias);
LogUnaryMatrix *lum_alloc(int rows, int cols, int n_planes, int bias);
void lut_free(LogUnaryTensor *t);
void lum_free(LogUnaryMatrix *m);

/* Conversion between float buffers and the library's representation. */
void lut_from_float(LogUnaryTensor *t, const float *x);
void lut_to_float(const LogUnaryTensor *t, float *out);
void lum_from_float(LogUnaryMatrix *m, const float *data);

/* Tensor operations. */
void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x,
                LogUnaryTensor *y);
void lut_rmsnorm(const LogUnaryTensor *x, const float *weight,
                 LogUnaryTensor *out, float eps);
void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up,
                  LogUnaryTensor *out);

/* Benchmark and accuracy entry points provided by the library. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes,
                             int bias, int iters);
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);
/*
 * Draw one sample using the Box-Muller transform on two rand() draws.
 *
 * The +1 offset keeps u1 strictly above zero so logf() never receives 0.
 * The offset is added in double rather than int: when RAND_MAX == INT_MAX,
 * `rand() + 1` overflows a signed int (undefined behaviour) whenever rand()
 * returns RAND_MAX.
 */
static float gaussian_sample(void) {
    double u1 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    double u2 = ((double)rand() + 1.0) / ((double)RAND_MAX + 1.0);
    return sqrtf(-2.0f * logf((float)u1)) * cosf(6.2832f * (float)u2);
}

/*
 * Print cosine similarity, SNR, max absolute error and the first few
 * element pairs for a reference vector and a library-produced vector.
 * Both arrays must hold `rows` floats; `rows` must be positive.
 */
static void report_matvec_accuracy(const float *y_ref, const float *y_lut, int rows) {
    float dot = 0, na = 0, nb = 0, noise = 0, max_err = 0;

    for (int i = 0; i < rows; i++) {
        float diff = y_ref[i] - y_lut[i];
        float abs_diff = fabsf(diff);

        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];
        noise += diff * diff;
        if (abs_diff > max_err) {
            max_err = abs_diff;
        }
    }

    /* The small epsilons guard against division by zero for all-zero vectors. */
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf(" Cosine similarity: %.6f\n", cosine);
    printf(" SNR: %.1f dB\n", snr);
    /* sqrtf(na / rows) is the RMS of the reference, so it is labelled as such. */
    printf(" Max abs error: %.4f (ref RMS: %.4f)\n", max_err, sqrtf(na / rows));

    /* Show first few values */
    printf(" First 5 values:\n");
    for (int i = 0; i < 5 && i < rows; i++) {
        printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
    }
}

/*
 * Test matvec correctness against a float reference.
 *
 * Fills a rows x cols matrix and a cols-long vector with pseudo-random
 * Gaussian values (fixed seed 42, so the data is identical on every call),
 * computes y = M * x in plain float arithmetic, runs the same product through
 * lum_matvec() using `planes` planes and the given `bias` for every operand,
 * and prints accuracy metrics comparing the two results.
 *
 * Note: calling this reseeds the global rand() state.
 * On invalid dimensions or allocation failure a message is written to stderr
 * and the function returns without running the comparison.
 */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    if (rows <= 0 || cols <= 0) {
        fprintf(stderr, "test_matvec_correctness: invalid dimensions %dx%d\n", rows, cols);
        return;
    }

    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;
    const size_t n_elems = n_rows * n_cols;

    /* Random float matrix and vector, plus reference and library outputs */
    float *M_float = malloc(n_elems * sizeof *M_float);
    float *x_float = malloc(n_cols * sizeof *x_float);
    float *y_ref = calloc(n_rows, sizeof *y_ref);
    float *y_lut = malloc(n_rows * sizeof *y_lut);
    LogUnaryMatrix *M = NULL;
    LogUnaryTensor *x = NULL;
    LogUnaryTensor *y = NULL;

    if (!M_float || !x_float || !y_ref || !y_lut) {
        fprintf(stderr, "test_matvec_correctness: out of memory\n");
        goto cleanup;
    }

    srand(42);
    for (size_t i = 0; i < n_elems; i++) {
        M_float[i] = gaussian_sample();
    }
    for (size_t j = 0; j < n_cols; j++) {
        x_float[j] = gaussian_sample();
    }

    /* Float reference matmul (y_ref starts zeroed by calloc) */
    for (size_t i = 0; i < n_rows; i++) {
        const float *row = M_float + i * n_cols;
        for (size_t j = 0; j < n_cols; j++) {
            y_ref[i] += row[j] * x_float[j];
        }
    }

    /* Log-unary matmul */
    M = lum_alloc(rows, cols, planes, bias);
    x = lut_alloc(cols, planes, bias);
    y = lut_alloc(rows, planes, bias);
    if (!M || !x || !y) {
        fprintf(stderr, "test_matvec_correctness: library allocation failed\n");
        goto cleanup;
    }

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare */
    report_matvec_accuracy(y_ref, y_lut, rows);

cleanup:
    /* The library's free functions are not known to accept NULL, so guard them. */
    if (M) {
        lum_free(M);
    }
    if (x) {
        lut_free(x);
    }
    if (y) {
        lut_free(y);
    }
    free(M_float);
    free(x_float);
    free(y_ref);
    free(y_lut);
}
/*
 * Test driver. Runs three sections in order and prints their results:
 *   1. round-trip accuracy for plane counts 4, 6, ..., 12 with bias = planes/2,
 *   2. matvec correctness against a float reference at three sizes,
 *   3. matvec speed benchmarks over a fixed table of configurations.
 * Always returns 0.
 */
int main(void) {
    /*
     * Time-based seed. test_matvec_correctness() reseeds with a fixed value,
     * so this only influences rand() draws made before that call.
     */
    srand((unsigned)time(NULL));
    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;
        AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3); /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    /* NOTE(review): the "16 threads" in the banner is hard-coded text; nothing
     * in this file sets or queries a thread count - confirm against the library. */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    static const struct {
        int rows;
        int cols;
        int wp;   /* passed as w_planes */
        int xp;   /* passed as x_planes */
        int bias;
        const char *label;
    } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    const size_t n_configs = sizeof configs / sizeof configs[0];
    const int iters = 3;

    for (size_t c = 0; c < n_configs; c++) {
        BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        /* NOTE(review): wall_time_s is printed under "ms/call"; whether the
         * library reports per-call or total time over `iters` is not visible
         * here - confirm. */
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n",
               configs[c].rows, configs[c].cols,
               configs[c].wp, configs[c].xp, configs[c].bias,
               r.wall_time_s * 1000,
               r.elements_per_sec / 1e6,
               r.gops,
               configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}
|