| | |
| | |
| | |
| | |
| | #include <stdio.h> |
| | #include <stdlib.h> |
| | #include <math.h> |
| | #include <time.h> |
| |
|
| | |
/*
 * Local redeclarations of the library's types.
 *
 * The tensor and matrix types are only ever handled through pointers in this
 * file, so they are left incomplete here.
 */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

/*
 * Value returned by lum_bench_matvec().
 * NOTE(review): member order and types presumably have to match the
 * library's own definition exactly, since the struct is returned by value
 * across translation units -- confirm against the library header.
 */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;       /* printed by main() as milliseconds (x1000) */
    double elements_per_sec;  /* printed by main() divided by 1e6 */
    double gops;
} BenchResult;

/*
 * Value returned by lut_accuracy_test().
 * NOTE(review): same layout caveat as BenchResult.
 */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;
| |
|
| | extern LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias); |
| | extern LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias); |
| | extern void lut_free(LogUnaryTensor *t); |
| | extern void lum_free(LogUnaryMatrix *m); |
| | extern void lut_from_float(LogUnaryTensor *t, const float *x); |
| | extern void lut_to_float(const LogUnaryTensor *t, float *out); |
| | extern void lum_from_float(LogUnaryMatrix *m, const float *data); |
| | extern void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y); |
| | extern void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps); |
| | extern void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out); |
| | extern BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters); |
| | extern AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias); |
| |
|
| | |
| | static void test_matvec_correctness(int rows, int cols, int planes, int bias) { |
| | printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias); |
| |
|
| | |
| | float *M_float = (float *)malloc((size_t)rows * cols * sizeof(float)); |
| | float *x_float = (float *)malloc(cols * sizeof(float)); |
| | float *y_ref = (float *)calloc(rows, sizeof(float)); |
| | float *y_lut = (float *)malloc(rows * sizeof(float)); |
| |
|
| | srand(42); |
| | for (size_t i = 0; i < (size_t)rows * cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | M_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | x_float[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| |
|
| | |
| | for (int i = 0; i < rows; i++) |
| | for (int j = 0; j < cols; j++) |
| | y_ref[i] += M_float[(size_t)i * cols + j] * x_float[j]; |
| |
|
| | |
| | LogUnaryMatrix *M = lum_alloc(rows, cols, planes, bias); |
| | LogUnaryTensor *x = lut_alloc(cols, planes, bias); |
| | LogUnaryTensor *y = lut_alloc(rows, planes, bias); |
| |
|
| | lum_from_float(M, M_float); |
| | lut_from_float(x, x_float); |
| | lum_matvec(M, x, y); |
| | lut_to_float(y, y_lut); |
| |
|
| | |
| | float dot = 0, na = 0, nb = 0, max_err = 0; |
| | for (int i = 0; i < rows; i++) { |
| | dot += y_ref[i] * y_lut[i]; |
| | na += y_ref[i] * y_ref[i]; |
| | nb += y_lut[i] * y_lut[i]; |
| | float err = fabsf(y_ref[i] - y_lut[i]); |
| | if (err > max_err) max_err = err; |
| | } |
| | float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| |
|
| | float noise = 0; |
| | for (int i = 0; i < rows; i++) { |
| | float e = y_ref[i] - y_lut[i]; |
| | noise += e * e; |
| | } |
| | float snr = 10.0f * log10f(na / (noise + 1e-10f)); |
| |
|
| | printf(" Cosine similarity: %.6f\n", cosine); |
| | printf(" SNR: %.1f dB\n", snr); |
| | printf(" Max abs error: %.4f (ref max: %.4f)\n", max_err, sqrtf(na / rows)); |
| |
|
| | |
| | printf(" First 5 values:\n"); |
| | for (int i = 0; i < 5 && i < rows; i++) |
| | printf(" ref=%.4f lut=%.4f err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]); |
| |
|
| | lum_free(M); lut_free(x); lut_free(y); |
| | free(M_float); free(x_float); free(y_ref); free(y_lut); |
| | } |
| |
|
| | int main() { |
| | srand(time(NULL)); |
| |
|
| | printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n"); |
| |
|
| | |
| | printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n"); |
| | printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB"); |
| | for (int np = 4; np <= 12; np += 2) { |
| | int bias = np / 2; |
| | AccuracyResult r = lut_accuracy_test(4096, np, bias); |
| | printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n", |
| | np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db); |
| | } |
| |
|
| | |
| | test_matvec_correctness(64, 256, 7, 3); |
| | test_matvec_correctness(256, 1024, 7, 3); |
| | test_matvec_correctness(512, 2560, 7, 3); |
| |
|
| | |
| | printf("\n--- SPEED BENCHMARKS (16 threads) ---\n"); |
| | printf("%10s %6s %6s %6s %10s %10s %10s\n", |
| | "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s"); |
| |
|
| | struct { int rows; int cols; int wp; int xp; int bias; const char *label; } configs[] = { |
| | |
| | {4096, 2560, 7, 4, 3, "q_proj"}, |
| | {4096, 2560, 7, 7, 3, "q_proj_7x7"}, |
| | {1024, 2560, 7, 4, 3, "k_proj"}, |
| | |
| | {9728, 2560, 7, 4, 3, "gate_proj"}, |
| | {2560, 9728, 7, 4, 3, "down_proj"}, |
| | |
| | {4096, 2560, 4, 4, 2, "4x4"}, |
| | {4096, 2560, 8, 8, 4, "8x8"}, |
| | {4096, 2560, 10, 6, 3, "10x6"}, |
| | }; |
| | int n_configs = sizeof(configs) / sizeof(configs[0]); |
| |
|
| | for (int c = 0; c < n_configs; c++) { |
| | int iters = 3; |
| | BenchResult r = lum_bench_matvec( |
| | configs[c].rows, configs[c].cols, |
| | configs[c].wp, configs[c].xp, configs[c].bias, iters); |
| | printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG [%s]\n", |
| | configs[c].rows, configs[c].cols, |
| | configs[c].wp, configs[c].xp, configs[c].bias, |
| | r.wall_time_s * 1000, |
| | r.elements_per_sec / 1e6, |
| | r.gops, |
| | configs[c].label); |
| | } |
| |
|
| | printf("\n=== DONE ===\n"); |
| | return 0; |
| | } |
| |
|