File size: 6,193 Bytes
19ed98b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
/*
 * Log-Unary Tensor Tests
 * Benchmarks accuracy and speed of native base-1 log-encoded tensors
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

/*
 * Declarations mirrored from the log-unary library so this test can be built
 * without including a library header.
 * NOTE(review): the two result structs are returned by value, so their member
 * order and types presumably have to match the library's own definitions
 * exactly - confirm against the library source whenever either side changes.
 */

/* Opaque handles: only pointers to these types are used in this file. */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

/* Result returned by lum_bench_matvec(). */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;
    double elements_per_sec;
    double gops;
} BenchResult;

/* Result returned by lut_accuracy_test(). */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;

/* Allocation and release of tensors and matrices. */
LogUnaryTensor *lut_alloc(int dim, int n_planes, int bias);
LogUnaryMatrix *lum_alloc(int rows, int cols, int n_planes, int bias);
void lut_free(LogUnaryTensor *t);
void lum_free(LogUnaryMatrix *m);

/* Conversion between float buffers and the encoded representation. */
void lut_from_float(LogUnaryTensor *t, const float *x);
void lut_to_float(const LogUnaryTensor *t, float *out);
void lum_from_float(LogUnaryMatrix *m, const float *data);

/* Tensor operations. */
void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);

/* Built-in benchmark and accuracy harnesses provided by the library. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);

/* Test matvec correctness against float reference */
/* Draw one float via the Box-Muller transform.
 * Consumes exactly two rand() values: u1 first, then u2. */
static float matvec_test_gauss(void) {
    /* Add 1 in double rather than int: rand() + 1 is signed overflow (UB)
     * whenever rand() returns RAND_MAX and RAND_MAX == INT_MAX. The shift
     * maps the samples into (0, 1], which keeps logf() away from 0. */
    const float scale = RAND_MAX + 1.0f;
    float u1 = (float)((double)rand() + 1.0) / scale;
    float u2 = (float)((double)rand() + 1.0) / scale;
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/*
 * Test matvec correctness against float reference.
 *
 * Fills a rows x cols matrix and a cols-long vector with pseudo-random values
 * (fixed seed 42, so every call sees the same data), computes the product in
 * plain float arithmetic, runs the same inputs through lum_matvec(), and
 * prints cosine similarity, SNR, max absolute error and the first few outputs.
 *
 * rows, cols  matrix dimensions; must be positive, otherwise the test is
 *             skipped with a message on stderr
 * planes      plane count passed to lum_alloc()/lut_alloc() for M, x and y
 * bias        bias passed to lum_alloc()/lut_alloc() for M, x and y
 *
 * On allocation failure a message is written to stderr and the function
 * returns without running the comparison.
 */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    if (rows <= 0 || cols <= 0) {
        fprintf(stderr, "  skipped: rows and cols must be positive\n");
        return;
    }

    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;
    const size_t n_elems = n_rows * n_cols;

    /* Random float matrix and vector, plus the two output buffers */
    float *M_float = malloc(n_elems * sizeof *M_float);
    float *x_float = malloc(n_cols * sizeof *x_float);
    float *y_ref = calloc(n_rows, sizeof *y_ref);
    float *y_lut = malloc(n_rows * sizeof *y_lut);
    LogUnaryMatrix *M = NULL;
    LogUnaryTensor *x = NULL;
    LogUnaryTensor *y = NULL;

    if (!M_float || !x_float || !y_ref || !y_lut) {
        fprintf(stderr, "  out of memory allocating float buffers\n");
        goto cleanup;
    }

    /* Fixed seed: note this overrides any seed set earlier by the caller. */
    srand(42);
    for (size_t i = 0; i < n_elems; i++)
        M_float[i] = matvec_test_gauss();
    for (size_t j = 0; j < n_cols; j++)
        x_float[j] = matvec_test_gauss();

    /* Float reference matmul (y_ref starts zeroed by calloc) */
    for (size_t i = 0; i < n_rows; i++) {
        const float *row = M_float + i * n_cols;
        for (size_t j = 0; j < n_cols; j++)
            y_ref[i] += row[j] * x_float[j];
    }

    /* Log-unary matmul */
    M = lum_alloc(rows, cols, planes, bias);
    x = lut_alloc(cols, planes, bias);
    y = lut_alloc(rows, planes, bias);
    /* Assumes the library signals allocation failure by returning NULL. */
    if (!M || !x || !y) {
        fprintf(stderr, "  failed to allocate log-unary matrix/tensors\n");
        goto cleanup;
    }

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare: accumulate dot product, squared norms, squared error and the
     * largest absolute error in a single pass. */
    float dot = 0, na = 0, nb = 0, noise = 0, max_err = 0;
    for (size_t i = 0; i < n_rows; i++) {
        float diff = y_ref[i] - y_lut[i];
        float err = fabsf(diff);
        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];
        noise += diff * diff;
        if (err > max_err) max_err = err;
    }
    /* Small epsilons keep the ratios finite when a norm or the noise is 0. */
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf("  Cosine similarity: %.6f\n", cosine);
    printf("  SNR: %.1f dB\n", snr);
    /* The scale printed next to the max error is the RMS of the reference
     * output, so it is labelled as such. */
    printf("  Max abs error: %.4f (ref RMS: %.4f)\n", max_err, sqrtf(na / rows));

    /* Show first few values */
    printf("  First 5 values:\n");
    for (int i = 0; i < 5 && i < rows; i++)
        printf("    ref=%.4f  lut=%.4f  err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);

cleanup:
    /* Library free functions are only called on non-NULL handles because
     * their NULL behaviour is not visible from this file. */
    if (M) lum_free(M);
    if (x) lut_free(x);
    if (y) lut_free(y);
    free(M_float);
    free(x_float);
    free(y_ref);
    free(y_lut);
}

/*
 * Test driver: prints a roundtrip-accuracy table, runs the matvec correctness
 * checks at three sizes, then prints a speed table for a fixed list of
 * matrix shapes and plane counts. Always returns 0.
 */
int main(void) {
    /* Time-based seed. test_matvec_correctness() re-seeds with a fixed value,
     * so this only affects rand() use that happens before the first call. */
    srand((unsigned)time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        int bias = np / 2;
        AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3);  /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    /* NOTE(review): the "16 threads" in this header is a fixed string; nothing
     * in this file sets or queries a thread count - confirm it matches how the
     * library is actually built and run. */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    static const struct {
        int rows;
        int cols;
        int wp;    /* passed as w_planes */
        int xp;    /* passed as x_planes */
        int bias;
        const char *label;
    } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    const size_t n_configs = sizeof configs / sizeof configs[0];
    const int iters = 3;

    for (size_t c = 0; c < n_configs; c++) {
        BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        /* NOTE(review): the column is headed "ms/call"; this assumes
         * wall_time_s is already a per-call time - confirm in the library. */
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG  [%s]\n",
               configs[c].rows, configs[c].cols,
               configs[c].wp, configs[c].xp, configs[c].bias,
               r.wall_time_s * 1000,
               r.elements_per_sec / 1e6,
               r.gops,
               configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}