/*
 * Log-Unary Tensor Tests
 * Benchmarks accuracy and speed of native base-1 log-encoded tensors
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

/*
 * Library interface, declared by hand in this file.
 * Everything below must stay in sync with the library's own definitions.
 */

/* Opaque handles: this file only ever holds pointers to them. */
typedef struct LogUnaryTensor LogUnaryTensor;
typedef struct LogUnaryMatrix LogUnaryMatrix;

/* Value returned by lum_bench_matvec(). Field order is part of the ABI. */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;
    double elements_per_sec;
    double gops;
} BenchResult;

/* Value returned by lut_accuracy_test(). Field order is part of the ABI. */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;

/* Allocation and release of tensors / matrices. */
LogUnaryTensor *lut_alloc(int dim, int n_planes, int bias);
LogUnaryMatrix *lum_alloc(int rows, int cols, int n_planes, int bias);
void lut_free(LogUnaryTensor *t);
void lum_free(LogUnaryMatrix *m);

/* Conversion between float buffers and the log-unary representation. */
void lut_from_float(LogUnaryTensor *t, const float *x);
void lut_to_float(const LogUnaryTensor *t, float *out);
void lum_from_float(LogUnaryMatrix *m, const float *data);

/* Tensor operations. */
void lum_matvec(const LogUnaryMatrix *M, const LogUnaryTensor *x, LogUnaryTensor *y);
void lut_rmsnorm(const LogUnaryTensor *x, const float *weight, LogUnaryTensor *out, float eps);
void lut_silu_mul(const LogUnaryTensor *gate, const LogUnaryTensor *up, LogUnaryTensor *out);

/* Built-in benchmark and accuracy harnesses. */
BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters);
AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias);

/*
 * Draw one sample from a standard normal distribution using the Box-Muller
 * transform on two consecutive rand() draws.
 *
 * Each uniform input is mapped into (0, 1] so logf() never receives zero.
 * The "+ 1" is performed in double rather than int: rand() may return
 * RAND_MAX, and where RAND_MAX == INT_MAX (e.g. glibc) the int expression
 * rand() + 1 would overflow, which is undefined behaviour.
 */
static float sample_std_normal(void) {
    float u1 = (float)((double)rand() + 1.0) / (RAND_MAX + 1.0f);
    float u2 = (float)((double)rand() + 1.0) / (RAND_MAX + 1.0f);
    /* 6.2832f approximates 2*pi. */
    return sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
}

/*
 * Test matvec correctness against a float reference.
 *
 * Fills a rows x cols matrix and a cols-long vector with pseudo-random normal
 * values (fixed seed 42, so every call sees the same data), computes y = M*x
 * in plain float arithmetic, runs the same product through lum_matvec(), and
 * prints cosine similarity, SNR, max absolute error and the first few
 * output pairs to stdout.
 *
 * rows, cols : matrix shape; must both be positive, otherwise the test is
 *              skipped with a message on stderr.
 * planes     : plane count passed to lum_alloc()/lut_alloc() for M, x and y.
 * bias       : bias passed to lum_alloc()/lut_alloc() for M, x and y.
 *
 * Allocation failures are reported on stderr and the test is abandoned
 * after releasing whatever was already allocated.
 */
static void test_matvec_correctness(int rows, int cols, int planes, int bias) {
    printf("\n--- MATVEC CORRECTNESS: %dx%d, %d planes, bias=%d ---\n", rows, cols, planes, bias);

    /* Negative sizes would turn into huge size_t values below. */
    if (rows <= 0 || cols <= 0) {
        fprintf(stderr, "  skipped: rows and cols must be positive\n");
        return;
    }

    const size_t n_rows = (size_t)rows;
    const size_t n_cols = (size_t)cols;
    const size_t n_elems = n_rows * n_cols;

    /* Float-side buffers: inputs, reference output, decoded library output. */
    float *M_float = malloc(n_elems * sizeof *M_float);
    float *x_float = malloc(n_cols * sizeof *x_float);
    float *y_ref = calloc(n_rows, sizeof *y_ref);   /* zeroed: used as accumulator */
    float *y_lut = malloc(n_rows * sizeof *y_lut);

    /* Library-side objects; NULL until allocated so cleanup can test them. */
    LogUnaryMatrix *M = NULL;
    LogUnaryTensor *x = NULL;
    LogUnaryTensor *y = NULL;

    if (!M_float || !x_float || !y_ref || !y_lut) {
        fprintf(stderr, "  float buffer allocation failed\n");
        goto cleanup;
    }

    /* Fixed seed: matrix is generated first, then the vector. */
    srand(42);
    for (size_t i = 0; i < n_elems; i++) {
        M_float[i] = sample_std_normal();
    }
    for (size_t j = 0; j < n_cols; j++) {
        x_float[j] = sample_std_normal();
    }

    /* Float reference matmul (row-major M). */
    for (size_t i = 0; i < n_rows; i++) {
        for (size_t j = 0; j < n_cols; j++) {
            y_ref[i] += M_float[i * n_cols + j] * x_float[j];
        }
    }

    /* Log-unary matmul. */
    M = lum_alloc(rows, cols, planes, bias);
    x = lut_alloc(cols, planes, bias);
    y = lut_alloc(rows, planes, bias);
    /* NOTE(review): assumes the allocators report failure by returning NULL
     * -- confirm against the library. */
    if (!M || !x || !y) {
        fprintf(stderr, "  log-unary allocation failed\n");
        goto cleanup;
    }

    lum_from_float(M, M_float);
    lut_from_float(x, x_float);
    lum_matvec(M, x, y);
    lut_to_float(y, y_lut);

    /* Compare: one pass gathers everything needed for cosine, SNR, max error. */
    float dot = 0, na = 0, nb = 0, noise = 0, max_err = 0;
    for (size_t i = 0; i < n_rows; i++) {
        float diff = y_ref[i] - y_lut[i];
        float err = fabsf(diff);

        dot += y_ref[i] * y_lut[i];
        na += y_ref[i] * y_ref[i];
        nb += y_lut[i] * y_lut[i];
        noise += diff * diff;
        if (err > max_err) {
            max_err = err;
        }
    }

    /* The small epsilons keep the ratios finite for all-zero outputs. */
    float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float snr = 10.0f * log10f(na / (noise + 1e-10f));

    printf("  Cosine similarity: %.6f\n", cosine);
    printf("  SNR: %.1f dB\n", snr);
    /* sqrt(na / rows) is the RMS of the reference output, not its maximum. */
    printf("  Max abs error: %.4f (ref RMS: %.4f)\n", max_err, sqrtf(na / (float)rows));

    /* Show first few values */
    printf("  First 5 values:\n");
    for (size_t i = 0; i < 5 && i < n_rows; i++) {
        printf("    ref=%.4f  lut=%.4f  err=%.4f\n", y_ref[i], y_lut[i], y_ref[i] - y_lut[i]);
    }

cleanup:
    /* Library free functions are only called on objects that exist, since
     * their behaviour on NULL is not visible from this file. */
    if (M) {
        lum_free(M);
    }
    if (x) {
        lut_free(x);
    }
    if (y) {
        lut_free(y);
    }
    free(M_float);
    free(x_float);
    free(y_ref);
    free(y_lut);
}

/*
 * Test driver. Runs, in order:
 *   1. lut_accuracy_test() round-trips at dim=4096 for 4..12 planes,
 *   2. test_matvec_correctness() on three matrix shapes,
 *   3. lum_bench_matvec() over a table of layer shapes and plane counts.
 * All results go to stdout. Always returns 0.
 */
int main(void) {
    /* Truncating time_t to unsigned is fine for a seed. Note that
     * test_matvec_correctness() reseeds with a fixed value, so this seed only
     * matters to code that calls rand() before then. */
    srand((unsigned)time(NULL));

    printf("=== LOG-UNARY TENSOR LIBRARY TESTS ===\n");

    /* 1. Roundtrip accuracy at different plane counts */
    printf("\n--- ROUNDTRIP ACCURACY (dim=4096, bias=planes/2) ---\n");
    printf("%6s %6s %8s %8s %10s %8s\n", "Planes", "Bias", "Cosine", "MeanErr", "MaxErr", "SNR_dB");
    for (int np = 4; np <= 12; np += 2) {
        const int bias = np / 2;
        const AccuracyResult r = lut_accuracy_test(4096, np, bias);
        printf("%6d %6d %8.6f %8.5f %10.5f %8.1f\n",
               np, bias, r.cosine_sim, r.mean_error, r.max_error, r.snr_db);
    }

    /* 2. Matvec correctness */
    test_matvec_correctness(64, 256, 7, 3);
    test_matvec_correctness(256, 1024, 7, 3);
    test_matvec_correctness(512, 2560, 7, 3);  /* Qwen3-4B hidden dim */

    /* 3. Speed benchmarks - various configurations */
    /* NOTE(review): the "16 threads" in the banner is a fixed string; nothing
     * in this file sets or queries a thread count. */
    printf("\n--- SPEED BENCHMARKS (16 threads) ---\n");
    printf("%10s %6s %6s %6s %10s %10s %10s\n",
           "Size", "WP", "XP", "Bias", "ms/call", "Melem/s", "GOps/s");

    /* wp / xp are the weight and activation plane counts handed to
     * lum_bench_matvec() as w_planes / x_planes. */
    static const struct {
        int rows;
        int cols;
        int wp;
        int xp;
        int bias;
        const char *label;
    } configs[] = {
        /* Qwen3-4B attention: hidden=2560, heads*dim=4096 */
        {4096, 2560, 7, 4, 3, "q_proj"},
        {4096, 2560, 7, 7, 3, "q_proj_7x7"},
        {1024, 2560, 7, 4, 3, "k_proj"},
        /* Qwen3-4B MLP: inter=9728 */
        {9728, 2560, 7, 4, 3, "gate_proj"},
        {2560, 9728, 7, 4, 3, "down_proj"},
        /* Different plane counts */
        {4096, 2560, 4, 4, 2, "4x4"},
        {4096, 2560, 8, 8, 4, "8x8"},
        {4096, 2560, 10, 6, 3, "10x6"},
    };
    const size_t n_configs = sizeof configs / sizeof configs[0];
    const int iters = 3;  /* iteration count passed to every benchmark run */

    for (size_t c = 0; c < n_configs; c++) {
        const BenchResult r = lum_bench_matvec(
            configs[c].rows, configs[c].cols,
            configs[c].wp, configs[c].xp, configs[c].bias, iters);
        printf("%5dx%-5d %4dw %4dx %4db %8.1fms %8.1fM %8.1fG  [%s]\n",
               configs[c].rows, configs[c].cols,
               configs[c].wp, configs[c].xp, configs[c].bias,
               r.wall_time_s * 1000,
               r.elements_per_sec / 1e6,
               r.gops,
               configs[c].label);
    }

    printf("\n=== DONE ===\n");
    return 0;
}