/*
 * TRUE UNARY TENSOR LIBRARY — BASE 1 ARITHMETIC
 *
 * Representation:
 *   A value of magnitude M is stored as M consecutive 1-bits.
 *   The number IS the count of ones.
 *   Every bit has weight exactly 1.
 *
 * For a vector element quantized to integer range [-K, K]:
 *   sign:      1 bit (0=positive, 1=negative)
 *   magnitude: K bit positions, first |value| are 1, rest are 0
 *
 * Storage layout for a vector of dim D with max magnitude K:
 *   sign:   uint64[(D+63)/64]           — one sign bit per element
 *   unary:  uint64[K * (D+63)/64]       — K bitplanes across D elements
 *            Plane p has bit j set iff |element_j| > p
 *            (thermometer = true unary in bitplane form)
 *
 * Multiplication: w * x = popcount of ones(w) matched with ones(x)
 *   Since every bit = 1, the dot product is JUST COUNTING.
 *   No weights, no shifts, no corrections.
 *   sum_j w_j*x_j = sum_p sum_q sum_j [w_plane_p_j AND x_plane_q_j]
 *                 = sum_p sum_q popcount(W_row_plane_p AND X_plane_q)
 *
 * YES this uses more memory. A 2560-dim vector with K=32 uses:
 *   32 * 2560 / 8 = 10 KB per vector (vs 5KB for FP16)
 *   But the MATH IS EXACT (to quantization level).
 *
 * (c) 2026 OpenTransformers Ltd / Scott Bisset
 */

#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

/* ============================================================
 * TRUE UNARY VECTOR
 * ============================================================ */
typedef struct {
    uint64_t *sign;     /* [chunks] — 1 bit per element */
    uint64_t *unary;    /* [K * chunks] — K bitplanes, each bit = weight 1 */
    float     scale;    /* float scale: real_value = sign * count * scale */
    int       dim;
    int       chunks;   /* (dim+63)/64 */
    int       K;        /* max magnitude = number of unary bitplanes */
} TrueUnaryVec;

/* TRUE UNARY MATRIX — row-major */
typedef struct {
    uint64_t *sign;     /* [rows * chunks] */
    uint64_t *unary;    /* [K * rows * chunks] — plane p, row i at [p*rows*chunks + i*chunks] */
    float    *scales;   /* [rows] — per-row scale factors */
    int       rows;
    int       cols;
    int       chunks;   /* (cols+63)/64 */
    int       K;        /* max magnitude per element */
} TrueUnaryMat;

/* ============================================================
 * ALLOCATION
 * ============================================================ */
TrueUnaryVec* tuv_alloc(int dim, int K) {
    TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec));
    v->dim = dim;
    v->K = K;
    v->chunks = (dim + 63) / 64;
    v->scale = 1.0f;
    v->sign  = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
    v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t));
    memset(v->sign, 0, v->chunks * sizeof(uint64_t));
    memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t));
    return v;
}

TrueUnaryMat* tum_alloc(int rows, int cols, int K) {
    TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat));
    m->rows = rows;
    m->cols = cols;
    m->K = K;
    m->chunks = (cols + 63) / 64;
    m->sign   = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
    m->unary  = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
    m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
    memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
    memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
    for (int i = 0; i < rows; i++) m->scales[i] = 1.0f;
    return m;
}

void tuv_free(TrueUnaryVec *v) {
    if (v) { free(v->sign); free(v->unary); free(v); }
}
void tum_free(TrueUnaryMat *m) {
    if (m) { free(m->sign); free(m->unary); free(m->scales); free(m); }
}

/* ============================================================
 * FLOAT → TRUE UNARY
 *
 * Quantize: integer_val = round(float_val / scale * K)
 * Then store |integer_val| as that many 1-bits.
 *
 * For vector: single global scale = absmax / K
 * For matrix: per-row scale = row_absmax / K
 * ============================================================ */
void tuv_from_float(TrueUnaryVec *v, const float *x) {
    int dim = v->dim, K = v->K, chunks = v->chunks;

    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t));

    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) { v->scale = 1.0f; return; }
    v->scale = amax / K;

    float inv = K / amax;
    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) v->sign[c] |= bit;

        int mag = (int)(fabsf(x[i]) * inv + 0.5f);
        if (mag > K) mag = K;

        /* TRUE UNARY: set planes 0 through mag-1 */
        for (int p = 0; p < mag; p++)
            v->unary[(size_t)p * chunks + c] |= bit;
    }
}

void tuv_to_float(const TrueUnaryVec *v, float *out) {
    int dim = v->dim, K = v->K, chunks = v->chunks;

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Count set planes = magnitude in base-1 */
        int mag = 0;
        for (int p = 0; p < K; p++) {
            if (v->unary[(size_t)p * chunks + c] & bit)
                mag++;
        }

        float val = (float)mag * v->scale;
        out[i] = (v->sign[c] & bit) ? -val : val;
    }
}

void tum_from_float(TrueUnaryMat *m, const float *data) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;

        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
        m->scales[r] = amax / K;
        float inv = K / amax;

        uint64_t *row_sign = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);

            if (row[j] < 0.0f) row_sign[c] |= bit;

            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
            if (mag > K) mag = K;

            for (int p = 0; p < mag; p++)
                m->unary[((size_t)p * rows + r) * chunks + c] |= bit;
        }
    }
}

/* ============================================================
 * TRUE UNARY MATVEC: y = M @ x
 *
 * THE CORE OPERATION.
 *
 * For each output element y[i]:
 *   For each pair of planes (p from weight, q from activation):
 *     active = w_plane_p[i] AND x_plane_q
 *     same   = active AND ~(w_sign[i] XOR x_sign)
 *     diff   = active AND  (w_sign[i] XOR x_sign)
 *     acc += popcount(same) - popcount(diff)
 *
 *   EVERY PLANE PAIR HAS WEIGHT = 1.
 *   No shifts. No scaling between planes. No corrections.
 *   The count IS the answer.
 *
 *   y[i] = acc * w_scale[i] * x_scale
 *   (single float multiply at the very end)
 *
 * ============================================================ */
void tum_matvec(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out   /* float output, requantize externally if needed */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->K;

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            /*
             * PURE BASE-1: every plane pair contributes weight 1.
             * acc += popcount(w_plane AND x_plane AND same_sign)
             *      - popcount(w_plane AND x_plane AND diff_sign)
             */
            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->unary[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    acc += __builtin_popcountll(active & same)
                         - __builtin_popcountll(active & diff);
                }
            }
        }

        /* Single float rescale per output element */
        y_out[i] = (float)acc * M->scales[i] * x->scale;
    }
}

/* ============================================================
 * OPTIMIZED MATVEC: collapse x planes first
 *
 * Instead of iterating wK * xK plane pairs per chunk,
 * precompute per-chunk activation sums:
 *   x_mag_same[c] = sum_q popcount(x_plane_q[c] AND same_sign[c])
 *   x_mag_diff[c] = sum_q popcount(x_plane_q[c] AND diff_sign[c])
 *
 * Then for each weight plane p:
 *   This doesn't directly simplify because we need AND with wp first.
 *
 * ALTERNATIVE: precompute per-element x magnitudes in unary,
 * then the dot product is just: sum_j w_mag_j * x_mag_j * sign_j
 *
 * For now: provide both the naive and a vertically-accumulated variant.
 *
 * VERTICAL ACCUMULATE: sum all weight planes into a per-element
 * count, then multiply by x count. Reduces from O(wK*xK*chunks)
 * to O((wK+xK)*chunks + dim).
 * ============================================================ */
void tum_matvec_fast(
    const TrueUnaryMat *M,
    const TrueUnaryVec *x,
    float *y_out
) {
    int out_dim = M->rows;
    int cols = M->cols;
    int chunks = M->chunks;
    int xK = x->K;

    /* Step 1: compute x magnitudes (per-element popcount across planes)
     * x_mag[j] = number of x planes where bit j is set
     * This is O(xK * chunks) = O(xK * dim / 64)
     */
    int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));
    memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));

    for (int q = 0; q < xK; q++) {
        const uint64_t *xplane = x->unary + (size_t)q * chunks;
        for (int c = 0; c < chunks; c++) {
            uint64_t bits = xplane[c];
            while (bits) {
                int bit = __builtin_ctzll(bits);
                int j = c * 64 + bit;
                if (j < cols) x_mag[j]++;
                bits &= bits - 1;
            }
        }
    }

    /* Apply sign to x_mag: positive if same sign as... 
     * Actually we need signed x_mag relative to each weight row's sign.
     * So we keep x_mag unsigned and handle sign per output element.
     */

    /* Step 2: for each output row, compute:
     * y[i] = sum_j (w_mag[i][j] * x_mag[j]) * sign_agreement
     *
     * w_mag[i][j] = number of weight planes where bit j is set
     * sign_agreement = +1 if w_sign[j] == x_sign[j], else -1
     *
     * We compute w_mag by vertical popcount across weight planes.
     * This is O(wK * chunks) per row.
     */

    #pragma omp parallel
    {
        int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t));

        #pragma omp for schedule(dynamic, 32)
        for (int i = 0; i < out_dim; i++) {
            memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t));

            /* Vertical popcount: count set planes per element */
            for (int p = 0; p < M->K; p++) {
                const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks;
                for (int c = 0; c < chunks; c++) {
                    uint64_t bits = wplane[c];
                    while (bits) {
                        int bit = __builtin_ctzll(bits);
                        int j = c * 64 + bit;
                        if (j < cols) w_mag[j]++;
                        bits &= bits - 1;
                    }
                }
            }

            /* Dot product with sign */
            const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
            long long acc = 0;

            for (int j = 0; j < cols; j++) {
                int c = j / 64;
                uint64_t bit = 1ULL << (j % 64);
                int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit);
                int product = (int)w_mag[j] * (int)x_mag[j];
                acc += same_sign ? product : -product;
            }

            y_out[i] = (float)acc * M->scales[i] * x->scale;
        }

        free(w_mag);
    }

    free(x_mag);
}

/* ============================================================
 * BENCHMARK + ACCURACY
 * ============================================================ */
typedef struct {
    float cosine;
    float snr_db;
    float max_rel_err;
    double ms_naive;
    double ms_fast;
    double gops_naive;
    double gops_fast;
} TestResult;

TestResult tum_test(int rows, int cols, int wK, int xK, int iters) {
    TestResult r = {0};
    srand(42);

    /* Random float matrix and vector (normal distribution) */
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));
    float *y_naive = (float *)malloc(rows * sizeof(float));
    float *y_fast = (float *)malloc(rows * sizeof(float));

    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Float reference */
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    /* Convert to true unary */
    TrueUnaryMat *M = tum_alloc(rows, cols, wK);
    TrueUnaryVec *x = tuv_alloc(cols, xK);
    tum_from_float(M, Mf);
    tuv_from_float(x, xf);

    /* Naive matvec */
    struct timespec t0, t1;
    tum_matvec(M, x, y_naive);  /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec(M, x, y_naive);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Fast matvec */
    tum_matvec_fast(M, x, y_fast);  /* warmup */
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        tum_matvec_fast(M, x, y_fast);
    clock_gettime(CLOCK_MONOTONIC, &t1);
    r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters;

    /* Accuracy vs float reference */
    float dot = 0, na = 0, nb = 0, max_re = 0;
    for (int i = 0; i < rows; i++) {
        dot += y_ref[i] * y_naive[i];
        na += y_ref[i] * y_ref[i];
        nb += y_naive[i] * y_naive[i];
        float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f);
        if (re > max_re) max_re = re;
    }
    r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    float noise = 0;
    for (int i = 0; i < rows; i++) {
        float e = y_ref[i] - y_naive[i]; noise += e * e;
    }
    r.snr_db = 10.0f * log10f(na / (noise + 1e-10f));
    r.max_rel_err = max_re;

    /* Verify naive == fast */
    float fast_err = 0;
    for (int i = 0; i < rows; i++) {
        float e = fabsf(y_naive[i] - y_fast[i]);
        if (e > fast_err) fast_err = e;
    }
    if (fast_err > 0.01f)
        printf("  WARNING: naive vs fast max diff = %.4f\n", fast_err);

    double ops = 2.0 * rows * cols;
    r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6);
    r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6);

    tum_free(M); tuv_free(x);
    free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast);
    return r;
}

/* ============================================================
 * MAIN: sweep K values, show accuracy + speed tradeoff
 * ============================================================ */
int main() {
    printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n");
    printf("Every bit has weight 1. Value = count of ones.\n");
    printf("Matmul = AND + popcount, no weighting.\n\n");

    /* Sweep K for a fixed matrix size (Qwen3-4B q_proj: 4096x2560) */
    int rows = 4096, cols = 2560;
    printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols);

    printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n",
           "wK", "xK", "Cosine", "SNR_dB", "MaxRelE",
           "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory");

    struct { int wK; int xK; } configs[] = {
        {8,   4},
        {8,   8},
        {16,  8},
        {16, 16},
        {32,  8},
        {32, 16},
        {32, 32},
        {64, 16},
        {64, 32},
    };
    int n = sizeof(configs) / sizeof(configs[0]);

    for (int c = 0; c < n; c++) {
        int wK = configs[c].wK;
        int xK = configs[c].xK;
        int iters = (wK <= 16 && xK <= 16) ? 3 : 1;

        TestResult r = tum_test(rows, cols, wK, xK, iters);

        /* Memory for this layer's weights */
        size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8;
        size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8;
        size_t scale_bytes = rows * 4;
        double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6;

        printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n",
               wK, xK, r.cosine, r.snr_db, r.max_rel_err,
               r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb);
    }

    /* Show first 5 values for K=32,16 case */
    printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n");
    {
        int sr = 512, sc = 2560;
        srand(42);
        float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float));
        float *xf = (float *)malloc(sc * sizeof(float));
        float *y_ref = (float *)calloc(sr, sizeof(float));
        float *y_unary = (float *)malloc(sr * sizeof(float));

        for (size_t i = 0; i < (size_t)sr * sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sc; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
        }
        for (int i = 0; i < sr; i++)
            for (int j = 0; j < sc; j++)
                y_ref[i] += Mf[(size_t)i * sc + j] * xf[j];

        TrueUnaryMat *M = tum_alloc(sr, sc, 32);
        TrueUnaryVec *x = tuv_alloc(sc, 16);
        tum_from_float(M, Mf);
        tuv_from_float(x, xf);
        tum_matvec(M, x, y_unary);

        printf("%8s %8s %8s\n", "Ref", "Unary", "Error");
        for (int i = 0; i < 10; i++)
            printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]);

        tum_free(M); tuv_free(x);
        free(Mf); free(xf); free(y_ref); free(y_unary);
    }

    printf("\n=== DONE ===\n");
    return 0;
}