#define _POSIX_C_SOURCE 199309L
/*
 * LOG-UNARY TENSOR LIBRARY
 *
 * Native tensor type where values are represented as:
 *   sign (1 bit) + log-magnitude bitplanes
 *
 * Plane p is set if |value| >= 2^(p - bias)
 * With N planes and bias B, represents magnitudes from 2^(-B) to 2^(N-1-B)
 *
 * ALL arithmetic stays in this representation:
 *   - matmul: AND + weighted_popcount (shift by p+q-2*bias)
 *   - add: bitwise merge with carry propagation
 *   - scale: shift planes up/down
 *   - negate: flip sign bits
 *
 * Float conversion only at boundaries (embed lookup, final logits)
 *
 * (c) 2026 OpenTransformers Ltd / Scott Bisset
 */

#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

/* ============================================================
 * LOG-UNARY TENSOR
 *
 * For a vector of length `dim`:
 *   sign:   uint64[chunks]           - 1 bit per element
 *   planes: uint64[n_planes][chunks] - 1 bit per element per plane
 *   chunks = (dim + 63) / 64
 *
 * Plane p is set if |value| >= threshold[p]
 * threshold[p] = base_scale * 2^(p - bias)
 *
 * This is a LOG thermometer code:
 *   value=0.001 with bias=10 -> maybe plane 0 set (2^-10 = 0.001)
 *   value=1.0   with bias=10 -> planes 0-10 set
 *   value=64.0  with bias=10 -> planes 0-16 set
 *
 * ============================================================ */
typedef struct {
    uint64_t *sign;     /* [chunks] */
    uint64_t *planes;   /* [n_planes * chunks] contiguous */
    int dim;
    int chunks;
    int n_planes;
    int bias;           /* log2 offset: threshold[p] = base * 2^(p-bias) */
    float base_scale;   /* per-tensor scale factor */
} LogUnaryTensor;

/* 2D tensor (matrix) - row-major */
typedef struct {
    uint64_t *sign;     /* [rows * chunks_per_row] */
    uint64_t *planes;   /* [n_planes * rows * chunks_per_row] */
    float *row_scales;  /* [rows] per-row base scales */
    int rows;
    int cols;
    int chunks;         /* chunks per row = (cols+63)/64 */
    int n_planes;
    int bias;
} LogUnaryMatrix;

/* ============================================================
 * ALLOCATION
 * ============================================================ */
LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) {
    LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor));
    t->dim = dim;
    t->n_planes = n_planes;
    t->bias = bias;
    t->chunks = (dim + 63) / 64;
    t->base_scale = 1.0f;
    t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t));
    t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t));
    memset(t->sign, 0, t->chunks * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t));
    return t;
}

LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) {
    LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix));
    m->rows = rows;
    m->cols = cols;
    m->n_planes = n_planes;
    m->bias = bias;
    m->chunks = (cols + 63) / 64;
    m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
    m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
    m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float));
    memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t));
    for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f;
    return m;
}

void lut_free(LogUnaryTensor *t) {
    if (t) { free(t->sign); free(t->planes); free(t); }
}
void lum_free(LogUnaryMatrix *m) {
    if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); }
}

/* ============================================================
 * FLOAT <-> LOG-UNARY CONVERSION
 * Only used at boundaries (embedding, final output)
 * ============================================================ */
void lut_from_float(LogUnaryTensor *t, const float *x) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(t->sign, 0, chunks * sizeof(uint64_t));
    memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t));

    /* Find absmax for base_scale */
    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) { t->base_scale = 1.0f; return; }

    /* Set base_scale so that max value uses the highest plane */
    /* threshold[np-1] = base_scale * 2^(np-1-bias) should equal amax */
    t->base_scale = amax / ldexpf(1.0f, np - 1 - bias);

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) t->sign[c] |= bit;

        float mag = fabsf(x[i]);
        /* Set planes from low to high: plane p set if mag >= base * 2^(p-bias) */
        for (int p = 0; p < np; p++) {
            float thresh = t->base_scale * ldexpf(1.0f, p - bias);
            if (mag >= thresh)
                t->planes[(size_t)p * chunks + c] |= bit;
            else
                break;  /* thermometer: once we stop, all higher planes are 0 */
        }
    }
}

void lut_to_float(const LogUnaryTensor *t, float *out) {
    int dim = t->dim;
    int np = t->n_planes;
    int bias = t->bias;
    int chunks = t->chunks;

    memset(out, 0, dim * sizeof(float));

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        /* Find highest set plane */
        int highest = -1;
        for (int p = np - 1; p >= 0; p--) {
            if (t->planes[(size_t)p * chunks + c] & bit) {
                highest = p;
                break;
            }
        }

        if (highest < 0) {
            out[i] = 0.0f;
        } else {
            /* Value is approximately base * 2^(highest - bias) */
            /* More precise: midpoint between this threshold and next */
            float val = t->base_scale * ldexpf(1.0f, highest - bias);
            if (highest < np - 1) {
                float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias);
                val = (val + next) * 0.5f;  /* midpoint reconstruction */
            }
            out[i] = (t->sign[c] & bit) ? -val : val;
        }
    }
}

/* Convert float matrix to log-unary matrix (per-row scaling) */
void lum_from_float(LogUnaryMatrix *m, const float *data) {
    int rows = m->rows, cols = m->cols;
    int np = m->n_planes, bias = m->bias;
    int chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;

        /* Per-row absmax */
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; }
        m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias);

        uint64_t *row_sign = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);

            if (row[j] < 0.0f) row_sign[c] |= bit;

            float mag = fabsf(row[j]);
            for (int p = 0; p < np; p++) {
                float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias);
                if (mag >= thresh)
                    m->planes[((size_t)p * rows + r) * chunks + c] |= bit;
                else
                    break;
            }
        }
    }
}

/* ============================================================
 * LOG-UNARY MATMUL: y = M @ x
 *
 * Both M (matrix) and x (vector) are log-unary encoded.
 *
 * For each output element y[i]:
 *   For each weight plane p, activation plane q:
 *     active = M.planes[p][i] AND x.planes[q]
 *     same   = active AND ~(M.sign[i] XOR x.sign)
 *     diff   = active AND (M.sign[i] XOR x.sign)
 *     contribution = (popcount(same) - popcount(diff)) * 2^(p+q-2*bias)
 *
 * Output is a LogUnaryTensor (converted from integer accumulator)
 * ============================================================ */
void lum_matvec(
    const LogUnaryMatrix *M,
    const LogUnaryTensor *x,
    LogUnaryTensor *y_out    /* output: log-unary encoded result */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wp = M->n_planes;
    int xp = x->n_planes;
    int w_bias = M->bias;
    int x_bias = x->bias;

    /* Accumulate to float temporarily, then requantize to log-unary.
     * The accumulator is integer shifts (2^(p+q-2bias)), which
     * we can do as int64 left-shifts for small exponents.
     *
     * For the exponent range we're in (p+q in [0,14] with bias ~4),
     * net shift is [-8, 6], so we use a fixed-point int64 accumulator
     * with a base shift to keep everything positive.
     */
    int base_shift = w_bias + x_bias;  /* shift to add to make all exponents >= 0 */

    /* We'll accumulate as int64 with implicit 2^(-base_shift) factor */
    /* Then convert: float_val = acc * row_scale * x_scale * 2^(-base_shift) */

    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wp; p++) {
                uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xp; q++) {
                    uint64_t x_plane = x->planes[(size_t)q * chunks + c];
                    uint64_t active = w_plane & x_plane;
                    uint64_t pos = active & same;
                    uint64_t neg = active & diff;

                    int count = __builtin_popcountll(pos) - __builtin_popcountll(neg);

                    /* Weighted by 2^(p + q) relative to base */
                    int shift = p + q;  /* relative to 2^(-base_shift) */
                    if (count != 0)
                        acc += (long long)count << shift;
                }
            }
        }

        /* Convert: val = acc * row_scale * x_scale * 2^(-base_shift) */
        y_float[i] = (float)acc * M->row_scales[i] * x->base_scale
                    * ldexpf(1.0f, -base_shift);
    }

    /* Requantize float result to log-unary */
    lut_from_float(y_out, y_float);
    free(y_float);
}

/* ============================================================
 * LOG-UNARY ELEMENT-WISE ADD: z = a + b
 *
 * Dequant both, add as float, requant.
 * This is O(dim) so not the bottleneck.
 * Future: direct bitwise add with carry chains.
 * ============================================================ */
void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) {
    int dim = a->dim;
    float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
    float *fb = (float *)aligned_alloc(64, dim * sizeof(float));

    lut_to_float(a, fa);
    lut_to_float(b, fb);

    for (int i = 0; i < dim; i++) fa[i] += fb[i];

    lut_from_float(out, fa);
    free(fa); free(fb);
}

/* In-place add: a += b (dequant a, add float b, requant) */
void lut_add_float(LogUnaryTensor *a, const float *b) {
    int dim = a->dim;
    float *fa = (float *)aligned_alloc(64, dim * sizeof(float));
    lut_to_float(a, fa);
    for (int i = 0; i < dim; i++) fa[i] += b[i];
    lut_from_float(a, fa);
    free(fa);
}

/* ============================================================
 * LOG-UNARY RMSNORM
 *
 * Needs float for the sqrt/reciprocal, but O(dim).
 * Input: log-unary, Output: log-unary
 * ============================================================ */
void lut_rmsnorm(
    const LogUnaryTensor *x,
    const float *weight,  /* norm weights stay float (tiny) */
    LogUnaryTensor *out,
    float eps
) {
    int dim = x->dim;
    float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
    lut_to_float(x, xf);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);

    for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i];

    lut_from_float(out, xf);
    free(xf);
}

/* ============================================================
 * LOG-UNARY SILU_MUL: out = SiLU(gate) * up
 *
 * O(dim), not bottleneck. Dequant, compute, requant.
 * ============================================================ */
void lut_silu_mul(
    const LogUnaryTensor *gate,
    const LogUnaryTensor *up,
    LogUnaryTensor *out
) {
    int dim = gate->dim;
    float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
    float *uf = (float *)aligned_alloc(64, dim * sizeof(float));

    lut_to_float(gate, gf);
    lut_to_float(up, uf);

    for (int i = 0; i < dim; i++)
        gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];

    lut_from_float(out, gf);
    free(gf); free(uf);
}

/* ============================================================
 * LOG-UNARY ROPE
 *
 * O(dim), dequant-compute-requant per head.
 * ============================================================ */
void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) {
    /* Dequant the relevant slice, apply RoPE, requant */
    float *f = (float *)aligned_alloc(64, head_dim * sizeof(float));

    /* Extract slice */
    float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
    lut_to_float(t, full);
    memcpy(f, full + start, head_dim * sizeof(float));

    for (int i = 0; i < head_dim; i += 2) {
        float freq = 1.0f / powf(theta, (float)i / head_dim);
        float angle = offset * freq;
        float c = cosf(angle), s = sinf(angle);
        float v0 = f[i], v1 = f[i + 1];
        f[i]     = v0 * c - v1 * s;
        f[i + 1] = v0 * s + v1 * c;
    }

    memcpy(full + start, f, head_dim * sizeof(float));
    lut_from_float(t, full);
    free(f); free(full);
}

/* ============================================================
 * UTILITY: Get float slice from log-unary tensor
 * (for attention scores which need float softmax)
 * ============================================================ */
void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) {
    float *full = (float *)aligned_alloc(64, t->dim * sizeof(float));
    lut_to_float(t, full);
    memcpy(out, full + start, len * sizeof(float));
    free(full);
}

/* ============================================================
 * BENCHMARK: measure matvec throughput
 * ============================================================ */
typedef struct {
    double total_and_ops;
    double total_popcount_ops;
    double wall_time_s;
    double elements_per_sec;
    double gops;   /* giga-operations per second */
} BenchResult;

BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) {
    LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias);
    LogUnaryTensor *x = lut_alloc(cols, x_planes, bias);
    LogUnaryTensor *y = lut_alloc(rows, x_planes, bias);

    /* Fill with random bits */
    for (size_t i = 0; i < (size_t)rows * M->chunks; i++)
        M->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++)
        M->planes[i] = ((uint64_t)rand() << 32) | rand();
    for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f;
    for (size_t i = 0; i < (size_t)x->chunks; i++)
        x->sign[i] = ((uint64_t)rand() << 32) | rand();
    for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++)
        x->planes[i] = ((uint64_t)rand() << 32) | rand();
    x->base_scale = 1.0f;

    /* Warmup */
    lum_matvec(M, x, y);

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (int i = 0; i < iters; i++)
        lum_matvec(M, x, y);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
    int chunks = M->chunks;
    double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2;  /* AND + popcount pairs */

    BenchResult r;
    r.wall_time_s = dt / iters;
    r.total_and_ops = ops_per_call;
    r.total_popcount_ops = ops_per_call;
    r.elements_per_sec = (double)rows * cols * iters / dt;
    r.gops = ops_per_call * iters / dt / 1e9;

    lum_free(M); lut_free(x); lut_free(y);
    return r;
}

/* ============================================================
 * ACCURACY TEST: convert float->logunary->float roundtrip
 * ============================================================ */
typedef struct {
    float max_error;
    float mean_error;
    float cosine_sim;
    float snr_db;
} AccuracyResult;

AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) {
    float *original = (float *)aligned_alloc(64, dim * sizeof(float));
    float *recovered = (float *)aligned_alloc(64, dim * sizeof(float));

    /* Random normal-ish distribution */
    for (int i = 0; i < dim; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    LogUnaryTensor *t = lut_alloc(dim, n_planes, bias);
    lut_from_float(t, original);
    lut_to_float(t, recovered);

    float max_err = 0, sum_err = 0;
    float dot = 0, na = 0, nb = 0;
    for (int i = 0; i < dim; i++) {
        float err = fabsf(original[i] - recovered[i]);
        if (err > max_err) max_err = err;
        sum_err += err;
        dot += original[i] * recovered[i];
        na += original[i] * original[i];
        nb += recovered[i] * recovered[i];
    }

    float noise_power = 0;
    for (int i = 0; i < dim; i++) {
        float e = original[i] - recovered[i];
        noise_power += e * e;
    }

    AccuracyResult r;
    r.max_error = max_err;
    r.mean_error = sum_err / dim;
    r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
    r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f));

    lut_free(t);
    free(original); free(recovered);
    return r;
}