/*
 * PROPER UNARY — ONE QUANTUM, NO SCALES
 *
 * Every single bit in the entire system has weight = 1 quantum.
 * The quantum is set ONCE for the whole model.
 * There are NO per-vector scales. NO per-row scales.
 *
 * The number 5.0 with quantum=0.1 is stored as 50 ones.
 * The number 5.0 with quantum=0.01 is stored as 500 ones.
 * More precision = more bits. That's the tradeoff.
 *
 * ADDITION = CONCATENATION. Always. No exceptions.
 * Because every bit everywhere means the same thing.
 *
 * MATMUL: y[i] = sum_j W[i][j] * x[j]
 *   = sum over all (w_slot, x_slot) pairs:
 *       popcount(w_slot[i] AND x_slot AND same_sign) * quantum²
 *     - popcount(w_slot[i] AND x_slot AND diff_sign) * quantum²
 *   = quantum² * integer_count
 *
 * The raw integer count is in units of input_quantum² (magnitude grows).
 * To stay in the single global quantum, the count is rescaled:
 *   magnitude_in_quanta = integer_count * quantum
 * which is what uv_matmul below does before re-encoding its output.
 *
 * (c) 2026 OpenTransformers Ltd / Scott Bisset
 */

#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

/* ============================================================
 * PROPER UNARY VECTOR
 * Every bit = 1 quantum. No local scale.
 * ============================================================ */
typedef struct {
    /* Sign bitmap: bit i is set when element i is negative.
     * Holds `chunks` 64-bit words. */
    uint64_t *sign;

    /* Magnitude bit-planes, slot-major: slot s occupies words
     * [s * chunks, (s + 1) * chunks). Each set bit is worth one quantum. */
    uint64_t *slots;

    /* Number of elements in the vector. */
    int dim;

    /* 64-bit words per bit-plane, i.e. ceil(dim / 64). */
    int chunks;

    /* Bit-planes currently in use. */
    int n_slots;

    /* Bit-planes allocated (upper bound for n_slots). */
    int cap;
} UVec;

/* Proper unary matrix — same quantum as vectors */
typedef struct {
    /* Sign bitmap, row-major: row r starts at word r * chunks.
     * rows * chunks words in total. */
    uint64_t *sign;

    /* Magnitude bit-planes: the word for plane s, row r, chunk c lives at
     * index (s * rows + r) * chunks + c. K * rows * chunks words in total. */
    uint64_t *slots;

    int rows;    /* output dimension */
    int cols;    /* input dimension */
    int chunks;  /* 64-bit words per row, ceil(cols / 64) */
    int K;       /* number of magnitude bit-planes */
} UMat;

/* Global system quantum */
typedef struct {
    /* Value carried by a single set bit, anywhere in the system.
     * A weight bit times an activation bit is therefore worth quantum². */
    float quantum;
} USystem;

/* ============================================================
 * ALLOC
 * ============================================================ */
/*
 * Allocate a unary vector of `dim` elements with room for `cap` bit-planes.
 * The sign bitmap and every plane start zeroed; n_slots starts at 0.
 *
 * Returns NULL when dim or cap is negative, when the byte count would
 * overflow size_t, or when an allocation fails. Release with uv_free().
 */
UVec* uv_new(int dim, int cap) {
    if (dim < 0 || cap < 0) return NULL;

    UVec *v = (UVec *)calloc(1, sizeof(UVec));
    if (!v) return NULL;

    v->dim = dim;
    v->chunks = dim / 64 + (dim % 64 != 0);   /* ceil(dim / 64), no int overflow */
    v->n_slots = 0;
    v->cap = cap;

    size_t plane_bytes = (size_t)v->chunks * sizeof(uint64_t);
    if (cap > 0 && plane_bytes > (SIZE_MAX - 63) / (size_t)cap) {
        free(v);
        return NULL;
    }

    /* aligned_alloc requires the size to be a multiple of the alignment, so
     * round up to 64 bytes; use at least 64 so empty vectors get real storage. */
    size_t sign_bytes = (plane_bytes + 63) / 64 * 64;
    size_t slot_bytes = ((size_t)cap * plane_bytes + 63) / 64 * 64;
    if (sign_bytes == 0) sign_bytes = 64;
    if (slot_bytes == 0) slot_bytes = 64;

    v->sign  = (uint64_t *)aligned_alloc(64, sign_bytes);
    v->slots = (uint64_t *)aligned_alloc(64, slot_bytes);
    if (!v->sign || !v->slots) {
        free(v->sign);
        free(v->slots);
        free(v);
        return NULL;
    }

    memset(v->sign, 0, sign_bytes);
    memset(v->slots, 0, slot_bytes);
    return v;
}

/*
 * Allocate a zeroed rows x cols unary matrix with K magnitude bit-planes.
 *
 * Returns NULL when any argument is negative, when the byte count would
 * overflow size_t, or when an allocation fails. Release with um_free().
 */
UMat* um_new(int rows, int cols, int K) {
    if (rows < 0 || cols < 0 || K < 0) return NULL;

    UMat *m = (UMat *)calloc(1, sizeof(UMat));
    if (!m) return NULL;

    m->rows = rows;
    m->cols = cols;
    m->K = K;
    m->chunks = cols / 64 + (cols % 64 != 0);   /* ceil(cols / 64), no int overflow */

    /* One plane = one sign-sized bitmap covering every row. */
    size_t row_bytes = (size_t)m->chunks * sizeof(uint64_t);
    if (rows > 0 && row_bytes > (SIZE_MAX - 63) / (size_t)rows) {
        free(m);
        return NULL;
    }
    size_t plane_bytes = (size_t)rows * row_bytes;
    if (K > 0 && plane_bytes > (SIZE_MAX - 63) / (size_t)K) {
        free(m);
        return NULL;
    }

    /* aligned_alloc requires the size to be a multiple of the alignment, so
     * round up to 64 bytes; use at least 64 so empty shapes get real storage. */
    size_t sign_bytes = (plane_bytes + 63) / 64 * 64;
    size_t slot_bytes = ((size_t)K * plane_bytes + 63) / 64 * 64;
    if (sign_bytes == 0) sign_bytes = 64;
    if (slot_bytes == 0) slot_bytes = 64;

    m->sign  = (uint64_t *)aligned_alloc(64, sign_bytes);
    m->slots = (uint64_t *)aligned_alloc(64, slot_bytes);
    if (!m->sign || !m->slots) {
        free(m->sign);
        free(m->slots);
        free(m);
        return NULL;
    }

    memset(m->sign, 0, sign_bytes);
    memset(m->slots, 0, slot_bytes);
    return m;
}

/* Release a vector and both of its bitmaps. A NULL vector is ignored. */
void uv_free(UVec *v) {
    if (v == NULL)
        return;

    free(v->sign);
    free(v->slots);
    free(v);
}
/* Release a matrix and both of its bitmaps. A NULL matrix is ignored. */
void um_free(UMat *m) {
    if (m == NULL)
        return;

    free(m->sign);
    free(m->slots);
    free(m);
}

/* ============================================================
 * QUANTIZE: float → proper unary
 *
 * Given global quantum q:
 *   magnitude = round(|value| / q)
 *   That many slots get bit set.
 *
 * NO per-vector absmax. NO local scale.
 * Values that exceed K are clipped.
 * ============================================================ */
/*
 * Encode x[0 .. v->dim) into v using K bit-planes.
 *
 * Element i gets round(|x[i]| / quantum) set bits, one in each of the lowest
 * planes, clipped to K. Its sign bit is set when x[i] < 0.
 *
 * K is limited to [0, v->cap] so the planes written always exist.
 * NaN inputs encode as 0, infinite or oversized inputs clip to K.
 * A quantum that is not strictly positive leaves v as all zeros.
 */
void uv_from_float(UVec *v, const float *x, int K, float quantum) {
    int dim = v->dim, chunks = v->chunks;

    if (K < 0) K = 0;
    if (K > v->cap) K = v->cap;
    v->n_slots = K;

    memset(v->sign, 0, (size_t)chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    if (!(quantum > 0.0f)) return;   /* also rejects NaN */

    float inv_q = 1.0f / quantum;
    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) v->sign[c] |= bit;

        /* Clip while still in floating point: converting a float that does
         * not fit in int is undefined behaviour, and in practice turned
         * huge inputs into magnitude 0 instead of K. */
        float scaled = fabsf(x[i]) * inv_q + 0.5f;
        int mag;
        if (isnan(scaled))
            mag = 0;
        else if ((double)scaled >= (double)K)
            mag = K;
        else
            mag = (int)scaled;

        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}

/*
 * Decode v into out[0 .. v->dim).
 *
 * An element's magnitude is the number of in-use planes that have its bit
 * set, times quantum; the sign bitmap decides the sign. The planes need not
 * be contiguous, so this also works after cancellation has punched holes.
 */
void uv_to_float(const UVec *v, float *out, float quantum) {
    const int n_elems = v->dim;
    const int words = v->chunks;

    for (int e = 0; e < n_elems; e++) {
        const int w = e / 64;
        const uint64_t mask = 1ULL << (e % 64);

        /* Count the planes in which this element's bit is present. */
        int count = 0;
        for (int plane = 0; plane < v->n_slots; plane++) {
            if (v->slots[(size_t)plane * words + w] & mask)
                count++;
        }

        if (v->sign[w] & mask)
            out[e] = -(float)count * quantum;
        else
            out[e] = (float)count * quantum;
    }
}

/*
 * Encode a row-major rows x cols float matrix into m.
 *
 * Entry (r, j) gets round(|value| / quantum) set bits, one in each of the
 * lowest planes, clipped to m->K. Its sign bit is set when the value is < 0.
 *
 * NaN entries encode as 0, infinite or oversized entries clip to K.
 * A quantum that is not strictly positive leaves m as all zeros.
 */
void um_from_float(UMat *m, const float *data, float quantum) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    if (K < 0) K = 0;   /* keep the size computations below non-negative */

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    if (!(quantum > 0.0f)) return;   /* also rejects NaN */

    float inv_q = 1.0f / quantum;
    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        uint64_t *rs = m->sign + (size_t)r * chunks;

        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            /* Clip while still in floating point: converting a float that
             * does not fit in int is undefined behaviour, and in practice
             * turned huge entries into magnitude 0 instead of K. */
            float scaled = fabsf(row[j]) * inv_q + 0.5f;
            int mag;
            if (isnan(scaled))
                mag = 0;
            else if ((double)scaled >= (double)K)
                mag = K;
            else
                mag = (int)scaled;

            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}

/* ============================================================
 * CONCATENATION = ADDITION
 *
 * Since every bit everywhere = same quantum,
 * appending slots IS adding magnitudes. Period.
 *
 * Sign handling: for elements where signs differ,
 * cancel bits from existing slots.
 * ============================================================ */
/*
 * dst += src, element-wise, by merging src's bit-planes into dst.
 *
 * For each src plane and each element whose bit is set in that plane:
 *   - same sign as dst: the bit goes into a freshly appended dst plane;
 *   - opposite sign: one bit of that element is removed from the highest
 *     dst plane that still has it. If dst has none left, the element's sign
 *     flips and the bit is appended instead.
 * A new dst plane is only kept when it received at least one bit.
 *
 * Cancellation needs no extra capacity, so it keeps running when dst is
 * full. "OVERFLOW" is printed once, at the end, only if bits really had to
 * be dropped because no plane was left to hold them.
 *
 * src's plane count is read once up front, so uv_concat(v, v) doubles v.
 * Vectors of different dimension are rejected with a message.
 */
void uv_concat(UVec *dst, const UVec *src) {
    if (dst->dim != src->dim) {
        printf("uv_concat: dimension mismatch (%d vs %d)\n", dst->dim, src->dim);
        return;
    }

    const int chunks = dst->chunks;
    const int src_slots = src->n_slots;   /* snapshot: dst may alias src */
    int overflow = 0;

    for (int s = 0; s < src_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;
        const int has_room = dst->n_slots < dst->cap;
        uint64_t *new_slot = has_room
            ? dst->slots + (size_t)dst->n_slots * chunks
            : NULL;
        uint64_t appended = 0;   /* OR of everything written to new_slot */
        uint64_t dropped = 0;    /* OR of bits lost for lack of a plane */

        for (int c = 0; c < chunks; c++) {
            uint64_t sb = src_slot[c];
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: straight append */
            uint64_t add = sb & ~disagree;

            /* Different sign: cancel from existing planes, top down */
            uint64_t cancel = sb & disagree;
            for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) {
                uint64_t *ds = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *ds & cancel;
                *ds &= ~overlap;
                cancel &= ~overlap;
            }

            if (has_room) {
                /* Leftover cancel = src magnitude exceeds dst: flip the
                 * sign and carry the bit in the new plane. */
                if (cancel) {
                    dst->sign[c] ^= cancel;
                    add |= cancel;
                }
                new_slot[c] = add;
                appended |= add;
            } else {
                /* Nowhere to store these bits. The sign is left alone so
                 * the element stays at its cancelled value. */
                dropped |= add | cancel;
            }
        }

        if (has_room) {
            if (appended) dst->n_slots++;
        } else if (dropped) {
            overflow = 1;
        }
    }

    if (overflow)
        printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap);
}

/* ============================================================
 * MATMUL: y = M @ x
 *
 * Output unit = quantum² (one quantum from weight × one from activation)
 * The integer accumulator directly gives the value in units of quantum².
 *
 * To keep everything in the same quantum system:
 *   y_float[i] = acc * quantum²
 *   Then requantize to unary with the SAME global quantum.
 *   y_mag[i] = acc * quantum² / quantum = acc * quantum
 *
 * ============================================================ */
/*
 * y = M @ x, re-encoded with K_out bit-planes in the same global quantum.
 *
 * For every output row the signed count of coinciding (weight bit,
 * activation bit) pairs is accumulated over all plane pairs. That count is
 * in units of quantum², so count * quantum is the magnitude in quanta; it is
 * rounded to nearest and clipped to K_out.
 *
 * Requirements, checked here (on violation a message goes to stderr and y is
 * left untouched):
 *   - x must have the same number of 64-bit words per plane as a row of M;
 *   - y must have at least M->rows elements.
 * K_out is limited to [0, y->cap] so the planes written always exist.
 */
void uv_matmul(
    const UMat *M, const UVec *x,
    UVec *y, int K_out, float quantum
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    if (x->chunks != chunks || y->dim < out_dim) {
        fprintf(stderr, "uv_matmul: shape mismatch (M %dx%d, x dim %d, y dim %d)\n",
                M->rows, M->cols, x->dim, y->dim);
        return;
    }

    if (K_out < 0) K_out = 0;
    if (K_out > y->cap) K_out = y->cap;

    /* Per-row results. Plain heap blocks are enough: they are only accessed
     * element by element. */
    size_t n_out = out_dim > 0 ? (size_t)out_dim : 1;
    int *acc = (int *)malloc(n_out * sizeof(int));
    uint8_t *neg = (uint8_t *)calloc(n_out, 1);
    if (!acc || !neg) {
        free(acc);
        free(neg);
        fprintf(stderr, "uv_matmul: out of memory\n");
        return;
    }

    y->n_slots = K_out;
    memset(y->sign, 0, (size_t)y->chunks * sizeof(uint64_t));
    memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t));

    /* Compute integer dot products */
    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long a = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t same = ~(w_sign_row[c] ^ x->sign[c]);
            uint64_t diff = w_sign_row[c] ^ x->sign[c];

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];
                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    a += __builtin_popcountll(active & same)
                       - __builtin_popcountll(active & diff);
                }
            }
        }

        /* a counts quantum² units; a * quantum is the magnitude in quanta. */
        float val = (float)a * quantum;

        /* Clip while still in floating point: converting a float that does
         * not fit in int is undefined behaviour. */
        float rounded = fabsf(val) + 0.5f;
        int mag;
        if (isnan(rounded))
            mag = 0;
        else if ((double)rounded >= (double)K_out)
            mag = K_out;
        else
            mag = (int)rounded;

        acc[i] = mag;
        neg[i] = (val < 0.0f) ? 1 : 0;
    }

    /* Encode serially: neighbouring rows share output words, so setting
     * their bits from the parallel loop above would race. */
    for (int i = 0; i < out_dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);
        if (neg[i]) y->sign[c] |= bit;
        for (int s = 0; s < acc[i]; s++)
            y->slots[(size_t)s * y->chunks + c] |= bit;
    }

    free(acc); free(neg);
}

/* ============================================================
 * RMSNORM — resets slot count, keeps same quantum
 * ============================================================ */
/*
 * RMS-normalise x and re-encode it into out with K_out bit-planes.
 *
 * x is decoded to floats, scaled by weight[i] / sqrt(mean(x²) + eps), then
 * quantized again with the same quantum. weight must hold at least x->dim
 * entries.
 *
 * out must not have more elements than x, because the re-encode reads
 * out->dim floats from a buffer of x->dim. On that mismatch, or when the
 * scratch buffer cannot be allocated, a message goes to stderr and out is
 * left untouched.
 */
void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) {
    int dim = x->dim;

    if (out->dim > dim) {
        fprintf(stderr, "uv_rmsnorm: out dim %d exceeds input dim %d\n", out->dim, dim);
        return;
    }

    /* Scratch is only accessed element by element, so ordinary malloc
     * alignment is sufficient. */
    float *xf = (float *)malloc((dim > 0 ? (size_t)dim : 1) * sizeof(float));
    if (!xf) {
        fprintf(stderr, "uv_rmsnorm: out of memory\n");
        return;
    }
    uv_to_float(x, xf, quantum);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];

    /* Reciprocal of the root mean square, so the loop below multiplies. */
    float inv_rms = 1.0f / sqrtf(ss / dim + eps);
    for (int i = 0; i < dim; i++) xf[i] *= inv_rms * weight[i];

    uv_from_float(out, xf, K_out, quantum);
    free(xf);
}

/* ============================================================
 * TESTS
 * ============================================================ */

/*
 * Demo: add two 8-element vectors with uv_concat and print the decoded
 * result next to the hand-computed sum. The inputs cover same-sign
 * addition, partial cancellation and sign flips.
 */
void test_concat_correct() {
    printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n");

    const float q = 0.25f;   /* every bit = 0.25 */
    const int n = 8;
    const int planes = 32;   /* largest input is 7.0 = 28 quanta */

    float lhs[]  = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0};
    float rhs[]  = {2.0,  1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0};
    float want[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0};

    UVec *va = uv_new(n, 128);
    UVec *vb = uv_new(n, 128);
    uv_from_float(va, lhs, planes, q);
    uv_from_float(vb, rhs, planes, q);

    /* Round-trip both operands to show the quantization is lossless here. */
    float lhs_back[8];
    float rhs_back[8];
    uv_to_float(va, lhs_back, q);
    uv_to_float(vb, rhs_back, q);

    printf("Quantum = %.2f (every bit = %.2f)\n\n", q, q);

    printf("A original: ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", lhs[k]);
    printf("\n");

    printf("A unary:    ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", lhs_back[k]);
    printf("\n");

    printf("B original: ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", rhs[k]);
    printf("\n");

    printf("B unary:    ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", rhs_back[k]);
    printf("\n\n");

    printf("A slots: %d, B slots: %d\n", va->n_slots, vb->n_slots);
    uv_concat(va, vb);
    printf("After concat: %d slots\n\n", va->n_slots);

    float got[8];
    uv_to_float(va, got, q);

    printf("Expected A+B: ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", want[k]);
    printf("\n");

    printf("Concat  A+B:  ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", got[k]);
    printf("\n");

    printf("Error:        ");
    for (int k = 0; k < n; k++)
        printf("%6.2f ", want[k] - got[k]);
    printf("\n");

    uv_free(va);
    uv_free(vb);
}

/*
 * Demo: apply the same delta five times to a 4-element accumulator with
 * uv_concat, printing the decoded value, plane count and worst absolute
 * error against a float running sum after each step.
 */
void test_chain_concat() {
    printf("\n=== CHAINED CONCAT (5 additions) ===\n\n");

    const float q = 0.1f;
    const int n = 4;
    const int planes = 64;

    float start[] = {1.0, -2.0, 3.0, -0.5};
    UVec *sum = uv_new(n, 512);
    uv_from_float(sum, start, planes, q);

    float decoded[4];
    printf("Start: "); 
    uv_to_float(sum, decoded, q);
    for (int k = 0; k < n; k++)
        printf("%6.2f ", decoded[k]); 
    printf(" (%d slots)\n", sum->n_slots);

    /* Float reference that receives the same additions. */
    float want[] = {1.0, -2.0, 3.0, -0.5};

    int round = 0;
    while (round < 5) {
        float step_vals[] = {0.5, 0.3, -1.0, 0.7};
        UVec *step_vec = uv_new(n, planes);
        uv_from_float(step_vec, step_vals, planes, q);

        uv_concat(sum, step_vec);

        for (int k = 0; k < n; k++)
            want[k] += step_vals[k];

        uv_to_float(sum, decoded, q);
        printf("  +[0.5,0.3,-1.0,0.7] = ");
        for (int k = 0; k < n; k++)
            printf("%6.2f ", decoded[k]);
        printf(" (%d slots) expect:", sum->n_slots);
        for (int k = 0; k < n; k++)
            printf("%6.2f ", want[k]);

        /* Worst absolute deviation from the float reference. */
        float worst = 0;
        for (int k = 0; k < n; k++) {
            float gap = fabsf(want[k] - decoded[k]);
            if (gap > worst)
                worst = gap;
        }
        printf(" err=%.2f\n", worst);

        uv_free(step_vec);
        round++;
    }

    uv_free(sum);
}

/*
 * Demo: multiply a random 512x256 matrix by a random vector with uv_matmul
 * and compare against a float reference (cosine similarity, SNR, wall time,
 * and the first ten outputs side by side).
 *
 * The quantum is chosen so the largest |input| maps to exactly wK planes.
 */
void test_matmul() {
    printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n");

    const int n_rows = 512, n_cols = 256;
    const int w_planes = 32, x_planes = 32;
    const size_t n_weights = (size_t)n_rows * n_cols;

    srand(42);
    float *weights = (float *)malloc(n_weights * sizeof(float));
    float *input = (float *)malloc(n_cols * sizeof(float));
    float *reference = (float *)calloc(n_rows, sizeof(float));

    /* Small values so magnitudes fit in K slots.
     * The weights are drawn before the input; the rand() order matters. */
    for (size_t k = 0; k < n_weights; k++)
        weights[k] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;
    for (int j = 0; j < n_cols; j++)
        input[j] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f;

    for (int r = 0; r < n_rows; r++) {
        for (int j = 0; j < n_cols; j++)
            reference[r] += weights[(size_t)r * n_cols + j] * input[j];
    }

    /* Find quantum that fits the data range */
    float peak = 0;
    for (size_t k = 0; k < n_weights; k++) {
        float m = fabsf(weights[k]);
        if (m > peak)
            peak = m;
    }
    for (int j = 0; j < n_cols; j++) {
        float m = fabsf(input[j]);
        if (m > peak)
            peak = m;
    }
    float q = peak / w_planes;

    printf("Data range: [-%.2f, %.2f]\n", peak, peak);
    printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", q, w_planes, w_planes, w_planes);
    printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", n_rows, n_cols, w_planes, x_planes);

    UMat *um = um_new(n_rows, n_cols, w_planes);
    UVec *ux = uv_new(n_cols, x_planes);

    um_from_float(um, weights, q);
    uv_from_float(ux, input, x_planes, q);

    /* Output needs enough K for the matmul result range */
    float out_peak = 0;
    for (int r = 0; r < n_rows; r++) {
        float m = fabsf(reference[r]);
        if (m > out_peak)
            out_peak = m;
    }
    int out_planes = (int)(out_peak / q + 1);
    if (out_planes > 4096)
        out_planes = 4096;
    printf("Output range: [-%.2f, %.2f], K_out=%d\n", out_peak, out_peak, out_planes);

    UVec *uy = uv_new(n_rows, out_planes);

    struct timespec begin, end;
    clock_gettime(CLOCK_MONOTONIC, &begin);
    uv_matmul(um, ux, uy, out_planes, q);
    clock_gettime(CLOCK_MONOTONIC, &end);
    double ms = (end.tv_sec - begin.tv_sec) * 1e3 + (end.tv_nsec - begin.tv_nsec) * 1e-6;

    float *decoded = (float *)malloc(n_rows * sizeof(float));
    uv_to_float(uy, decoded, q);

    /* Accuracy metrics against the float reference. */
    float dot = 0, ref_sq = 0, got_sq = 0, err_sq = 0;
    for (int r = 0; r < n_rows; r++) {
        dot += reference[r] * decoded[r];
        ref_sq += reference[r] * reference[r];
        got_sq += decoded[r] * decoded[r];
        float gap = reference[r] - decoded[r];
        err_sq += gap * gap;
    }
    float cosine = dot / (sqrtf(ref_sq) * sqrtf(got_sq) + 1e-10f);
    float snr = 10.0f * log10f(ref_sq / (err_sq + 1e-10f));

    printf("\nCosine: %.6f\n", cosine);
    printf("SNR:    %.1f dB\n", snr);
    printf("Time:   %.1f ms\n", ms);

    printf("\nFirst 10 values:\n");
    printf("%10s %10s %10s\n", "Ref", "Unary", "Error");
    for (int r = 0; r < 10; r++)
        printf("%10.4f %10.4f %10.4f\n", reference[r], decoded[r], reference[r] - decoded[r]);

    um_free(um);
    uv_free(ux);
    uv_free(uy);
    free(weights);
    free(input);
    free(reference);
    free(decoded);
}

/*
 * Demo: emulate a residual stream. A random 1024-element embedding receives
 * ten random deltas, once in float and once via uv_concat; after every step
 * the plane count, cosine similarity and worst absolute error are printed.
 */
void test_residual_chain() {
    printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n");

    const float q = 0.05f;
    const int n = 1024;
    const int planes = 128;   /* fits values up to 6.4 */

    srand(123);
    float *embedding = (float *)malloc(n * sizeof(float));
    for (int k = 0; k < n; k++)
        embedding[k] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f;

    /* Float reference: accumulate residuals */
    float *truth = (float *)malloc(n * sizeof(float));
    memcpy(truth, embedding, n * sizeof(float));

    /* Unary: grow via concat; capacity leaves room for 10 concat operations */
    int capacity = planes + 10 * planes;
    UVec *stream = uv_new(n, capacity);
    uv_from_float(stream, embedding, planes, q);

    printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", q, planes, n);
    printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr");

    for (int round = 1; round <= 10; round++) {
        float *update = (float *)malloc(n * sizeof(float));
        for (int k = 0; k < n; k++)
            update[k] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f;

        /* Float reference */
        for (int k = 0; k < n; k++)
            truth[k] += update[k];

        /* Unary: concat */
        UVec *update_vec = uv_new(n, planes);
        uv_from_float(update_vec, update, planes, q);
        uv_concat(stream, update_vec);

        /* Compare */
        float *decoded = (float *)malloc(n * sizeof(float));
        uv_to_float(stream, decoded, q);

        float dot = 0, truth_sq = 0, got_sq = 0, worst = 0;
        for (int k = 0; k < n; k++) {
            dot += truth[k] * decoded[k];
            truth_sq += truth[k] * truth[k];
            got_sq += decoded[k] * decoded[k];
            float gap = fabsf(truth[k] - decoded[k]);
            if (gap > worst)
                worst = gap;
        }
        float cosine = dot / (sqrtf(truth_sq) * sqrtf(got_sq) + 1e-10f);

        printf("%6d %6d %8.6f %8.4f\n", round, stream->n_slots, cosine, worst);

        uv_free(update_vec);
        free(update);
        free(decoded);
    }

    uv_free(stream);
    free(embedding);
    free(truth);
}

int main() {
    printf("================================================\n");
    printf("  PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n");
    printf("  Every bit = 1 quantum. Concat = Add.\n");
    printf("================================================\n\n");

    test_concat_correct();
    test_chain_concat();
    test_matmul();
    test_residual_chain();

    printf("\n=== DONE ===\n");
    return 0;
}