/*
 * CONCATENATIVE UNARY ENGINE
 *
 * In base-1, the value IS the count of ones.
 * Addition = concatenation of bitstreams.
 * Multiplication = AND + count.
 *
 * REPRESENTATION:
 *   Each element of a vector has:
 *     - A sign bit (positive/negative)
 *     - A magnitude = number of 1-bits across K "slots"
 *
 *   But crucially, when we ADD two unary vectors (residual connection),
 *   we DON'T dequantize-add-requantize. We CONCATENATE the slots.
 *
 *   If vector A has K_a slots and vector B has K_b slots,
 *   A + B has K_a + K_b slots. The magnitude of element j is
 *   just the total count of 1-bits at position j across ALL slots.
 *
 *   This means the residual stream GROWS through the network:
 *     After embed:  K_0 slots
 *     After layer 1: K_0 + K_attn + K_mlp slots
 *     After layer L: K_0 + L*(K_attn + K_mlp) slots
 *
 *   No information is ever destroyed by requantization.
 *
 * MATMUL:
 *   y = W @ x where W has K_w slots and x has K_x slots.
 *   For each output element y[i]:
 *     For each slot pair (p from W, q from x):
 *       count += popcount(W_slot_p[i] AND x_slot_q AND same_sign)
 *              - popcount(W_slot_p[i] AND x_slot_q AND diff_sign)
 *   Output gets K_out = some fixed number of slots (requantized)
 *   because matmul output magnitude is in a different scale.
 *
 * SAME-SIGN ADD (residual):
 *   Just append slots. Zero compute.
 *   For different signs: need cancellation.
 *   In practice residual connections are same-sign-dominant,
 *   so we track sign separately and concat magnitudes,
 *   deferring cancellation to the next norm.
 *
 * (c) 2026 OpenTransformers Ltd / Scott Bisset
 */

#define _POSIX_C_SOURCE 199309L
#include <immintrin.h>
#include <omp.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include <time.h>

/* ============================================================
 * GROWABLE UNARY VECTOR
 *
 * The key data structure. Slots can be appended (concat = add).
 * Each slot is a bitplane of dim bits packed into uint64 chunks.
 *
 * sign:     uint64[chunks]        — per-element sign
 * slots:    uint64[n_slots * chunks] — each slot is chunks uint64s
 * n_slots:  current number of slots (grows via concat)
 * max_slots: allocated capacity
 *
 * For element j:
 *   magnitude = number of slots where bit j is set
 *   value = sign * magnitude * scale
 *
 * ============================================================ */
typedef struct {
    uint64_t *sign;
    uint64_t *slots;     /* contiguous: slot 0 at [0..chunks-1], slot 1 at [chunks..2*chunks-1], etc */
    float     scale;     /* per-vector scale factor */
    int       dim;
    int       chunks;    /* (dim+63)/64 */
    int       n_slots;   /* current slot count */
    int       max_slots; /* allocated capacity */
} GrowVec;

/* Fixed-size unary matrix (weights don't grow) */
typedef struct {
    uint64_t *sign;     /* [rows * chunks] */
    uint64_t *slots;    /* [K * rows * chunks] */
    float    *scales;   /* [rows] per-row scale */
    int       rows, cols, chunks, K;
} FixedMat;

/* ============================================================
 * ALLOCATION
 * ============================================================ */
GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) {
    GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec));
    v->dim = dim;
    v->chunks = (dim + 63) / 64;
    v->n_slots = 0;
    v->max_slots = max_slots;
    v->scale = 1.0f;
    v->sign  = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t));
    v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t));
    memset(v->sign, 0, v->chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t));
    return v;
}

void gv_free(GrowVec *v) {
    if (v) { free(v->sign); free(v->slots); free(v); }
}

FixedMat* fm_alloc(int rows, int cols, int K) {
    FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat));
    m->rows = rows; m->cols = cols; m->K = K;
    m->chunks = (cols + 63) / 64;
    m->sign   = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t));
    m->slots  = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t));
    m->scales = (float *)aligned_alloc(64, rows * sizeof(float));
    memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t));
    return m;
}

void fm_free(FixedMat *m) {
    if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); }
}

/* ============================================================
 * FLOAT → UNARY CONVERSION (only at boundaries)
 * ============================================================ */
void gv_from_float(GrowVec *v, const float *x, int K) {
    int dim = v->dim, chunks = v->chunks;

    v->n_slots = K;
    memset(v->sign, 0, chunks * sizeof(uint64_t));
    memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t));

    float amax = 0.0f;
    for (int i = 0; i < dim; i++) {
        float a = fabsf(x[i]);
        if (a > amax) amax = a;
    }
    if (amax == 0.0f) { v->scale = 1.0f; return; }
    v->scale = amax / K;
    float inv = K / amax;

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        if (x[i] < 0.0f) v->sign[c] |= bit;

        int mag = (int)(fabsf(x[i]) * inv + 0.5f);
        if (mag > K) mag = K;
        for (int s = 0; s < mag; s++)
            v->slots[(size_t)s * chunks + c] |= bit;
    }
}

void gv_to_float(const GrowVec *v, float *out) {
    int dim = v->dim, chunks = v->chunks;

    for (int i = 0; i < dim; i++) {
        int c = i / 64;
        uint64_t bit = 1ULL << (i % 64);

        int mag = 0;
        for (int s = 0; s < v->n_slots; s++) {
            if (v->slots[(size_t)s * chunks + c] & bit)
                mag++;
        }

        float val = (float)mag * v->scale;
        out[i] = (v->sign[c] & bit) ? -val : val;
    }
}

void fm_from_float(FixedMat *m, const float *data) {
    int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks;

    memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t));
    memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t));

    for (int r = 0; r < rows; r++) {
        const float *row = data + (size_t)r * cols;
        float amax = 0.0f;
        for (int j = 0; j < cols; j++) {
            float a = fabsf(row[j]);
            if (a > amax) amax = a;
        }
        if (amax == 0.0f) { m->scales[r] = 1.0f; continue; }
        m->scales[r] = amax / K;
        float inv = K / amax;

        uint64_t *rs = m->sign + (size_t)r * chunks;
        for (int j = 0; j < cols; j++) {
            int c = j / 64;
            uint64_t bit = 1ULL << (j % 64);
            if (row[j] < 0.0f) rs[c] |= bit;

            int mag = (int)(fabsf(row[j]) * inv + 0.5f);
            if (mag > K) mag = K;
            for (int s = 0; s < mag; s++)
                m->slots[((size_t)s * rows + r) * chunks + c] |= bit;
        }
    }
}

/* ============================================================
 * CONCATENATION = ADDITION
 *
 * gv_concat(dst, src):
 *   Appends src's slots to dst.
 *   Same-sign: just append.
 *   Different-sign: cancel bits (remove from both).
 *
 * For efficiency with residual connections where scales differ:
 *   We track a "slot_scales" or use a single scale with normalization.
 *
 * SIMPLE VERSION: assumes same scale (works after norm).
 * ============================================================ */

/* Simple concat: append src slots to dst. Handles sign cancellation. */
void gv_concat(GrowVec *dst, const GrowVec *src) {
    int chunks = dst->chunks;

    /* For each source slot, process element-wise:
     * Where signs agree: copy bit to new dst slot
     * Where signs differ: cancel - find a dst slot with that bit set and clear it
     *
     * Optimization: for most transformer residuals, signs mostly agree.
     * So we do the simple thing: compute per-element sign agreement,
     * then for agreeing elements just append, for disagreeing elements cancel.
     */

    /* Sign agreement mask */
    /* agree[c] = ~(dst_sign[c] ^ src_sign[c])  — bits where signs match */

    for (int s = 0; s < src->n_slots; s++) {
        const uint64_t *src_slot = src->slots + (size_t)s * chunks;

        /* Split into agree and disagree portions */
        int new_slot = dst->n_slots;
        if (new_slot >= dst->max_slots) {
            /* Out of room — would need realloc in production */
            printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots);
            return;
        }
        uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks;

        for (int c = 0; c < chunks; c++) {
            uint64_t src_bits = src_slot[c];
            uint64_t agree = ~(dst->sign[c] ^ src->sign[c]);
            uint64_t disagree = dst->sign[c] ^ src->sign[c];

            /* Same sign: just append to new slot */
            uint64_t to_add = src_bits & agree;

            /* Different sign: cancel from existing dst slots */
            uint64_t to_cancel = src_bits & disagree;

            /* Cancel by walking backwards through dst slots */
            for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) {
                uint64_t *dslot = dst->slots + (size_t)d * chunks + c;
                uint64_t overlap = *dslot & to_cancel;
                *dslot &= ~overlap;      /* clear cancelled bits in dst */
                to_cancel &= ~overlap;   /* mark as cancelled */
            }

            /* Any remaining to_cancel means src > dst for those elements
             * — flip the sign and add to new slot */
            if (to_cancel) {
                dst->sign[c] ^= to_cancel;  /* flip sign for these elements */
                to_add |= to_cancel;
            }

            dst_new[c] = to_add;
        }

        /* Only increment if new slot is non-empty */
        int non_empty = 0;
        for (int c = 0; c < chunks && !non_empty; c++)
            if (dst_new[c]) non_empty = 1;
        if (non_empty)
            dst->n_slots++;
    }
}

/* Fast concat for SAME SCALE, SAME SIGN pattern (most common in residuals) */
void gv_concat_fast(GrowVec *dst, const GrowVec *src) {
    int chunks = dst->chunks;
    int src_slots = src->n_slots;

    if (dst->n_slots + src_slots > dst->max_slots) {
        printf("WARNING: GrowVec overflow\n");
        src_slots = dst->max_slots - dst->n_slots;
    }

    /* Just memcpy the slots — handles same-sign correctly,
     * defers opposite-sign cancellation to next norm */
    memcpy(dst->slots + (size_t)dst->n_slots * chunks,
           src->slots,
           (size_t)src_slots * chunks * sizeof(uint64_t));
    dst->n_slots += src_slots;
}

/* ============================================================
 * MATMUL: y = M @ x
 *
 * M is fixed (K_w slots), x is growable (n_slots slots).
 * Output is a NEW GrowVec with K_out slots.
 *
 * Core: for each output element i, accumulate:
 *   acc += popcount(M_slot_p[i] AND x_slot_q AND agree_sign)
 *        - popcount(M_slot_p[i] AND x_slot_q AND disagree_sign)
 *
 * Then quantize acc to K_out unary slots.
 * ============================================================ */
void gv_matmul(
    const FixedMat *M,
    const GrowVec *x,
    GrowVec *y,         /* output — gets filled with K_out slots */
    int K_out           /* how many output slots */
) {
    int out_dim = M->rows;
    int chunks = M->chunks;
    int wK = M->K;
    int xK = x->n_slots;

    float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float));

    #pragma omp parallel for schedule(dynamic, 32)
    for (int i = 0; i < out_dim; i++) {
        const uint64_t *w_sign_row = M->sign + (size_t)i * chunks;
        long long acc = 0;

        for (int c = 0; c < chunks; c++) {
            uint64_t ws = w_sign_row[c];
            uint64_t xs = x->sign[c];
            uint64_t same = ~(ws ^ xs);
            uint64_t diff = ws ^ xs;

            for (int p = 0; p < wK; p++) {
                uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c];

                for (int q = 0; q < xK; q++) {
                    uint64_t xq = x->slots[(size_t)q * chunks + c];
                    uint64_t active = wp & xq;
                    acc += __builtin_popcountll(active & same)
                         - __builtin_popcountll(active & diff);
                }
            }
        }

        y_float[i] = (float)acc * M->scales[i] * x->scale;
    }

    /* Quantize to K_out slots */
    gv_from_float(y, y_float, K_out);
    free(y_float);
}

/* ============================================================
 * NORM: GrowVec → GrowVec with controlled slot count
 *
 * RMSNorm dequantizes (counting), normalizes (float),
 * then requantizes to a fixed K.
 * This is where slot count gets reset.
 * ============================================================ */
void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) {
    int dim = x->dim;
    float *xf = (float *)aligned_alloc(64, dim * sizeof(float));
    gv_to_float(x, xf);

    float ss = 0.0f;
    for (int i = 0; i < dim; i++) ss += xf[i] * xf[i];
    float rms = 1.0f / sqrtf(ss / dim + eps);
    for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i];

    gv_from_float(out, xf, K_out);
    free(xf);
}

/* ============================================================
 * SILU_MUL: out = SiLU(gate) * up
 * Dequant, compute, requant. O(dim).
 * ============================================================ */
void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) {
    int dim = gate->dim;
    float *gf = (float *)aligned_alloc(64, dim * sizeof(float));
    float *uf = (float *)aligned_alloc(64, dim * sizeof(float));
    gv_to_float(gate, gf);
    gv_to_float(up, uf);

    for (int i = 0; i < dim; i++)
        gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i];

    gv_from_float(out, gf, K_out);
    free(gf); free(uf);
}

/* ============================================================
 * TEST: demonstrate growing residual stream
 * ============================================================ */
void test_concat_add() {
    printf("=== CONCATENATION = ADDITION TEST ===\n\n");

    int dim = 16;

    /* Create vector A = [3, -2, 5, 1, ...] quantized to K=8 */
    float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4};
    float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1};

    GrowVec *a = gv_alloc(dim, 8, 64);
    GrowVec *b = gv_alloc(dim, 8, 64);
    gv_from_float(a, a_vals, 8);
    gv_from_float(b, b_vals, 8);

    printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale);
    float af[16], bf[16];
    gv_to_float(a, af);
    printf("  Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n");
    printf("  Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n");

    printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale);
    gv_to_float(b, bf);
    printf("  Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n");
    printf("  Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n");

    /* Concatenate (= add) */
    printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots);

    /* Need same scale for concat to work correctly */
    /* In a real network, both come from norm so they have comparable scale */
    /* For this test, use fast concat (no cancellation) */
    gv_concat(a, b);
    printf(" -> %d):\n", a->n_slots);

    float result[16], ref[16];
    gv_to_float(a, result);
    for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i];

    /* NOTE: concat addition only works correctly when scales match.
     * When scales differ, we'd need to adjust. In a transformer,
     * the norm before each sublayer ensures comparable scales. */

    printf("  Float A+B:  "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n");
    printf("  Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n");

    gv_free(a); gv_free(b);
}

void test_growing_residual() {
    printf("\n=== GROWING RESIDUAL STREAM TEST ===\n");
    printf("Simulating 6 transformer layers with concat residuals\n\n");

    int dim = 2560;
    int K_embed = 16;      /* initial embedding quantization */
    int K_sublayer = 8;    /* each sublayer output */
    int n_layers = 6;

    /* Create random embedding */
    float *embed = (float *)malloc(dim * sizeof(float));
    srand(42);
    for (int i = 0; i < dim; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }

    /* Max slots: K_embed + n_layers * 2 * K_sublayer (attn + mlp per layer) */
    int max_slots = K_embed + n_layers * 2 * K_sublayer + 64;
    GrowVec *residual = gv_alloc(dim, K_embed, max_slots);
    gv_from_float(residual, embed, K_embed);

    printf("After embedding: %d slots (%.1f KB)\n",
           residual->n_slots,
           (float)residual->n_slots * residual->chunks * 8 / 1024);

    for (int l = 0; l < n_layers; l++) {
        /* Simulate attention output */
        GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer);
        float *fake_attn = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
        }
        gv_from_float(attn_out, fake_attn, K_sublayer);
        /* Scale must match for concat to work — in real net, norm handles this */
        attn_out->scale = residual->scale;

        /* RESIDUAL ADD = CONCATENATION */
        gv_concat_fast(residual, attn_out);

        /* Simulate MLP output */
        GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer);
        float *fake_mlp = (float *)malloc(dim * sizeof(float));
        for (int i = 0; i < dim; i++) {
            float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
            fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f;
        }
        gv_from_float(mlp_out, fake_mlp, K_sublayer);
        mlp_out->scale = residual->scale;

        /* RESIDUAL ADD = CONCATENATION */
        gv_concat_fast(residual, mlp_out);

        printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n",
               l + 1, residual->n_slots,
               (float)residual->n_slots * residual->chunks * 8 / 1024,
               K_sublayer, K_sublayer);

        gv_free(attn_out); gv_free(mlp_out);
        free(fake_attn); free(fake_mlp);
    }

    printf("\nResidual grew from %d to %d slots through %d layers\n",
           K_embed, residual->n_slots, n_layers);
    printf("Information accumulated, never lost to requantization\n");

    gv_free(residual);
    free(embed);
}

void test_matmul_accuracy() {
    printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n");

    int rows = 512, cols = 2560;
    int wK = 32;

    printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK);
    printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms");

    srand(42);
    float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float));
    float *xf = (float *)malloc(cols * sizeof(float));
    float *y_ref = (float *)calloc(rows, sizeof(float));

    for (size_t i = 0; i < (size_t)rows * cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < cols; i++) {
        float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f);
        xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2);
    }
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            y_ref[i] += Mf[(size_t)i * cols + j] * xf[j];

    FixedMat *M = fm_alloc(rows, cols, wK);
    fm_from_float(M, Mf);

    /* Test with different x slot counts (simulating growing residual) */
    int x_slots[] = {8, 16, 32, 48, 64, 96};
    for (int t = 0; t < 6; t++) {
        int xK = x_slots[t];
        GrowVec *x = gv_alloc(cols, xK, xK);
        GrowVec *y = gv_alloc(rows, xK, xK);
        gv_from_float(x, xf, xK);

        struct timespec t0, t1;
        float *yf = (float *)malloc(rows * sizeof(float));

        clock_gettime(CLOCK_MONOTONIC, &t0);
        gv_matmul(M, x, y, xK);
        clock_gettime(CLOCK_MONOTONIC, &t1);
        double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6;

        gv_to_float(y, yf);

        float dot = 0, na = 0, nb = 0, noise = 0;
        for (int i = 0; i < rows; i++) {
            dot += y_ref[i] * yf[i];
            na += y_ref[i] * y_ref[i];
            nb += yf[i] * yf[i];
            float e = y_ref[i] - yf[i];
            noise += e * e;
        }
        float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f);
        float snr = 10.0f * log10f(na / (noise + 1e-10f));

        printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms);

        gv_free(x); gv_free(y); free(yf);
    }

    fm_free(M);
    free(Mf); free(xf); free(y_ref);
}

int main() {
    printf("========================================\n");
    printf("  CONCATENATIVE UNARY ENGINE TESTS\n");
    printf("  Addition = Concatenation\n");
    printf("  Value = Count of Ones\n");
    printf("========================================\n");

    test_concat_add();
    test_growing_residual();
    test_matmul_accuracy();

    printf("\n=== ALL TESTS DONE ===\n");
    return 0;
}