| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #define _POSIX_C_SOURCE 199309L |
| | #include <immintrin.h> |
| | #include <omp.h> |
| | #include <stdint.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <math.h> |
| | #include <stdio.h> |
| | #include <time.h> |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *slots; |
| | float scale; |
| | int dim; |
| | int chunks; |
| | int n_slots; |
| | int max_slots; |
| | } GrowVec; |
| |
|
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *slots; |
| | float *scales; |
| | int rows, cols, chunks, K; |
| | } FixedMat; |
| |
|
| | |
| | |
| | |
| | GrowVec* gv_alloc(int dim, int initial_slots, int max_slots) { |
| | GrowVec *v = (GrowVec *)calloc(1, sizeof(GrowVec)); |
| | v->dim = dim; |
| | v->chunks = (dim + 63) / 64; |
| | v->n_slots = 0; |
| | v->max_slots = max_slots; |
| | v->scale = 1.0f; |
| | v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); |
| | v->slots = (uint64_t *)aligned_alloc(64, (size_t)max_slots * v->chunks * sizeof(uint64_t)); |
| | memset(v->sign, 0, v->chunks * sizeof(uint64_t)); |
| | memset(v->slots, 0, (size_t)max_slots * v->chunks * sizeof(uint64_t)); |
| | return v; |
| | } |
| |
|
| | void gv_free(GrowVec *v) { |
| | if (v) { free(v->sign); free(v->slots); free(v); } |
| | } |
| |
|
| | FixedMat* fm_alloc(int rows, int cols, int K) { |
| | FixedMat *m = (FixedMat *)calloc(1, sizeof(FixedMat)); |
| | m->rows = rows; m->cols = cols; m->K = K; |
| | m->chunks = (cols + 63) / 64; |
| | m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | m->scales = (float *)aligned_alloc(64, rows * sizeof(float)); |
| | memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | return m; |
| | } |
| |
|
| | void fm_free(FixedMat *m) { |
| | if (m) { free(m->sign); free(m->slots); free(m->scales); free(m); } |
| | } |
| |
|
| | |
| | |
| | |
| | void gv_from_float(GrowVec *v, const float *x, int K) { |
| | int dim = v->dim, chunks = v->chunks; |
| |
|
| | v->n_slots = K; |
| | memset(v->sign, 0, chunks * sizeof(uint64_t)); |
| | memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t)); |
| |
|
| | float amax = 0.0f; |
| | for (int i = 0; i < dim; i++) { |
| | float a = fabsf(x[i]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { v->scale = 1.0f; return; } |
| | v->scale = amax / K; |
| | float inv = K / amax; |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | if (x[i] < 0.0f) v->sign[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(x[i]) * inv + 0.5f); |
| | if (mag > K) mag = K; |
| | for (int s = 0; s < mag; s++) |
| | v->slots[(size_t)s * chunks + c] |= bit; |
| | } |
| | } |
| |
|
| | void gv_to_float(const GrowVec *v, float *out) { |
| | int dim = v->dim, chunks = v->chunks; |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | int mag = 0; |
| | for (int s = 0; s < v->n_slots; s++) { |
| | if (v->slots[(size_t)s * chunks + c] & bit) |
| | mag++; |
| | } |
| |
|
| | float val = (float)mag * v->scale; |
| | out[i] = (v->sign[c] & bit) ? -val : val; |
| | } |
| | } |
| |
|
| | void fm_from_float(FixedMat *m, const float *data) { |
| | int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; |
| |
|
| | memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| | memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); |
| |
|
| | for (int r = 0; r < rows; r++) { |
| | const float *row = data + (size_t)r * cols; |
| | float amax = 0.0f; |
| | for (int j = 0; j < cols; j++) { |
| | float a = fabsf(row[j]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { m->scales[r] = 1.0f; continue; } |
| | m->scales[r] = amax / K; |
| | float inv = K / amax; |
| |
|
| | uint64_t *rs = m->sign + (size_t)r * chunks; |
| | for (int j = 0; j < cols; j++) { |
| | int c = j / 64; |
| | uint64_t bit = 1ULL << (j % 64); |
| | if (row[j] < 0.0f) rs[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(row[j]) * inv + 0.5f); |
| | if (mag > K) mag = K; |
| | for (int s = 0; s < mag; s++) |
| | m->slots[((size_t)s * rows + r) * chunks + c] |= bit; |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | void gv_concat(GrowVec *dst, const GrowVec *src) { |
| | int chunks = dst->chunks; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | for (int s = 0; s < src->n_slots; s++) { |
| | const uint64_t *src_slot = src->slots + (size_t)s * chunks; |
| |
|
| | |
| | int new_slot = dst->n_slots; |
| | if (new_slot >= dst->max_slots) { |
| | |
| | printf("WARNING: GrowVec overflow (%d >= %d slots)\n", new_slot, dst->max_slots); |
| | return; |
| | } |
| | uint64_t *dst_new = dst->slots + (size_t)new_slot * chunks; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t src_bits = src_slot[c]; |
| | uint64_t agree = ~(dst->sign[c] ^ src->sign[c]); |
| | uint64_t disagree = dst->sign[c] ^ src->sign[c]; |
| |
|
| | |
| | uint64_t to_add = src_bits & agree; |
| |
|
| | |
| | uint64_t to_cancel = src_bits & disagree; |
| |
|
| | |
| | for (int d = dst->n_slots - 1; d >= 0 && to_cancel; d--) { |
| | uint64_t *dslot = dst->slots + (size_t)d * chunks + c; |
| | uint64_t overlap = *dslot & to_cancel; |
| | *dslot &= ~overlap; |
| | to_cancel &= ~overlap; |
| | } |
| |
|
| | |
| | |
| | if (to_cancel) { |
| | dst->sign[c] ^= to_cancel; |
| | to_add |= to_cancel; |
| | } |
| |
|
| | dst_new[c] = to_add; |
| | } |
| |
|
| | |
| | int non_empty = 0; |
| | for (int c = 0; c < chunks && !non_empty; c++) |
| | if (dst_new[c]) non_empty = 1; |
| | if (non_empty) |
| | dst->n_slots++; |
| | } |
| | } |
| |
|
| | |
| | void gv_concat_fast(GrowVec *dst, const GrowVec *src) { |
| | int chunks = dst->chunks; |
| | int src_slots = src->n_slots; |
| |
|
| | if (dst->n_slots + src_slots > dst->max_slots) { |
| | printf("WARNING: GrowVec overflow\n"); |
| | src_slots = dst->max_slots - dst->n_slots; |
| | } |
| |
|
| | |
| | |
| | memcpy(dst->slots + (size_t)dst->n_slots * chunks, |
| | src->slots, |
| | (size_t)src_slots * chunks * sizeof(uint64_t)); |
| | dst->n_slots += src_slots; |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void gv_matmul( |
| | const FixedMat *M, |
| | const GrowVec *x, |
| | GrowVec *y, |
| | int K_out |
| | ) { |
| | int out_dim = M->rows; |
| | int chunks = M->chunks; |
| | int wK = M->K; |
| | int xK = x->n_slots; |
| |
|
| | float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); |
| |
|
| | #pragma omp parallel for schedule(dynamic, 32) |
| | for (int i = 0; i < out_dim; i++) { |
| | const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| | long long acc = 0; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t ws = w_sign_row[c]; |
| | uint64_t xs = x->sign[c]; |
| | uint64_t same = ~(ws ^ xs); |
| | uint64_t diff = ws ^ xs; |
| |
|
| | for (int p = 0; p < wK; p++) { |
| | uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c]; |
| |
|
| | for (int q = 0; q < xK; q++) { |
| | uint64_t xq = x->slots[(size_t)q * chunks + c]; |
| | uint64_t active = wp & xq; |
| | acc += __builtin_popcountll(active & same) |
| | - __builtin_popcountll(active & diff); |
| | } |
| | } |
| | } |
| |
|
| | y_float[i] = (float)acc * M->scales[i] * x->scale; |
| | } |
| |
|
| | |
| | gv_from_float(y, y_float, K_out); |
| | free(y_float); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void gv_rmsnorm(const GrowVec *x, const float *weight, GrowVec *out, int K_out, float eps) { |
| | int dim = x->dim; |
| | float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | gv_to_float(x, xf); |
| |
|
| | float ss = 0.0f; |
| | for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; |
| | float rms = 1.0f / sqrtf(ss / dim + eps); |
| | for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i]; |
| |
|
| | gv_from_float(out, xf, K_out); |
| | free(xf); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | void gv_silu_mul(const GrowVec *gate, const GrowVec *up, GrowVec *out, int K_out) { |
| | int dim = gate->dim; |
| | float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | gv_to_float(gate, gf); |
| | gv_to_float(up, uf); |
| |
|
| | for (int i = 0; i < dim; i++) |
| | gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; |
| |
|
| | gv_from_float(out, gf, K_out); |
| | free(gf); free(uf); |
| | } |
| |
|
| | |
| | |
| | |
| | void test_concat_add() { |
| | printf("=== CONCATENATION = ADDITION TEST ===\n\n"); |
| |
|
| | int dim = 16; |
| |
|
| | |
| | float a_vals[] = {3, -2, 5, 1, 0, -4, 2, 7, -1, 3, 6, -5, 2, 0, -3, 4}; |
| | float b_vals[] = {2, 1, -3, 4, 1, 2, -1, -2, 3, -1, 1, 2, -2, 5, 1, -1}; |
| |
|
| | GrowVec *a = gv_alloc(dim, 8, 64); |
| | GrowVec *b = gv_alloc(dim, 8, 64); |
| | gv_from_float(a, a_vals, 8); |
| | gv_from_float(b, b_vals, 8); |
| |
|
| | printf("A (K=%d slots, scale=%.3f):\n", a->n_slots, a->scale); |
| | float af[16], bf[16]; |
| | gv_to_float(a, af); |
| | printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", a_vals[i]); printf("\n"); |
| | printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", af[i]); printf("\n"); |
| |
|
| | printf("\nB (K=%d slots, scale=%.3f):\n", b->n_slots, b->scale); |
| | gv_to_float(b, bf); |
| | printf(" Original: "); for (int i = 0; i < 8; i++) printf("%6.2f ", b_vals[i]); printf("\n"); |
| | printf(" Recovered:"); for (int i = 0; i < 8; i++) printf("%6.2f ", bf[i]); printf("\n"); |
| |
|
| | |
| | printf("\nA + B via CONCATENATION (slots: %d + %d", a->n_slots, b->n_slots); |
| |
|
| | |
| | |
| | |
| | gv_concat(a, b); |
| | printf(" -> %d):\n", a->n_slots); |
| |
|
| | float result[16], ref[16]; |
| | gv_to_float(a, result); |
| | for (int i = 0; i < 16; i++) ref[i] = a_vals[i] + b_vals[i]; |
| |
|
| | |
| | |
| | |
| |
|
| | printf(" Float A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", ref[i]); printf("\n"); |
| | printf(" Concat A+B: "); for (int i = 0; i < 8; i++) printf("%6.2f ", result[i]); printf("\n"); |
| |
|
| | gv_free(a); gv_free(b); |
| | } |
| |
|
| | void test_growing_residual() { |
| | printf("\n=== GROWING RESIDUAL STREAM TEST ===\n"); |
| | printf("Simulating 6 transformer layers with concat residuals\n\n"); |
| |
|
| | int dim = 2560; |
| | int K_embed = 16; |
| | int K_sublayer = 8; |
| | int n_layers = 6; |
| |
|
| | |
| | float *embed = (float *)malloc(dim * sizeof(float)); |
| | srand(42); |
| | for (int i = 0; i < dim; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | embed[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| |
|
| | |
| | int max_slots = K_embed + n_layers * 2 * K_sublayer + 64; |
| | GrowVec *residual = gv_alloc(dim, K_embed, max_slots); |
| | gv_from_float(residual, embed, K_embed); |
| |
|
| | printf("After embedding: %d slots (%.1f KB)\n", |
| | residual->n_slots, |
| | (float)residual->n_slots * residual->chunks * 8 / 1024); |
| |
|
| | for (int l = 0; l < n_layers; l++) { |
| | |
| | GrowVec *attn_out = gv_alloc(dim, K_sublayer, K_sublayer); |
| | float *fake_attn = (float *)malloc(dim * sizeof(float)); |
| | for (int i = 0; i < dim; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | fake_attn[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f; |
| | } |
| | gv_from_float(attn_out, fake_attn, K_sublayer); |
| | |
| | attn_out->scale = residual->scale; |
| |
|
| | |
| | gv_concat_fast(residual, attn_out); |
| |
|
| | |
| | GrowVec *mlp_out = gv_alloc(dim, K_sublayer, K_sublayer); |
| | float *fake_mlp = (float *)malloc(dim * sizeof(float)); |
| | for (int i = 0; i < dim; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | fake_mlp[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2) * 0.1f; |
| | } |
| | gv_from_float(mlp_out, fake_mlp, K_sublayer); |
| | mlp_out->scale = residual->scale; |
| |
|
| | |
| | gv_concat_fast(residual, mlp_out); |
| |
|
| | printf("After layer %d: %d slots (%.1f KB) [+%d attn +%d mlp]\n", |
| | l + 1, residual->n_slots, |
| | (float)residual->n_slots * residual->chunks * 8 / 1024, |
| | K_sublayer, K_sublayer); |
| |
|
| | gv_free(attn_out); gv_free(mlp_out); |
| | free(fake_attn); free(fake_mlp); |
| | } |
| |
|
| | printf("\nResidual grew from %d to %d slots through %d layers\n", |
| | K_embed, residual->n_slots, n_layers); |
| | printf("Information accumulated, never lost to requantization\n"); |
| |
|
| | gv_free(residual); |
| | free(embed); |
| | } |
| |
|
| | void test_matmul_accuracy() { |
| | printf("\n=== MATMUL ACCURACY WITH GROWING VECTORS ===\n"); |
| |
|
| | int rows = 512, cols = 2560; |
| | int wK = 32; |
| |
|
| | printf("Matrix: %dx%d, wK=%d\n", rows, cols, wK); |
| | printf("\n%6s %8s %8s %8s\n", "xSlots", "Cosine", "SNR_dB", "ms"); |
| |
|
| | srand(42); |
| | float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); |
| | float *xf = (float *)malloc(cols * sizeof(float)); |
| | float *y_ref = (float *)calloc(rows, sizeof(float)); |
| |
|
| | for (size_t i = 0; i < (size_t)rows * cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < rows; i++) |
| | for (int j = 0; j < cols; j++) |
| | y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; |
| |
|
| | FixedMat *M = fm_alloc(rows, cols, wK); |
| | fm_from_float(M, Mf); |
| |
|
| | |
| | int x_slots[] = {8, 16, 32, 48, 64, 96}; |
| | for (int t = 0; t < 6; t++) { |
| | int xK = x_slots[t]; |
| | GrowVec *x = gv_alloc(cols, xK, xK); |
| | GrowVec *y = gv_alloc(rows, xK, xK); |
| | gv_from_float(x, xf, xK); |
| |
|
| | struct timespec t0, t1; |
| | float *yf = (float *)malloc(rows * sizeof(float)); |
| |
|
| | clock_gettime(CLOCK_MONOTONIC, &t0); |
| | gv_matmul(M, x, y, xK); |
| | clock_gettime(CLOCK_MONOTONIC, &t1); |
| | double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6; |
| |
|
| | gv_to_float(y, yf); |
| |
|
| | float dot = 0, na = 0, nb = 0, noise = 0; |
| | for (int i = 0; i < rows; i++) { |
| | dot += y_ref[i] * yf[i]; |
| | na += y_ref[i] * y_ref[i]; |
| | nb += yf[i] * yf[i]; |
| | float e = y_ref[i] - yf[i]; |
| | noise += e * e; |
| | } |
| | float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| | float snr = 10.0f * log10f(na / (noise + 1e-10f)); |
| |
|
| | printf("%6d %8.6f %8.1f %8.1f\n", xK, cosine, snr, ms); |
| |
|
| | gv_free(x); gv_free(y); free(yf); |
| | } |
| |
|
| | fm_free(M); |
| | free(Mf); free(xf); free(y_ref); |
| | } |
| |
|
| | int main() { |
| | printf("========================================\n"); |
| | printf(" CONCATENATIVE UNARY ENGINE TESTS\n"); |
| | printf(" Addition = Concatenation\n"); |
| | printf(" Value = Count of Ones\n"); |
| | printf("========================================\n"); |
| |
|
| | test_concat_add(); |
| | test_growing_residual(); |
| | test_matmul_accuracy(); |
| |
|
| | printf("\n=== ALL TESTS DONE ===\n"); |
| | return 0; |
| | } |
| |
|