| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #define _POSIX_C_SOURCE 199309L |
| | #include <immintrin.h> |
| | #include <omp.h> |
| | #include <stdint.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <math.h> |
| | #include <stdio.h> |
| | #include <time.h> |
| |
|
| | |
| | |
| | |
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *slots; |
| | int dim; |
| | int chunks; |
| | int n_slots; |
| | int cap; |
| | } UVec; |
| |
|
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *slots; |
| | int rows, cols, chunks, K; |
| | } UMat; |
| |
|
| | |
| | typedef struct { |
| | float quantum; |
| | |
| | } USystem; |
| |
|
| | |
| | |
| | |
| | UVec* uv_new(int dim, int cap) { |
| | UVec *v = (UVec *)calloc(1, sizeof(UVec)); |
| | v->dim = dim; |
| | v->chunks = (dim + 63) / 64; |
| | v->n_slots = 0; |
| | v->cap = cap; |
| | v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); |
| | v->slots = (uint64_t *)aligned_alloc(64, (size_t)cap * v->chunks * sizeof(uint64_t)); |
| | memset(v->sign, 0, v->chunks * sizeof(uint64_t)); |
| | memset(v->slots, 0, (size_t)cap * v->chunks * sizeof(uint64_t)); |
| | return v; |
| | } |
| |
|
| | UMat* um_new(int rows, int cols, int K) { |
| | UMat *m = (UMat *)calloc(1, sizeof(UMat)); |
| | m->rows = rows; m->cols = cols; m->K = K; |
| | m->chunks = (cols + 63) / 64; |
| | m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | m->slots = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | memset(m->slots, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | return m; |
| | } |
| |
|
| | void uv_free(UVec *v) { if(v){free(v->sign);free(v->slots);free(v);} } |
| | void um_free(UMat *m) { if(m){free(m->sign);free(m->slots);free(m);} } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void uv_from_float(UVec *v, const float *x, int K, float quantum) { |
| | int dim = v->dim, chunks = v->chunks; |
| | v->n_slots = K; |
| |
|
| | memset(v->sign, 0, chunks * sizeof(uint64_t)); |
| | memset(v->slots, 0, (size_t)K * chunks * sizeof(uint64_t)); |
| |
|
| | float inv_q = 1.0f / quantum; |
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | if (x[i] < 0.0f) v->sign[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(x[i]) * inv_q + 0.5f); |
| | if (mag > K) mag = K; |
| | for (int s = 0; s < mag; s++) |
| | v->slots[(size_t)s * chunks + c] |= bit; |
| | } |
| | } |
| |
|
| | void uv_to_float(const UVec *v, float *out, float quantum) { |
| | int dim = v->dim, chunks = v->chunks; |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | int mag = 0; |
| | for (int s = 0; s < v->n_slots; s++) |
| | if (v->slots[(size_t)s * chunks + c] & bit) |
| | mag++; |
| |
|
| | out[i] = (v->sign[c] & bit) ? -(float)mag * quantum : (float)mag * quantum; |
| | } |
| | } |
| |
|
| | void um_from_float(UMat *m, const float *data, float quantum) { |
| | int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; |
| |
|
| | memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| | memset(m->slots, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); |
| |
|
| | float inv_q = 1.0f / quantum; |
| | for (int r = 0; r < rows; r++) { |
| | const float *row = data + (size_t)r * cols; |
| | uint64_t *rs = m->sign + (size_t)r * chunks; |
| |
|
| | for (int j = 0; j < cols; j++) { |
| | int c = j / 64; |
| | uint64_t bit = 1ULL << (j % 64); |
| | if (row[j] < 0.0f) rs[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(row[j]) * inv_q + 0.5f); |
| | if (mag > K) mag = K; |
| | for (int s = 0; s < mag; s++) |
| | m->slots[((size_t)s * rows + r) * chunks + c] |= bit; |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void uv_concat(UVec *dst, const UVec *src) { |
| | int chunks = dst->chunks; |
| |
|
| | for (int s = 0; s < src->n_slots; s++) { |
| | if (dst->n_slots >= dst->cap) { |
| | printf("OVERFLOW: %d/%d slots\n", dst->n_slots, dst->cap); |
| | return; |
| | } |
| |
|
| | const uint64_t *src_slot = src->slots + (size_t)s * chunks; |
| | uint64_t *new_slot = dst->slots + (size_t)dst->n_slots * chunks; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t sb = src_slot[c]; |
| | uint64_t agree = ~(dst->sign[c] ^ src->sign[c]); |
| | uint64_t disagree = dst->sign[c] ^ src->sign[c]; |
| |
|
| | |
| | uint64_t add = sb & agree; |
| |
|
| | |
| | uint64_t cancel = sb & disagree; |
| | for (int d = dst->n_slots - 1; d >= 0 && cancel; d--) { |
| | uint64_t *ds = dst->slots + (size_t)d * chunks + c; |
| | uint64_t overlap = *ds & cancel; |
| | *ds &= ~overlap; |
| | cancel &= ~overlap; |
| | } |
| | |
| | if (cancel) { |
| | dst->sign[c] ^= cancel; |
| | add |= cancel; |
| | } |
| |
|
| | new_slot[c] = add; |
| | } |
| |
|
| | |
| | int any = 0; |
| | for (int c = 0; c < chunks && !any; c++) |
| | if (new_slot[c]) any = 1; |
| | if (any) dst->n_slots++; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void uv_matmul( |
| | const UMat *M, const UVec *x, |
| | UVec *y, int K_out, float quantum |
| | ) { |
| | int out_dim = M->rows; |
| | int chunks = M->chunks; |
| | int wK = M->K; |
| | int xK = x->n_slots; |
| |
|
| | float q2 = quantum * quantum; |
| |
|
| | y->n_slots = K_out; |
| | memset(y->sign, 0, y->chunks * sizeof(uint64_t)); |
| | memset(y->slots, 0, (size_t)K_out * y->chunks * sizeof(uint64_t)); |
| |
|
| | |
| | int *acc = (int *)aligned_alloc(64, out_dim * sizeof(int)); |
| | uint8_t *neg = (uint8_t *)calloc(out_dim, 1); |
| |
|
| | #pragma omp parallel for schedule(dynamic, 32) |
| | for (int i = 0; i < out_dim; i++) { |
| | const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| | long long a = 0; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t same = ~(w_sign_row[c] ^ x->sign[c]); |
| | uint64_t diff = w_sign_row[c] ^ x->sign[c]; |
| |
|
| | for (int p = 0; p < wK; p++) { |
| | uint64_t wp = M->slots[((size_t)p * out_dim + i) * chunks + c]; |
| | for (int q = 0; q < xK; q++) { |
| | uint64_t xq = x->slots[(size_t)q * chunks + c]; |
| | uint64_t active = wp & xq; |
| | a += __builtin_popcountll(active & same) |
| | - __builtin_popcountll(active & diff); |
| | } |
| | } |
| | } |
| |
|
| | |
| | float val = (float)a * quantum; |
| | int mag = (int)(fabsf(val) + 0.5f); |
| | if (mag > K_out) mag = K_out; |
| | acc[i] = mag; |
| | neg[i] = (val < 0.0f) ? 1 : 0; |
| | } |
| |
|
| | |
| | for (int i = 0; i < out_dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| | if (neg[i]) y->sign[c] |= bit; |
| | for (int s = 0; s < acc[i]; s++) |
| | y->slots[(size_t)s * y->chunks + c] |= bit; |
| | } |
| |
|
| | free(acc); free(neg); |
| | } |
| |
|
| | |
| | |
| | |
| | void uv_rmsnorm(const UVec *x, const float *weight, UVec *out, int K_out, float quantum, float eps) { |
| | int dim = x->dim; |
| | float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | uv_to_float(x, xf, quantum); |
| |
|
| | float ss = 0.0f; |
| | for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; |
| | float rms = 1.0f / sqrtf(ss / dim + eps); |
| | for (int i = 0; i < dim; i++) xf[i] *= rms * weight[i]; |
| |
|
| | uv_from_float(out, xf, K_out, quantum); |
| | free(xf); |
| | } |
| |
|
| | |
| | |
| | |
| |
|
| | void test_concat_correct() { |
| | printf("=== CONCAT = ADD (SAME QUANTUM) ===\n\n"); |
| |
|
| | float quantum = 0.25f; |
| | int dim = 8; |
| |
|
| | |
| | |
| | |
| | |
| | float a_vals[] = {3.0, -2.0, 5.0, 1.0, 0.0, -4.0, 2.0, 7.0}; |
| | float b_vals[] = {2.0, 1.0, -3.0, 4.0, 1.0, 2.0, -1.0, -2.0}; |
| | float expect[] = {5.0, -1.0, 2.0, 5.0, 1.0, -2.0, 1.0, 5.0}; |
| |
|
| | int K = 32; |
| | UVec *a = uv_new(dim, 128); |
| | UVec *b = uv_new(dim, 128); |
| |
|
| | uv_from_float(a, a_vals, K, quantum); |
| | uv_from_float(b, b_vals, K, quantum); |
| |
|
| | float a_rec[8], b_rec[8]; |
| | uv_to_float(a, a_rec, quantum); |
| | uv_to_float(b, b_rec, quantum); |
| |
|
| | printf("Quantum = %.2f (every bit = %.2f)\n\n", quantum, quantum); |
| | printf("A original: "); for(int i=0;i<8;i++) printf("%6.2f ",a_vals[i]); printf("\n"); |
| | printf("A unary: "); for(int i=0;i<8;i++) printf("%6.2f ",a_rec[i]); printf("\n"); |
| | printf("B original: "); for(int i=0;i<8;i++) printf("%6.2f ",b_vals[i]); printf("\n"); |
| | printf("B unary: "); for(int i=0;i<8;i++) printf("%6.2f ",b_rec[i]); printf("\n\n"); |
| |
|
| | printf("A slots: %d, B slots: %d\n", a->n_slots, b->n_slots); |
| | uv_concat(a, b); |
| | printf("After concat: %d slots\n\n", a->n_slots); |
| |
|
| | float result[8]; |
| | uv_to_float(a, result, quantum); |
| |
|
| | printf("Expected A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]); printf("\n"); |
| | printf("Concat A+B: "); for(int i=0;i<8;i++) printf("%6.2f ",result[i]); printf("\n"); |
| | printf("Error: "); for(int i=0;i<8;i++) printf("%6.2f ",expect[i]-result[i]); printf("\n"); |
| |
|
| | uv_free(a); uv_free(b); |
| | } |
| |
|
| | void test_chain_concat() { |
| | printf("\n=== CHAINED CONCAT (5 additions) ===\n\n"); |
| |
|
| | float quantum = 0.1f; |
| | int dim = 4; |
| | int K = 64; |
| |
|
| | float vals[] = {1.0, -2.0, 3.0, -0.5}; |
| | UVec *acc = uv_new(dim, 512); |
| | uv_from_float(acc, vals, K, quantum); |
| |
|
| | printf("Start: "); |
| | float tmp[4]; |
| | uv_to_float(acc, tmp, quantum); |
| | for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]); |
| | printf(" (%d slots)\n", acc->n_slots); |
| |
|
| | float expected[] = {1.0, -2.0, 3.0, -0.5}; |
| |
|
| | for (int step = 0; step < 5; step++) { |
| | float add_vals[] = {0.5, 0.3, -1.0, 0.7}; |
| | UVec *delta = uv_new(dim, K); |
| | uv_from_float(delta, add_vals, K, quantum); |
| |
|
| | uv_concat(acc, delta); |
| |
|
| | for (int i = 0; i < 4; i++) expected[i] += add_vals[i]; |
| |
|
| | uv_to_float(acc, tmp, quantum); |
| | printf(" +[0.5,0.3,-1.0,0.7] = "); |
| | for(int i=0;i<4;i++) printf("%6.2f ",tmp[i]); |
| | printf(" (%d slots) expect:", acc->n_slots); |
| | for(int i=0;i<4;i++) printf("%6.2f ",expected[i]); |
| |
|
| | |
| | float max_err = 0; |
| | for(int i=0;i<4;i++) { |
| | float e = fabsf(expected[i] - tmp[i]); |
| | if (e > max_err) max_err = e; |
| | } |
| | printf(" err=%.2f\n", max_err); |
| |
|
| | uv_free(delta); |
| | } |
| |
|
| | uv_free(acc); |
| | } |
| |
|
| | void test_matmul() { |
| | printf("\n=== MATMUL (GLOBAL QUANTUM) ===\n\n"); |
| |
|
| | int rows = 512, cols = 256; |
| | int wK = 32, xK = 32; |
| |
|
| | srand(42); |
| | float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); |
| | float *xf = (float *)malloc(cols * sizeof(float)); |
| | float *y_ref = (float *)calloc(rows, sizeof(float)); |
| |
|
| | |
| | for (size_t i = 0; i < (size_t)rows * cols; i++) |
| | Mf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; |
| | for (int i = 0; i < cols; i++) |
| | xf[i] = ((float)rand() / RAND_MAX - 0.5f) * 2.0f; |
| | for (int i = 0; i < rows; i++) |
| | for (int j = 0; j < cols; j++) |
| | y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; |
| |
|
| | |
| | float data_max = 0; |
| | for (size_t i = 0; i < (size_t)rows * cols; i++) { |
| | float a = fabsf(Mf[i]); |
| | if (a > data_max) data_max = a; |
| | } |
| | for (int i = 0; i < cols; i++) { |
| | float a = fabsf(xf[i]); |
| | if (a > data_max) data_max = a; |
| | } |
| | float quantum = data_max / wK; |
| |
|
| | printf("Data range: [-%.2f, %.2f]\n", data_max, data_max); |
| | printf("Quantum: %.4f (K=%d gives range [-%d*q, %d*q])\n", quantum, wK, wK, wK); |
| | printf("Matrix: %dx%d, wK=%d, xK=%d\n\n", rows, cols, wK, xK); |
| |
|
| | UMat *M = um_new(rows, cols, wK); |
| | UVec *x = uv_new(cols, xK); |
| |
|
| | um_from_float(M, Mf, quantum); |
| | uv_from_float(x, xf, xK, quantum); |
| |
|
| | |
| | float ymax = 0; |
| | for (int i = 0; i < rows; i++) { |
| | float a = fabsf(y_ref[i]); |
| | if (a > ymax) ymax = a; |
| | } |
| | int K_out = (int)(ymax / quantum + 1); |
| | if (K_out > 4096) K_out = 4096; |
| | printf("Output range: [-%.2f, %.2f], K_out=%d\n", ymax, ymax, K_out); |
| |
|
| | UVec *y = uv_new(rows, K_out); |
| |
|
| | struct timespec t0, t1; |
| | clock_gettime(CLOCK_MONOTONIC, &t0); |
| | uv_matmul(M, x, y, K_out, quantum); |
| | clock_gettime(CLOCK_MONOTONIC, &t1); |
| | double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6; |
| |
|
| | float *yf = (float *)malloc(rows * sizeof(float)); |
| | uv_to_float(y, yf, quantum); |
| |
|
| | float dot = 0, na = 0, nb = 0, noise = 0; |
| | for (int i = 0; i < rows; i++) { |
| | dot += y_ref[i] * yf[i]; |
| | na += y_ref[i] * y_ref[i]; |
| | nb += yf[i] * yf[i]; |
| | float e = y_ref[i] - yf[i]; noise += e * e; |
| | } |
| | float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| | float snr = 10.0f * log10f(na / (noise + 1e-10f)); |
| |
|
| | printf("\nCosine: %.6f\n", cosine); |
| | printf("SNR: %.1f dB\n", snr); |
| | printf("Time: %.1f ms\n", ms); |
| |
|
| | printf("\nFirst 10 values:\n"); |
| | printf("%10s %10s %10s\n", "Ref", "Unary", "Error"); |
| | for (int i = 0; i < 10; i++) |
| | printf("%10.4f %10.4f %10.4f\n", y_ref[i], yf[i], y_ref[i] - yf[i]); |
| |
|
| | um_free(M); uv_free(x); uv_free(y); |
| | free(Mf); free(xf); free(y_ref); free(yf); |
| | } |
| |
|
| | void test_residual_chain() { |
| | printf("\n=== RESIDUAL CHAIN — CONCAT PRESERVES INFORMATION ===\n\n"); |
| |
|
| | float quantum = 0.05f; |
| | int dim = 1024; |
| | int K = 128; |
| |
|
| | srand(123); |
| | float *embed = (float *)malloc(dim * sizeof(float)); |
| | for (int i = 0; i < dim; i++) |
| | embed[i] = ((float)rand() / RAND_MAX - 0.5f) * 4.0f; |
| |
|
| | |
| | float *ref = (float *)malloc(dim * sizeof(float)); |
| | memcpy(ref, embed, dim * sizeof(float)); |
| |
|
| | |
| | int total_cap = K + 10 * K; |
| | UVec *residual = uv_new(dim, total_cap); |
| | uv_from_float(residual, embed, K, quantum); |
| |
|
| | printf("Quantum=%.2f, K=%d per sublayer, dim=%d\n\n", quantum, K, dim); |
| | printf("%6s %6s %8s %8s\n", "Step", "Slots", "Cosine", "MaxErr"); |
| |
|
| | for (int step = 0; step < 10; step++) { |
| | float *delta = (float *)malloc(dim * sizeof(float)); |
| | for (int i = 0; i < dim; i++) |
| | delta[i] = ((float)rand() / RAND_MAX - 0.5f) * 0.5f; |
| |
|
| | |
| | for (int i = 0; i < dim; i++) ref[i] += delta[i]; |
| |
|
| | |
| | UVec *d = uv_new(dim, K); |
| | uv_from_float(d, delta, K, quantum); |
| | uv_concat(residual, d); |
| |
|
| | |
| | float *rec = (float *)malloc(dim * sizeof(float)); |
| | uv_to_float(residual, rec, quantum); |
| |
|
| | float dot = 0, na = 0, nb = 0, max_err = 0; |
| | for (int i = 0; i < dim; i++) { |
| | dot += ref[i] * rec[i]; |
| | na += ref[i] * ref[i]; |
| | nb += rec[i] * rec[i]; |
| | float e = fabsf(ref[i] - rec[i]); |
| | if (e > max_err) max_err = e; |
| | } |
| | float cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| |
|
| | printf("%6d %6d %8.6f %8.4f\n", step + 1, residual->n_slots, cosine, max_err); |
| |
|
| | uv_free(d); free(delta); free(rec); |
| | } |
| |
|
| | uv_free(residual); |
| | free(embed); free(ref); |
| | } |
| |
|
| | int main() { |
| | printf("================================================\n"); |
| | printf(" PROPER UNARY — GLOBAL QUANTUM, NO LOCAL SCALES\n"); |
| | printf(" Every bit = 1 quantum. Concat = Add.\n"); |
| | printf("================================================\n\n"); |
| |
|
| | test_concat_correct(); |
| | test_chain_concat(); |
| | test_matmul(); |
| | test_residual_chain(); |
| |
|
| | printf("\n=== DONE ===\n"); |
| | return 0; |
| | } |
| |
|