| | #define _POSIX_C_SOURCE 199309L |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #include <immintrin.h> |
| | #include <omp.h> |
| | #include <stdint.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <math.h> |
| | #include <stdio.h> |
| | #include <time.h> |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/*
 * Bit-plane ("log-unary" / thermometer) encoding of a 1-D float vector.
 * Bit i of plane p is set when |x[i]| >= base_scale * 2^(p - bias), so the
 * set planes for one element form a contiguous run starting at plane 0
 * (see lut_from_float / lut_to_float).
 */
typedef struct {
    uint64_t *sign;    /* chunks words; bit i set => element i is negative */
    uint64_t *planes;  /* n_planes * chunks words, plane-major layout */
    int dim;           /* number of elements */
    int chunks;        /* ceil(dim / 64): 64-bit words per plane */
    int n_planes;      /* number of magnitude (thermometer) planes */
    int bias;          /* exponent bias applied to plane thresholds */
    float base_scale;  /* scale tying plane thresholds to the encoded data */
} LogUnaryTensor;
| |
|
| | |
/*
 * Row-major matrix in the same log-unary bit-plane encoding, with an
 * independent magnitude scale per row (assigned by lum_from_float).
 */
typedef struct {
    uint64_t *sign;      /* rows * chunks words; per-row sign bits */
    uint64_t *planes;    /* n_planes * rows * chunks words: plane-major, then row */
    float *row_scales;   /* rows entries; per-row base scale */
    int rows;            /* output dimension for matvec */
    int cols;            /* input dimension for matvec */
    int chunks;          /* ceil(cols / 64): words per row per plane */
    int n_planes;        /* number of magnitude planes */
    int bias;            /* exponent bias applied to plane thresholds */
} LogUnaryMatrix;
| |
|
| | |
| | |
| | |
| | LogUnaryTensor* lut_alloc(int dim, int n_planes, int bias) { |
| | LogUnaryTensor *t = (LogUnaryTensor *)calloc(1, sizeof(LogUnaryTensor)); |
| | t->dim = dim; |
| | t->n_planes = n_planes; |
| | t->bias = bias; |
| | t->chunks = (dim + 63) / 64; |
| | t->base_scale = 1.0f; |
| | t->sign = (uint64_t *)aligned_alloc(64, t->chunks * sizeof(uint64_t)); |
| | t->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| | memset(t->sign, 0, t->chunks * sizeof(uint64_t)); |
| | memset(t->planes, 0, (size_t)n_planes * t->chunks * sizeof(uint64_t)); |
| | return t; |
| | } |
| |
|
| | LogUnaryMatrix* lum_alloc(int rows, int cols, int n_planes, int bias) { |
| | LogUnaryMatrix *m = (LogUnaryMatrix *)calloc(1, sizeof(LogUnaryMatrix)); |
| | m->rows = rows; |
| | m->cols = cols; |
| | m->n_planes = n_planes; |
| | m->bias = bias; |
| | m->chunks = (cols + 63) / 64; |
| | m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | m->planes = (uint64_t *)aligned_alloc(64, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| | m->row_scales = (float *)aligned_alloc(64, rows * sizeof(float)); |
| | memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | memset(m->planes, 0, (size_t)n_planes * rows * m->chunks * sizeof(uint64_t)); |
| | for (int i = 0; i < rows; i++) m->row_scales[i] = 1.0f; |
| | return m; |
| | } |
| |
|
| | void lut_free(LogUnaryTensor *t) { |
| | if (t) { free(t->sign); free(t->planes); free(t); } |
| | } |
| | void lum_free(LogUnaryMatrix *m) { |
| | if (m) { free(m->sign); free(m->planes); free(m->row_scales); free(m); } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | void lut_from_float(LogUnaryTensor *t, const float *x) { |
| | int dim = t->dim; |
| | int np = t->n_planes; |
| | int bias = t->bias; |
| | int chunks = t->chunks; |
| |
|
| | memset(t->sign, 0, chunks * sizeof(uint64_t)); |
| | memset(t->planes, 0, (size_t)np * chunks * sizeof(uint64_t)); |
| |
|
| | |
| | float amax = 0.0f; |
| | for (int i = 0; i < dim; i++) { |
| | float a = fabsf(x[i]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { t->base_scale = 1.0f; return; } |
| |
|
| | |
| | |
| | t->base_scale = amax / ldexpf(1.0f, np - 1 - bias); |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | if (x[i] < 0.0f) t->sign[c] |= bit; |
| |
|
| | float mag = fabsf(x[i]); |
| | |
| | for (int p = 0; p < np; p++) { |
| | float thresh = t->base_scale * ldexpf(1.0f, p - bias); |
| | if (mag >= thresh) |
| | t->planes[(size_t)p * chunks + c] |= bit; |
| | else |
| | break; |
| | } |
| | } |
| | } |
| |
|
| | void lut_to_float(const LogUnaryTensor *t, float *out) { |
| | int dim = t->dim; |
| | int np = t->n_planes; |
| | int bias = t->bias; |
| | int chunks = t->chunks; |
| |
|
| | memset(out, 0, dim * sizeof(float)); |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | |
| | int highest = -1; |
| | for (int p = np - 1; p >= 0; p--) { |
| | if (t->planes[(size_t)p * chunks + c] & bit) { |
| | highest = p; |
| | break; |
| | } |
| | } |
| |
|
| | if (highest < 0) { |
| | out[i] = 0.0f; |
| | } else { |
| | |
| | |
| | float val = t->base_scale * ldexpf(1.0f, highest - bias); |
| | if (highest < np - 1) { |
| | float next = t->base_scale * ldexpf(1.0f, highest + 1 - bias); |
| | val = (val + next) * 0.5f; |
| | } |
| | out[i] = (t->sign[c] & bit) ? -val : val; |
| | } |
| | } |
| | } |
| |
|
| | |
| | void lum_from_float(LogUnaryMatrix *m, const float *data) { |
| | int rows = m->rows, cols = m->cols; |
| | int np = m->n_planes, bias = m->bias; |
| | int chunks = m->chunks; |
| |
|
| | memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| | memset(m->planes, 0, (size_t)np * rows * chunks * sizeof(uint64_t)); |
| |
|
| | for (int r = 0; r < rows; r++) { |
| | const float *row = data + (size_t)r * cols; |
| |
|
| | |
| | float amax = 0.0f; |
| | for (int j = 0; j < cols; j++) { |
| | float a = fabsf(row[j]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { m->row_scales[r] = 1.0f; continue; } |
| | m->row_scales[r] = amax / ldexpf(1.0f, np - 1 - bias); |
| |
|
| | uint64_t *row_sign = m->sign + (size_t)r * chunks; |
| |
|
| | for (int j = 0; j < cols; j++) { |
| | int c = j / 64; |
| | uint64_t bit = 1ULL << (j % 64); |
| |
|
| | if (row[j] < 0.0f) row_sign[c] |= bit; |
| |
|
| | float mag = fabsf(row[j]); |
| | for (int p = 0; p < np; p++) { |
| | float thresh = m->row_scales[r] * ldexpf(1.0f, p - bias); |
| | if (mag >= thresh) |
| | m->planes[((size_t)p * rows + r) * chunks + c] |= bit; |
| | else |
| | break; |
| | } |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void lum_matvec( |
| | const LogUnaryMatrix *M, |
| | const LogUnaryTensor *x, |
| | LogUnaryTensor *y_out |
| | ) { |
| | int out_dim = M->rows; |
| | int chunks = M->chunks; |
| | int wp = M->n_planes; |
| | int xp = x->n_planes; |
| | int w_bias = M->bias; |
| | int x_bias = x->bias; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | int base_shift = w_bias + x_bias; |
| |
|
| | |
| | |
| |
|
| | float *y_float = (float *)aligned_alloc(64, out_dim * sizeof(float)); |
| |
|
| | #pragma omp parallel for schedule(dynamic, 32) |
| | for (int i = 0; i < out_dim; i++) { |
| | const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| | long long acc = 0; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t ws = w_sign_row[c]; |
| | uint64_t xs = x->sign[c]; |
| | uint64_t same = ~(ws ^ xs); |
| | uint64_t diff = ws ^ xs; |
| |
|
| | for (int p = 0; p < wp; p++) { |
| | uint64_t w_plane = M->planes[((size_t)p * out_dim + i) * chunks + c]; |
| |
|
| | for (int q = 0; q < xp; q++) { |
| | uint64_t x_plane = x->planes[(size_t)q * chunks + c]; |
| | uint64_t active = w_plane & x_plane; |
| | uint64_t pos = active & same; |
| | uint64_t neg = active & diff; |
| |
|
| | int count = __builtin_popcountll(pos) - __builtin_popcountll(neg); |
| |
|
| | |
| | int shift = p + q; |
| | if (count != 0) |
| | acc += (long long)count << shift; |
| | } |
| | } |
| | } |
| |
|
| | |
| | y_float[i] = (float)acc * M->row_scales[i] * x->base_scale |
| | * ldexpf(1.0f, -base_shift); |
| | } |
| |
|
| | |
| | lut_from_float(y_out, y_float); |
| | free(y_float); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void lut_add(const LogUnaryTensor *a, const LogUnaryTensor *b, LogUnaryTensor *out) { |
| | int dim = a->dim; |
| | float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | float *fb = (float *)aligned_alloc(64, dim * sizeof(float)); |
| |
|
| | lut_to_float(a, fa); |
| | lut_to_float(b, fb); |
| |
|
| | for (int i = 0; i < dim; i++) fa[i] += fb[i]; |
| |
|
| | lut_from_float(out, fa); |
| | free(fa); free(fb); |
| | } |
| |
|
| | |
| | void lut_add_float(LogUnaryTensor *a, const float *b) { |
| | int dim = a->dim; |
| | float *fa = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | lut_to_float(a, fa); |
| | for (int i = 0; i < dim; i++) fa[i] += b[i]; |
| | lut_from_float(a, fa); |
| | free(fa); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | void lut_rmsnorm( |
| | const LogUnaryTensor *x, |
| | const float *weight, |
| | LogUnaryTensor *out, |
| | float eps |
| | ) { |
| | int dim = x->dim; |
| | float *xf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | lut_to_float(x, xf); |
| |
|
| | float ss = 0.0f; |
| | for (int i = 0; i < dim; i++) ss += xf[i] * xf[i]; |
| | float rms = 1.0f / sqrtf(ss / dim + eps); |
| |
|
| | for (int i = 0; i < dim; i++) xf[i] = xf[i] * rms * weight[i]; |
| |
|
| | lut_from_float(out, xf); |
| | free(xf); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | void lut_silu_mul( |
| | const LogUnaryTensor *gate, |
| | const LogUnaryTensor *up, |
| | LogUnaryTensor *out |
| | ) { |
| | int dim = gate->dim; |
| | float *gf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | float *uf = (float *)aligned_alloc(64, dim * sizeof(float)); |
| |
|
| | lut_to_float(gate, gf); |
| | lut_to_float(up, uf); |
| |
|
| | for (int i = 0; i < dim; i++) |
| | gf[i] = (gf[i] / (1.0f + expf(-gf[i]))) * uf[i]; |
| |
|
| | lut_from_float(out, gf); |
| | free(gf); free(uf); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | void lut_rope(LogUnaryTensor *t, int offset, int start, int head_dim, float theta) { |
| | |
| | float *f = (float *)aligned_alloc(64, head_dim * sizeof(float)); |
| |
|
| | |
| | float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| | lut_to_float(t, full); |
| | memcpy(f, full + start, head_dim * sizeof(float)); |
| |
|
| | for (int i = 0; i < head_dim; i += 2) { |
| | float freq = 1.0f / powf(theta, (float)i / head_dim); |
| | float angle = offset * freq; |
| | float c = cosf(angle), s = sinf(angle); |
| | float v0 = f[i], v1 = f[i + 1]; |
| | f[i] = v0 * c - v1 * s; |
| | f[i + 1] = v0 * s + v1 * c; |
| | } |
| |
|
| | memcpy(full + start, f, head_dim * sizeof(float)); |
| | lut_from_float(t, full); |
| | free(f); free(full); |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | void lut_to_float_slice(const LogUnaryTensor *t, int start, int len, float *out) { |
| | float *full = (float *)aligned_alloc(64, t->dim * sizeof(float)); |
| | lut_to_float(t, full); |
| | memcpy(out, full + start, len * sizeof(float)); |
| | free(full); |
| | } |
| |
|
| | |
| | |
| | |
/* Results from lum_bench_matvec (see that function for how each is computed). */
typedef struct {
    double total_and_ops;       /* estimated 64-bit AND op count */
    double total_popcount_ops;  /* estimated popcount op count */
    double wall_time_s;         /* mean wall-clock seconds per matvec call */
    double elements_per_sec;    /* matrix elements processed per second */
    double gops;                /* billions of bit-ops per second */
} BenchResult;
| |
|
| | BenchResult lum_bench_matvec(int rows, int cols, int w_planes, int x_planes, int bias, int iters) { |
| | LogUnaryMatrix *M = lum_alloc(rows, cols, w_planes, bias); |
| | LogUnaryTensor *x = lut_alloc(cols, x_planes, bias); |
| | LogUnaryTensor *y = lut_alloc(rows, x_planes, bias); |
| |
|
| | |
| | for (size_t i = 0; i < (size_t)rows * M->chunks; i++) |
| | M->sign[i] = ((uint64_t)rand() << 32) | rand(); |
| | for (size_t i = 0; i < (size_t)w_planes * rows * M->chunks; i++) |
| | M->planes[i] = ((uint64_t)rand() << 32) | rand(); |
| | for (int i = 0; i < rows; i++) M->row_scales[i] = 1.0f; |
| | for (size_t i = 0; i < (size_t)x->chunks; i++) |
| | x->sign[i] = ((uint64_t)rand() << 32) | rand(); |
| | for (size_t i = 0; i < (size_t)x_planes * x->chunks; i++) |
| | x->planes[i] = ((uint64_t)rand() << 32) | rand(); |
| | x->base_scale = 1.0f; |
| |
|
| | |
| | lum_matvec(M, x, y); |
| |
|
| | struct timespec t0, t1; |
| | clock_gettime(CLOCK_MONOTONIC, &t0); |
| | for (int i = 0; i < iters; i++) |
| | lum_matvec(M, x, y); |
| | clock_gettime(CLOCK_MONOTONIC, &t1); |
| |
|
| | double dt = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9; |
| | int chunks = M->chunks; |
| | double ops_per_call = (double)rows * chunks * w_planes * x_planes * 2; |
| |
|
| | BenchResult r; |
| | r.wall_time_s = dt / iters; |
| | r.total_and_ops = ops_per_call; |
| | r.total_popcount_ops = ops_per_call; |
| | r.elements_per_sec = (double)rows * cols * iters / dt; |
| | r.gops = ops_per_call * iters / dt / 1e9; |
| |
|
| | lum_free(M); lut_free(x); lut_free(y); |
| | return r; |
| | } |
| |
|
| | |
| | |
| | |
/* Round-trip quantization quality metrics from lut_accuracy_test. */
typedef struct {
    float max_error;   /* max |original - recovered| over all elements */
    float mean_error;  /* mean absolute error */
    float cosine_sim;  /* cosine similarity of original vs recovered */
    float snr_db;      /* signal-to-noise ratio, decibels */
} AccuracyResult;
| |
|
| | AccuracyResult lut_accuracy_test(int dim, int n_planes, int bias) { |
| | float *original = (float *)aligned_alloc(64, dim * sizeof(float)); |
| | float *recovered = (float *)aligned_alloc(64, dim * sizeof(float)); |
| |
|
| | |
| | for (int i = 0; i < dim; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | original[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| |
|
| | LogUnaryTensor *t = lut_alloc(dim, n_planes, bias); |
| | lut_from_float(t, original); |
| | lut_to_float(t, recovered); |
| |
|
| | float max_err = 0, sum_err = 0; |
| | float dot = 0, na = 0, nb = 0; |
| | for (int i = 0; i < dim; i++) { |
| | float err = fabsf(original[i] - recovered[i]); |
| | if (err > max_err) max_err = err; |
| | sum_err += err; |
| | dot += original[i] * recovered[i]; |
| | na += original[i] * original[i]; |
| | nb += recovered[i] * recovered[i]; |
| | } |
| |
|
| | float noise_power = 0; |
| | for (int i = 0; i < dim; i++) { |
| | float e = original[i] - recovered[i]; |
| | noise_power += e * e; |
| | } |
| |
|
| | AccuracyResult r; |
| | r.max_error = max_err; |
| | r.mean_error = sum_err / dim; |
| | r.cosine_sim = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| | r.snr_db = 10.0f * log10f(na / (noise_power + 1e-10f)); |
| |
|
| | lut_free(t); |
| | free(original); free(recovered); |
| | return r; |
| | } |
| |
|