| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #define _POSIX_C_SOURCE 199309L |
| | #include <immintrin.h> |
| | #include <omp.h> |
| | #include <stdint.h> |
| | #include <stdlib.h> |
| | #include <string.h> |
| | #include <math.h> |
| | #include <stdio.h> |
| | #include <time.h> |
| |
|
| | |
| | |
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *unary; |
| | float scale; |
| | int dim; |
| | int chunks; |
| | int K; |
| | } TrueUnaryVec; |
| |
|
| | |
| | typedef struct { |
| | uint64_t *sign; |
| | uint64_t *unary; |
| | float *scales; |
| | int rows; |
| | int cols; |
| | int chunks; |
| | int K; |
| | } TrueUnaryMat; |
| |
|
| | |
| | |
| | |
| | TrueUnaryVec* tuv_alloc(int dim, int K) { |
| | TrueUnaryVec *v = (TrueUnaryVec *)calloc(1, sizeof(TrueUnaryVec)); |
| | v->dim = dim; |
| | v->K = K; |
| | v->chunks = (dim + 63) / 64; |
| | v->scale = 1.0f; |
| | v->sign = (uint64_t *)aligned_alloc(64, v->chunks * sizeof(uint64_t)); |
| | v->unary = (uint64_t *)aligned_alloc(64, (size_t)K * v->chunks * sizeof(uint64_t)); |
| | memset(v->sign, 0, v->chunks * sizeof(uint64_t)); |
| | memset(v->unary, 0, (size_t)K * v->chunks * sizeof(uint64_t)); |
| | return v; |
| | } |
| |
|
| | TrueUnaryMat* tum_alloc(int rows, int cols, int K) { |
| | TrueUnaryMat *m = (TrueUnaryMat *)calloc(1, sizeof(TrueUnaryMat)); |
| | m->rows = rows; |
| | m->cols = cols; |
| | m->K = K; |
| | m->chunks = (cols + 63) / 64; |
| | m->sign = (uint64_t *)aligned_alloc(64, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | m->unary = (uint64_t *)aligned_alloc(64, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | m->scales = (float *)aligned_alloc(64, rows * sizeof(float)); |
| | memset(m->sign, 0, (size_t)rows * m->chunks * sizeof(uint64_t)); |
| | memset(m->unary, 0, (size_t)K * rows * m->chunks * sizeof(uint64_t)); |
| | for (int i = 0; i < rows; i++) m->scales[i] = 1.0f; |
| | return m; |
| | } |
| |
|
| | void tuv_free(TrueUnaryVec *v) { |
| | if (v) { free(v->sign); free(v->unary); free(v); } |
| | } |
| | void tum_free(TrueUnaryMat *m) { |
| | if (m) { free(m->sign); free(m->unary); free(m->scales); free(m); } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void tuv_from_float(TrueUnaryVec *v, const float *x) { |
| | int dim = v->dim, K = v->K, chunks = v->chunks; |
| |
|
| | memset(v->sign, 0, chunks * sizeof(uint64_t)); |
| | memset(v->unary, 0, (size_t)K * chunks * sizeof(uint64_t)); |
| |
|
| | float amax = 0.0f; |
| | for (int i = 0; i < dim; i++) { |
| | float a = fabsf(x[i]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { v->scale = 1.0f; return; } |
| | v->scale = amax / K; |
| |
|
| | float inv = K / amax; |
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | if (x[i] < 0.0f) v->sign[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(x[i]) * inv + 0.5f); |
| | if (mag > K) mag = K; |
| |
|
| | |
| | for (int p = 0; p < mag; p++) |
| | v->unary[(size_t)p * chunks + c] |= bit; |
| | } |
| | } |
| |
|
| | void tuv_to_float(const TrueUnaryVec *v, float *out) { |
| | int dim = v->dim, K = v->K, chunks = v->chunks; |
| |
|
| | for (int i = 0; i < dim; i++) { |
| | int c = i / 64; |
| | uint64_t bit = 1ULL << (i % 64); |
| |
|
| | |
| | int mag = 0; |
| | for (int p = 0; p < K; p++) { |
| | if (v->unary[(size_t)p * chunks + c] & bit) |
| | mag++; |
| | } |
| |
|
| | float val = (float)mag * v->scale; |
| | out[i] = (v->sign[c] & bit) ? -val : val; |
| | } |
| | } |
| |
|
| | void tum_from_float(TrueUnaryMat *m, const float *data) { |
| | int rows = m->rows, cols = m->cols, K = m->K, chunks = m->chunks; |
| |
|
| | memset(m->sign, 0, (size_t)rows * chunks * sizeof(uint64_t)); |
| | memset(m->unary, 0, (size_t)K * rows * chunks * sizeof(uint64_t)); |
| |
|
| | for (int r = 0; r < rows; r++) { |
| | const float *row = data + (size_t)r * cols; |
| |
|
| | float amax = 0.0f; |
| | for (int j = 0; j < cols; j++) { |
| | float a = fabsf(row[j]); |
| | if (a > amax) amax = a; |
| | } |
| | if (amax == 0.0f) { m->scales[r] = 1.0f; continue; } |
| | m->scales[r] = amax / K; |
| | float inv = K / amax; |
| |
|
| | uint64_t *row_sign = m->sign + (size_t)r * chunks; |
| |
|
| | for (int j = 0; j < cols; j++) { |
| | int c = j / 64; |
| | uint64_t bit = 1ULL << (j % 64); |
| |
|
| | if (row[j] < 0.0f) row_sign[c] |= bit; |
| |
|
| | int mag = (int)(fabsf(row[j]) * inv + 0.5f); |
| | if (mag > K) mag = K; |
| |
|
| | for (int p = 0; p < mag; p++) |
| | m->unary[((size_t)p * rows + r) * chunks + c] |= bit; |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void tum_matvec( |
| | const TrueUnaryMat *M, |
| | const TrueUnaryVec *x, |
| | float *y_out |
| | ) { |
| | int out_dim = M->rows; |
| | int chunks = M->chunks; |
| | int wK = M->K; |
| | int xK = x->K; |
| |
|
| | #pragma omp parallel for schedule(dynamic, 32) |
| | for (int i = 0; i < out_dim; i++) { |
| | const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| | long long acc = 0; |
| |
|
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t ws = w_sign_row[c]; |
| | uint64_t xs = x->sign[c]; |
| | uint64_t same = ~(ws ^ xs); |
| | uint64_t diff = ws ^ xs; |
| |
|
| | |
| | |
| | |
| | |
| | |
| | for (int p = 0; p < wK; p++) { |
| | uint64_t wp = M->unary[((size_t)p * out_dim + i) * chunks + c]; |
| |
|
| | for (int q = 0; q < xK; q++) { |
| | uint64_t xq = x->unary[(size_t)q * chunks + c]; |
| | uint64_t active = wp & xq; |
| | acc += __builtin_popcountll(active & same) |
| | - __builtin_popcountll(active & diff); |
| | } |
| | } |
| | } |
| |
|
| | |
| | y_out[i] = (float)acc * M->scales[i] * x->scale; |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | void tum_matvec_fast( |
| | const TrueUnaryMat *M, |
| | const TrueUnaryVec *x, |
| | float *y_out |
| | ) { |
| | int out_dim = M->rows; |
| | int cols = M->cols; |
| | int chunks = M->chunks; |
| | int xK = x->K; |
| |
|
| | |
| | |
| | |
| | |
| | int16_t *x_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t)); |
| | memset(x_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t)); |
| |
|
| | for (int q = 0; q < xK; q++) { |
| | const uint64_t *xplane = x->unary + (size_t)q * chunks; |
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t bits = xplane[c]; |
| | while (bits) { |
| | int bit = __builtin_ctzll(bits); |
| | int j = c * 64 + bit; |
| | if (j < cols) x_mag[j]++; |
| | bits &= bits - 1; |
| | } |
| | } |
| | } |
| |
|
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | #pragma omp parallel |
| | { |
| | int16_t *w_mag = (int16_t *)aligned_alloc(64, ((cols + 15) & ~15) * sizeof(int16_t)); |
| |
|
| | #pragma omp for schedule(dynamic, 32) |
| | for (int i = 0; i < out_dim; i++) { |
| | memset(w_mag, 0, ((cols + 15) & ~15) * sizeof(int16_t)); |
| |
|
| | |
| | for (int p = 0; p < M->K; p++) { |
| | const uint64_t *wplane = M->unary + ((size_t)p * out_dim + i) * chunks; |
| | for (int c = 0; c < chunks; c++) { |
| | uint64_t bits = wplane[c]; |
| | while (bits) { |
| | int bit = __builtin_ctzll(bits); |
| | int j = c * 64 + bit; |
| | if (j < cols) w_mag[j]++; |
| | bits &= bits - 1; |
| | } |
| | } |
| | } |
| |
|
| | |
| | const uint64_t *w_sign_row = M->sign + (size_t)i * chunks; |
| | long long acc = 0; |
| |
|
| | for (int j = 0; j < cols; j++) { |
| | int c = j / 64; |
| | uint64_t bit = 1ULL << (j % 64); |
| | int same_sign = !((w_sign_row[c] ^ x->sign[c]) & bit); |
| | int product = (int)w_mag[j] * (int)x_mag[j]; |
| | acc += same_sign ? product : -product; |
| | } |
| |
|
| | y_out[i] = (float)acc * M->scales[i] * x->scale; |
| | } |
| |
|
| | free(w_mag); |
| | } |
| |
|
| | free(x_mag); |
| | } |
| |
|
| | |
| | |
| | |
| | typedef struct { |
| | float cosine; |
| | float snr_db; |
| | float max_rel_err; |
| | double ms_naive; |
| | double ms_fast; |
| | double gops_naive; |
| | double gops_fast; |
| | } TestResult; |
| |
|
| | TestResult tum_test(int rows, int cols, int wK, int xK, int iters) { |
| | TestResult r = {0}; |
| | srand(42); |
| |
|
| | |
| | float *Mf = (float *)malloc((size_t)rows * cols * sizeof(float)); |
| | float *xf = (float *)malloc(cols * sizeof(float)); |
| | float *y_ref = (float *)calloc(rows, sizeof(float)); |
| | float *y_naive = (float *)malloc(rows * sizeof(float)); |
| | float *y_fast = (float *)malloc(rows * sizeof(float)); |
| |
|
| | for (size_t i = 0; i < (size_t)rows * cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < cols; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| |
|
| | |
| | for (int i = 0; i < rows; i++) |
| | for (int j = 0; j < cols; j++) |
| | y_ref[i] += Mf[(size_t)i * cols + j] * xf[j]; |
| |
|
| | |
| | TrueUnaryMat *M = tum_alloc(rows, cols, wK); |
| | TrueUnaryVec *x = tuv_alloc(cols, xK); |
| | tum_from_float(M, Mf); |
| | tuv_from_float(x, xf); |
| |
|
| | |
| | struct timespec t0, t1; |
| | tum_matvec(M, x, y_naive); |
| | clock_gettime(CLOCK_MONOTONIC, &t0); |
| | for (int i = 0; i < iters; i++) |
| | tum_matvec(M, x, y_naive); |
| | clock_gettime(CLOCK_MONOTONIC, &t1); |
| | r.ms_naive = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters; |
| |
|
| | |
| | tum_matvec_fast(M, x, y_fast); |
| | clock_gettime(CLOCK_MONOTONIC, &t0); |
| | for (int i = 0; i < iters; i++) |
| | tum_matvec_fast(M, x, y_fast); |
| | clock_gettime(CLOCK_MONOTONIC, &t1); |
| | r.ms_fast = ((t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) * 1e-6) / iters; |
| |
|
| | |
| | float dot = 0, na = 0, nb = 0, max_re = 0; |
| | for (int i = 0; i < rows; i++) { |
| | dot += y_ref[i] * y_naive[i]; |
| | na += y_ref[i] * y_ref[i]; |
| | nb += y_naive[i] * y_naive[i]; |
| | float re = fabsf(y_ref[i] - y_naive[i]) / (fabsf(y_ref[i]) + 1e-8f); |
| | if (re > max_re) max_re = re; |
| | } |
| | r.cosine = dot / (sqrtf(na) * sqrtf(nb) + 1e-10f); |
| | float noise = 0; |
| | for (int i = 0; i < rows; i++) { |
| | float e = y_ref[i] - y_naive[i]; noise += e * e; |
| | } |
| | r.snr_db = 10.0f * log10f(na / (noise + 1e-10f)); |
| | r.max_rel_err = max_re; |
| |
|
| | |
| | float fast_err = 0; |
| | for (int i = 0; i < rows; i++) { |
| | float e = fabsf(y_naive[i] - y_fast[i]); |
| | if (e > fast_err) fast_err = e; |
| | } |
| | if (fast_err > 0.01f) |
| | printf(" WARNING: naive vs fast max diff = %.4f\n", fast_err); |
| |
|
| | double ops = 2.0 * rows * cols; |
| | r.gops_naive = ops * iters / (r.ms_naive * iters * 1e6); |
| | r.gops_fast = ops * iters / (r.ms_fast * iters * 1e6); |
| |
|
| | tum_free(M); tuv_free(x); |
| | free(Mf); free(xf); free(y_ref); free(y_naive); free(y_fast); |
| | return r; |
| | } |
| |
|
| | |
| | |
| | |
| | int main() { |
| | printf("=== TRUE UNARY (BASE-1) TENSOR TESTS ===\n"); |
| | printf("Every bit has weight 1. Value = count of ones.\n"); |
| | printf("Matmul = AND + popcount, no weighting.\n\n"); |
| |
|
| | |
| | int rows = 4096, cols = 2560; |
| | printf("Matrix: %d x %d (Qwen3-4B q_proj equivalent)\n\n", rows, cols); |
| |
|
| | printf("%4s %4s | %8s %8s %8s | %8s %8s | %8s %8s | %s\n", |
| | "wK", "xK", "Cosine", "SNR_dB", "MaxRelE", |
| | "Naive_ms", "Fast_ms", "GOPS_n", "GOPS_f", "Memory"); |
| |
|
| | struct { int wK; int xK; } configs[] = { |
| | {8, 4}, |
| | {8, 8}, |
| | {16, 8}, |
| | {16, 16}, |
| | {32, 8}, |
| | {32, 16}, |
| | {32, 32}, |
| | {64, 16}, |
| | {64, 32}, |
| | }; |
| | int n = sizeof(configs) / sizeof(configs[0]); |
| |
|
| | for (int c = 0; c < n; c++) { |
| | int wK = configs[c].wK; |
| | int xK = configs[c].xK; |
| | int iters = (wK <= 16 && xK <= 16) ? 3 : 1; |
| |
|
| | TestResult r = tum_test(rows, cols, wK, xK, iters); |
| |
|
| | |
| | size_t sign_bytes = (size_t)rows * ((cols+63)/64) * 8; |
| | size_t unary_bytes = (size_t)wK * rows * ((cols+63)/64) * 8; |
| | size_t scale_bytes = rows * 4; |
| | double mb = (sign_bytes + unary_bytes + scale_bytes) / 1e6; |
| |
|
| | printf("%4d %4d | %8.6f %8.1f %8.4f | %8.1f %8.1f | %8.1f %8.1f | %.0fMB\n", |
| | wK, xK, r.cosine, r.snr_db, r.max_rel_err, |
| | r.ms_naive, r.ms_fast, r.gops_naive, r.gops_fast, mb); |
| | } |
| |
|
| | |
| | printf("\n--- Sample values for wK=32 xK=16 (512x2560) ---\n"); |
| | { |
| | int sr = 512, sc = 2560; |
| | srand(42); |
| | float *Mf = (float *)malloc((size_t)sr * sc * sizeof(float)); |
| | float *xf = (float *)malloc(sc * sizeof(float)); |
| | float *y_ref = (float *)calloc(sr, sizeof(float)); |
| | float *y_unary = (float *)malloc(sr * sizeof(float)); |
| |
|
| | for (size_t i = 0; i < (size_t)sr * sc; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | Mf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < sc; i++) { |
| | float u1 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | float u2 = (float)(rand() + 1) / (RAND_MAX + 1.0f); |
| | xf[i] = sqrtf(-2.0f * logf(u1)) * cosf(6.2832f * u2); |
| | } |
| | for (int i = 0; i < sr; i++) |
| | for (int j = 0; j < sc; j++) |
| | y_ref[i] += Mf[(size_t)i * sc + j] * xf[j]; |
| |
|
| | TrueUnaryMat *M = tum_alloc(sr, sc, 32); |
| | TrueUnaryVec *x = tuv_alloc(sc, 16); |
| | tum_from_float(M, Mf); |
| | tuv_from_float(x, xf); |
| | tum_matvec(M, x, y_unary); |
| |
|
| | printf("%8s %8s %8s\n", "Ref", "Unary", "Error"); |
| | for (int i = 0; i < 10; i++) |
| | printf("%8.3f %8.3f %8.3f\n", y_ref[i], y_unary[i], y_ref[i] - y_unary[i]); |
| |
|
| | tum_free(M); tuv_free(x); |
| | free(Mf); free(xf); free(y_ref); free(y_unary); |
| | } |
| |
|
| | printf("\n=== DONE ===\n"); |
| | return 0; |
| | } |
| |
|