| | #define GGML_COMMON_IMPL_CPP
|
| | #define GGML_COMMON_DECL_CPP
|
| | #include "ggml-common.h"
|
| | #include "ggml-backend-impl.h"
|
| |
|
| | #include "ggml-impl.h"
|
| | #include "ggml-cpu.h"
|
| | #include "ggml-cpu-impl.h"
|
| | #include "simd-mappings.h"
|
| | #include "traits.h"
|
| |
|
| | #include "arch-fallback.h"
|
| |
|
| | #include <cmath>
|
| | #include <cstring>
|
| | #include <cassert>
|
| | #include <cstdio>
|
| |
|
| | #include "repack.h"
|
| |
|
| | #if defined(__GNUC__)
|
| | #pragma GCC diagnostic ignored "-Woverlength-strings"
|
| | #endif
|
| |
|
| | #define UNUSED GGML_UNUSED
|
| |
|
// Rounds `fval` to the nearest integer (ties go to the even integer under the
// default floating-point rounding mode) without calling a rounding routine.
//
// Adding 12582912.0f (1.5 * 2^23) moves the value into a range where one float
// ulp equals 1, so the rounded integer ends up in the low 23 mantissa bits,
// biased by 0x00400000.  The input magnitude must not exceed 4194303.
static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);

    const float shifted = fval + 12582912.f;

    // Reinterpret the float's bit pattern; memcpy avoids aliasing problems.
    int bits;
    memcpy(&bits, &shifted, sizeof(int));

    const int mantissa = bits & 0x007fffff;
    return mantissa - 0x00400000;
}
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | extern "C" {
|
| |
|
| | void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
| | assert(QK8_0 == 32);
|
| | assert(k % QK8_0 == 0);
|
| | const int nb = k / QK8_0;
|
| |
|
| | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
|
| |
|
| |
|
| | const int blck_size_interleave = 4;
|
| | float srcv[4][QK8_0];
|
| | float id[4];
|
| |
|
| | for (int i = 0; i < nb; i++) {
|
| | for (int row_iter = 0; row_iter < 4; row_iter++) {
|
| | float amax = 0.0f;
|
| |
|
| | for (int j = 0; j < QK8_0; j++) {
|
| | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
| | amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
| | }
|
| |
|
| | const float d = amax / ((1 << 7) - 1);
|
| | id[row_iter] = d ? 1.0f / d : 0.0f;
|
| |
|
| | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
| | }
|
| |
|
| | for (int j = 0; j < QK8_0 * 4; j++) {
|
| | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
| | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
| | src_offset += (j % blck_size_interleave);
|
| |
|
| | float x0 = srcv[src_id][src_offset] * id[src_id];
|
| | y[i].qs[j] = roundf(x0);
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
| | assert(QK8_0 == 32);
|
| | assert(k % QK8_0 == 0);
|
| | const int nb = k / QK8_0;
|
| |
|
| | block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy;
|
| |
|
| |
|
| | const int blck_size_interleave = 8;
|
| | float srcv[4][QK8_0];
|
| | float id[4];
|
| |
|
| | for (int i = 0; i < nb; i++) {
|
| | for (int row_iter = 0; row_iter < 4; row_iter++) {
|
| | float amax = 0.0f;
|
| |
|
| | for (int j = 0; j < QK8_0; j++) {
|
| | srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j];
|
| | amax = MAX(amax, fabsf(srcv[row_iter][j]));
|
| | }
|
| |
|
| | const float d = amax / ((1 << 7) - 1);
|
| | id[row_iter] = d ? 1.0f / d : 0.0f;
|
| |
|
| | y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d);
|
| | }
|
| |
|
| | for (int j = 0; j < QK8_0 * 4; j++) {
|
| | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
| | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
| | src_offset += (j % blck_size_interleave);
|
| |
|
| | float x0 = srcv[src_id][src_offset] * id[src_id];
|
| | y[i].qs[j] = roundf(x0);
|
| | }
|
| | }
|
| | }
|
| |
|
| |
|
| | void ggml_quantize_mat_q8_K_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
| | assert(QK_K == 256);
|
| | assert(k % QK_K == 0);
|
| | const int nb = k / QK_K;
|
| |
|
| | block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
| |
|
| |
|
| | const int blck_size_interleave = 4;
|
| | float srcv[4][QK_K];
|
| | float iscale[4];
|
| |
|
| | for (int i = 0; i < nb; i++) {
|
| | for (int row_iter = 0; row_iter < 4; row_iter++) {
|
| | float amax = 0.0f;
|
| | float max = 0;
|
| |
|
| | for (int j = 0; j < QK_K; j++) {
|
| | srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
| |
|
| | if(amax < fabsf(srcv[row_iter][j])) {
|
| | amax = fabsf(srcv[row_iter][j]);
|
| | max = srcv[row_iter][j];
|
| | }
|
| | }
|
| |
|
| | iscale[row_iter] = amax ? -127.f/max : 0;
|
| |
|
| | y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
| | }
|
| |
|
| | for (int j = 0; j < QK_K / 4; j++) {
|
| | y[i].bsums[j] = 0;
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| | for (int j = 0; j < QK_K * 4; j++) {
|
| | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
| | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
| | src_offset += (j % blck_size_interleave);
|
| | int index = (((j & 15) >> 2) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
|
| |
|
| | float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
| | y[i].qs[j] = nearest_int(x0);
|
| | y[i].bsums[index] += y[i].qs[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) {
|
| | assert(QK_K == 256);
|
| | assert(k % QK_K == 0);
|
| | const int nb = k / QK_K;
|
| |
|
| | block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy;
|
| |
|
| |
|
| | const int blck_size_interleave = 8;
|
| | float srcv[4][QK_K];
|
| | float iscale[4];
|
| |
|
| | for (int i = 0; i < nb; i++) {
|
| | for (int row_iter = 0; row_iter < 4; row_iter++) {
|
| | float amax = 0.0f;
|
| | float max = 0;
|
| |
|
| | for (int j = 0; j < QK_K; j++) {
|
| | srcv[row_iter][j] = x[row_iter * k + i * QK_K + j];
|
| |
|
| | if(amax < fabsf(srcv[row_iter][j])) {
|
| | amax = fabsf(srcv[row_iter][j]);
|
| | max = srcv[row_iter][j];
|
| | }
|
| | }
|
| |
|
| | iscale[row_iter] = amax ? -127.f/max : 0;
|
| |
|
| | y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0;
|
| | }
|
| |
|
| | for (int j = 0; j < QK_K / 4; j++) {
|
| | y[i].bsums[j] = 0;
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| | for (int j = 0; j < QK_K * 4; j++) {
|
| | int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave;
|
| | int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave;
|
| | src_offset += (j % blck_size_interleave);
|
| | int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3);
|
| |
|
| | float x0 = srcv[src_id][src_offset] * iscale[src_id];
|
| | y[i].qs[j] = nearest_int(x0);
|
| | y[i].bsums[index] += y[i].qs[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | }
|
| |
|
| | template <int64_t INTER_SIZE, ggml_type PARAM_TYPE>
|
| | void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row);
|
| |
|
| | template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
| | assert(nrow == 4);
|
| | UNUSED(nrow);
|
| | ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
|
| | }
|
| |
|
| | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
| | assert(nrow == 4);
|
| | UNUSED(nrow);
|
| | ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
|
| | }
|
| |
|
| | template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
| | assert(nrow == 4);
|
| | UNUSED(nrow);
|
| | ggml_quantize_mat_q8_K_4x4(x, vy, n_per_row);
|
| | }
|
| |
|
| | template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
|
| | assert(nrow == 4);
|
| | UNUSED(nrow);
|
| | ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
|
| | }
|
| |
|
| | template <int M, int N>
|
| | static void ggml_gemv_q6_K_NxM_q8_K_generic_impl(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | constexpr int blocklen = M;
|
| | constexpr int ncols_interleaved = N;
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int blocks_per_half = 64 / blocklen;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[8];
|
| |
|
| | const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0f;
|
| | }
|
| |
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
| | const int base_h = base_l + 64;
|
| |
|
| | const int scale_idx_l = base_l / 16;
|
| | const int scale_idx_h = base_h / 16;
|
| |
|
| | const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
| | const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
| |
|
| | const int qh_half_l = (base_l / 128) * 32;
|
| | const int qh_half_h = (base_h / 128) * 32;
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
| | const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
| |
|
| | int sumi_l = 0;
|
| | int sumi_h = 0;
|
| |
|
| | for (int i = 0; i < blocklen; i++) {
|
| | const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
| | const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
| | const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
| |
|
| | const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
| | const int qh_chunk_l = qh_idx_l / blocklen;
|
| | const int qh_pos_l = qh_idx_l % blocklen;
|
| | const int qh_offset_l = qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
| | const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
| |
|
| | const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
| | const int qh_chunk_h = qh_idx_h / blocklen;
|
| | const int qh_pos_h = qh_idx_h % blocklen;
|
| | const int qh_offset_h = qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
| | const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
| |
|
| | const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
| | const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
| |
|
| | const int8_t a_l = a_ptr[l].qs[base_l + i];
|
| | const int8_t a_h = a_ptr[l].qs[base_h + i];
|
| |
|
| | sumi_l += q_l * a_l;
|
| | sumi_h += q_h * a_h;
|
| | }
|
| |
|
| | sumf[j] +=
|
| | (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | }
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | template <int M, int N>
|
| | static void ggml_gemm_q6_K_NxM_q8_K_generic_impl(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | constexpr int blocklen = M;
|
| | constexpr int ncols_interleaved = N;
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int blocks_per_half = 64 / blocklen;
|
| | const int q8_half_stride = 512;
|
| | const int q8_low_high_step = 256;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| |
|
| | float sumf[4][8];
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q6_Kx8 * b_ptr = (const block_q6_Kx8 *) vx + (x * nb);
|
| |
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0f;
|
| | }
|
| | }
|
| |
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | const int base_l = (k / blocks_per_half) * 128 + (k % blocks_per_half) * blocklen;
|
| | const int base_h = base_l + 64;
|
| |
|
| | const int scale_idx_l = base_l / 16;
|
| | const int scale_idx_h = base_h / 16;
|
| |
|
| | const int qh_shift_l = ((base_l % 128) / 32) * 2;
|
| | const int qh_shift_h = ((base_h % 128) / 32) * 2;
|
| |
|
| | const int qh_half_l = (base_l / 128) * 32;
|
| | const int qh_half_h = (base_h / 128) * 32;
|
| |
|
| | const int q8_base = (k / blocks_per_half) * q8_half_stride + (k % blocks_per_half) * (blocklen * 4);
|
| |
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | const int8_t scale_l = b_ptr[l].scales[scale_idx_l * ncols_interleaved + j];
|
| | const int8_t scale_h = b_ptr[l].scales[scale_idx_h * ncols_interleaved + j];
|
| |
|
| | int sumi_l = 0;
|
| | int sumi_h = 0;
|
| |
|
| | for (int i = 0; i < blocklen; i++) {
|
| | const int ql_pos = k * ncols_interleaved * blocklen + j * blocklen + i;
|
| | const int l_4 = b_ptr[l].ql[ql_pos] & 0xF;
|
| | const int hi_4 = (b_ptr[l].ql[ql_pos] >> 4) & 0xF;
|
| |
|
| | const int qh_idx_l = qh_half_l + ((base_l + i) % 32);
|
| | const int qh_chunk_l = qh_idx_l / blocklen;
|
| | const int qh_pos_l = qh_idx_l % blocklen;
|
| | const int qh_offset_l =
|
| | qh_chunk_l * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_l;
|
| | const int hi_2_l = (b_ptr[l].qh[qh_offset_l] >> qh_shift_l) & 0x3;
|
| |
|
| | const int qh_idx_h = qh_half_h + ((base_h + i) % 32);
|
| | const int qh_chunk_h = qh_idx_h / blocklen;
|
| | const int qh_pos_h = qh_idx_h % blocklen;
|
| | const int qh_offset_h =
|
| | qh_chunk_h * (blocklen * ncols_interleaved) + j * blocklen + qh_pos_h;
|
| | const int hi_2_h = (b_ptr[l].qh[qh_offset_h] >> qh_shift_h) & 0x3;
|
| |
|
| | const int q_l = ((hi_2_l << 4) | l_4) - 32;
|
| | const int q_h = ((hi_2_h << 4) | hi_4) - 32;
|
| |
|
| | const int8_t q8_l = a_ptr[l].qs[q8_base + m * blocklen + i];
|
| | const int8_t q8_h = a_ptr[l].qs[q8_base + m * blocklen + i + q8_low_high_step];
|
| |
|
| | sumi_l += q_l * q8_l;
|
| | sumi_h += q_h * q8_h;
|
| | }
|
| |
|
| | sumf[m][j] += (sumi_l * scale_l + sumi_h * scale_h) * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) *
|
| | a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | template <int M, int N>
|
| | static void ggml_gemv_q5_K_NxM_q8_K_generic_impl(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | constexpr int blocklen = M;
|
| | constexpr int ncols_interleaved = N;
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[ncols_interleaved];
|
| | float sum_minf[ncols_interleaved];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | sum_minf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | constexpr int scale_stride = 32;
|
| | uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
| | uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
| |
|
| | const int qh_shift = (k / (32 / blocklen)) * 2;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
| |
|
| | const int qh_idx = (k * blocklen + i) % 32;
|
| | const int qh_chunk = qh_idx / blocklen;
|
| | const int qh_pos = qh_idx % blocklen;
|
| | const int b_qh_offset = qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
| |
|
| | const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
| | const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
| | const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
| |
|
| | const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
| | const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
| |
|
| | const int q8_offset = (k / (32 / blocklen)) * 64 + (k % (32 / blocklen)) * blocklen + i;
|
| |
|
| | sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
| | sumi2 = (v1 * a_ptr[l].qs[q8_offset + 32]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) *
|
| | GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | template <int M, int N>
|
| | static void ggml_gemm_q5_K_NxM_q8_K_generic_impl(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | constexpr int blocklen = M;
|
| | constexpr int ncols_interleaved = N;
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][ncols_interleaved];
|
| | float sum_minf[4][ncols_interleaved];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q5_Kx8 * b_ptr = (const block_q5_Kx8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | sum_minf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * K_SCALE_SIZE, K_SCALE_SIZE);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | constexpr int scale_stride = 32;
|
| | uint8_t * scales_0 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride;
|
| | uint8_t * scales_1 = (uint8_t *) utmp + (k / (32 / blocklen)) * scale_stride + 16;
|
| |
|
| | const int qh_shift = (k / (32 / blocklen)) * 2;
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int b_qs_offset = k * ncols_interleaved * blocklen + j * blocklen + i;
|
| |
|
| | const int qh_idx = (k * blocklen + i) % 32;
|
| | const int qh_chunk = qh_idx / blocklen;
|
| | const int qh_pos = qh_idx % blocklen;
|
| | const int b_qh_offset =
|
| | qh_chunk * (blocklen * ncols_interleaved) + j * blocklen + qh_pos;
|
| |
|
| | const uint8_t qh_val = b_ptr[l].qh[b_qh_offset];
|
| | const uint8_t h0 = (qh_val >> qh_shift) & 1;
|
| | const uint8_t h1 = (qh_val >> (qh_shift + 1)) & 1;
|
| |
|
| | const int v0 = (int8_t) ((b_ptr[l].qs[b_qs_offset] & 0xF) | (h0 << 4));
|
| | const int v1 = (int8_t) ((b_ptr[l].qs[b_qs_offset] >> 4) | (h1 << 4));
|
| |
|
| | const int q8_offset = (k / (32 / blocklen)) * 256 +
|
| | (k % (32 / blocklen)) * 4 * blocklen + m * blocklen + i;
|
| |
|
| | sumi1 = (v0 * a_ptr[l].qs[q8_offset]);
|
| | sumi2 = (v1 * a_ptr[l].qs[q8_offset + 128]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
| | for (int m = 0; m < 4; m++) {
|
| | const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) *
|
| | GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | extern "C" {
|
| |
|
| | void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[8];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 4;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[8];
|
| | float sum_minf[8];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | sum_minf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
| | uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 64 + (k % 8) * blocklen + i + 32]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[8];
|
| | float sum_minf[8];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | const block_q8_K * a_ptr = (const block_q8_K *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | sum_minf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
| | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[8];
|
| | float sum_minf[8];
|
| | int sumi1,sumi2,sumi3,sumi4;
|
| | int sumi;
|
| |
|
| | const block_q8_K * a_ptr = (const block_q8_K *)vy;
|
| | for(int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | sum_minf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
| | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
| | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
| | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
| | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi3 = 0;
|
| | sumi4 = 0;
|
| | sumi = 0;
|
| | int offset = ((k / 2) % 2) + j * 2;
|
| | for (int i = 0; i < blocklen; ++i){
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
| | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
| | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
| | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 32]);
|
| | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 64]);
|
| | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 128 + (k % 4) * blocklen + i + 96]);
|
| |
|
| | sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
| | sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
| | sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
| | sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
| | sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
| | }
|
| | sumf[j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | for(int sb = 0; sb < 8; sb++) {
|
| | const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
| | for(int j = 0; j < ncols_interleaved; j++){
|
| | sum_minf[j] += ((mins[j * 2] >> 4) * a_ptr[l].bsums[sb * 2] + (mins[(j * 2)+ 1] >> 4) * a_ptr[l].bsums[sb * 2 + 1]) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d;
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemv_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemv_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemv_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| |
|
| | void ggml_gemv_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemv_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemv_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemv_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[8];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[8];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0;
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2]));
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q8_0_4x4_q8_0_generic(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / blocklen); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
| | sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemv_q8_0_4x8_q8_0_generic(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 8;
|
| |
|
| | assert(nr == 1);
|
| | assert(n % qk == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| | UNUSED(nr);
|
| |
|
| | float sumf[4];
|
| | int sumi;
|
| |
|
| | const block_q8_0 * a_ptr = (const block_q8_0 *) vy;
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
| |
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / blocklen); k++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
| | sumi += v0 * a_ptr[l].qs[k * blocklen + i];
|
| | }
|
| | sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d);
|
| | }
|
| | }
|
| | }
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[x * ncols_interleaved + j] = sumf[j];
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | {
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4][8];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0);
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q4_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 4;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4][8];
|
| | float sum_minf[4][8];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | sum_minf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | uint8_t * scales_0 = (uint8_t *) utmp + (k / 8) * 32;
|
| | uint8_t * scales_1 = (uint8_t *) utmp + (k / 8) * 32 + 16;
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k / 8) * 256 + (k % 8) * 4 * blocklen + m * blocklen + i + 128]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t * mins = (uint8_t *) utmp + 8 + sb * 16;
|
| | for(int m = 0; m < 4; m++) {
|
| | const int16_t * bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
| | for(int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| | static const uint32_t kmask1 = 0x3f3f3f3f;
|
| | static const uint32_t kmask2 = 0x0f0f0f0f;
|
| | static const uint32_t kmask3 = 0x03030303;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(bs);
|
| |
|
| | float sumf[4][8];
|
| | float sum_minf[4][8];
|
| | uint32_t utmp[32];
|
| | int sumi1;
|
| | int sumi2;
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | sum_minf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12);
|
| | utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4);
|
| | const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1;
|
| | utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4);
|
| | utmp[sb * 4 + 2] = uaux_0;
|
| | utmp[sb * 4 + 0] &= kmask1;
|
| | }
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32;
|
| | uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16;
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF);
|
| | const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
| | sumi1 = sumi1 * scales_0[j];
|
| | sumi2 = sumi2 * scales_1[j];
|
| | sumi += sumi1 + sumi2;
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | for (int sb = 0; sb < 8; sb++) {
|
| | uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16;
|
| | for(int m = 0; m < 4; m++) {
|
| | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
| | for(int j = 0; j < ncols_interleaved; j++) {
|
| | sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK_K;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | float sumf[4][8];
|
| | float sum_minf[4][8];
|
| | int sumi1, sumi2, sumi3, sumi4;
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q2_Kx8 * b_ptr = (const block_q2_Kx8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | sum_minf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (4 * blocklen)); k++) {
|
| |
|
| | const uint8_t *scales_0 = b_ptr[l].scales + (k / 4) * 64 ;
|
| | const uint8_t *scales_1 = b_ptr[l].scales + (k / 4) * 64 + 16;
|
| | const uint8_t *scales_2 = b_ptr[l].scales + (k / 4) * 64 + 32;
|
| | const uint8_t *scales_3 = b_ptr[l].scales + (k / 4) * 64 + 48;
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi1 = 0;
|
| | sumi2 = 0;
|
| | sumi3 = 0;
|
| | sumi4 = 0;
|
| | sumi = 0;
|
| | int offset = ((k / 2) % 2) + j * 2;
|
| | for (int i = 0; i < blocklen; ++i){
|
| | const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 3);
|
| | const int v1 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 2 ) & 3);
|
| | const int v2 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4 ) & 3);
|
| | const int v3 = (int8_t) ((b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 6 ) & 3);
|
| | sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i]);
|
| | sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]);
|
| | sumi3 = (v2 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 256]);
|
| | sumi4 = (v3 * a_ptr[l].qs[(k >> 2) * 512 + (k % 4) * 4 * blocklen + m * blocklen + i + 384]);
|
| | sumi1 = sumi1 * (scales_0[offset] & 0xF);
|
| | sumi2 = sumi2 * (scales_1[offset] & 0xF);
|
| | sumi3 = sumi3 * (scales_2[offset] & 0xF);
|
| | sumi4 = sumi4 * (scales_3[offset] & 0xF);
|
| | sumi += sumi1 + sumi2 + sumi3 + sumi4;
|
| | }
|
| | sumf[m][j] += sumi * GGML_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | for(int sb = 0; sb < 8; sb++) {
|
| | const uint8_t *mins = b_ptr[l].scales + sb * 16;
|
| | for(int m = 0; m < 4; m++) {
|
| | const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6);
|
| | for(int j = 0; j < ncols_interleaved; j++) {
|
| | int mins_prod = ((mins[j * 2] >> 4) * bsums[0] + (mins[(j * 2)+ 1] >> 4) * bsums[1]);
|
| | sum_minf[m][j] += (mins_prod) * GGML_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q5_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemm_q5_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemm_q5_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemm_q5_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemm_q6_K_8x4_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemm_q6_K_NxM_q8_K_generic_impl<4, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemm_q6_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | ggml_gemm_q6_K_NxM_q8_K_generic_impl<8, 8>(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert (n % qk == 0);
|
| | assert (nr % 4 == 0);
|
| | assert (nc % ncols_interleaved == 0);
|
| |
|
| | UNUSED(s);
|
| | UNUSED(bs);
|
| | UNUSED(vx);
|
| | UNUSED(vy);
|
| | UNUSED(nr);
|
| | UNUSED(nc);
|
| | UNUSED(nb);
|
| | UNUSED(ncols_interleaved);
|
| | UNUSED(blocklen);
|
| |
|
| | {
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][8];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_iq4_nlx8 * b_ptr = (const block_iq4_nlx8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_mxfp4_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_mxfp4x4 * b_ptr = (const block_mxfp4x4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_mxfp4_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 8;
|
| | const int blocklen = 8;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][8];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_mxfp4x8 * b_ptr = (const block_mxfp4x8 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0;
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / (2 * blocklen)); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F];
|
| | const int v1 = kvalues_mxfp4[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4];
|
| | sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) +
|
| | (v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4]));
|
| | }
|
| | sumf[m][j] += sumi * GGML_CPU_E8M0_TO_FP32_HALF(b_ptr[l].e[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++)
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q8_0_4x4_q8_0_generic(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 4;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / blocklen); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
| | sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
| | }
|
| | sumf[m][j] +=
|
| | sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | void ggml_gemm_q8_0_4x8_q8_0_generic(int n,
|
| | float * GGML_RESTRICT s,
|
| | size_t bs,
|
| | const void * GGML_RESTRICT vx,
|
| | const void * GGML_RESTRICT vy,
|
| | int nr,
|
| | int nc) {
|
| | const int qk = QK8_0;
|
| | const int nb = n / qk;
|
| | const int ncols_interleaved = 4;
|
| | const int blocklen = 8;
|
| |
|
| | assert(n % qk == 0);
|
| | assert(nr % 4 == 0);
|
| | assert(nc % ncols_interleaved == 0);
|
| |
|
| | float sumf[4][4];
|
| | int sumi;
|
| |
|
| | for (int y = 0; y < nr / 4; y++) {
|
| | const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb);
|
| | for (int x = 0; x < nc / ncols_interleaved; x++) {
|
| | const block_q8_0x4 * b_ptr = (const block_q8_0x4 *) vx + (x * nb);
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumf[m][j] = 0.0;
|
| | }
|
| | }
|
| | for (int l = 0; l < nb; l++) {
|
| | for (int k = 0; k < (qk / blocklen); k++) {
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | sumi = 0;
|
| | for (int i = 0; i < blocklen; ++i) {
|
| | const int v0 = b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i];
|
| | sumi += v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i];
|
| | }
|
| | sumf[m][j] +=
|
| | sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]);
|
| | }
|
| | }
|
| | }
|
| | }
|
| | for (int m = 0; m < 4; m++) {
|
| | for (int j = 0; j < ncols_interleaved; j++) {
|
| | s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j];
|
| | }
|
| | }
|
| | }
|
| | }
|
| | }
|
| |
|
| | }
|
| |
|
| | static block_q8_0x4 make_block_q8_0x4(block_q8_0 * in, unsigned int blck_size_interleave) {
|
| | block_q8_0x4 out;
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end = QK8_0 * 4 / blck_size_interleave;
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 4;
|
| | int src_offset = (i / 4) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
| | }
|
| | return out;
|
| | }
|
| |
|
| | static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| | block_q4_0x4 out;
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end = QK4_0 * 2 / blck_size_interleave;
|
| |
|
| | if (blck_size_interleave == 8) {
|
| | const uint64_t xor_mask = 0x8888888888888888ULL;
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 4;
|
| | int src_offset = (i / 4) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint64_t elems;
|
| |
|
| | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| | elems ^= xor_mask;
|
| | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| | }
|
| | } else if (blck_size_interleave == 4) {
|
| | const uint32_t xor_mask = 0x88888888;
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 4;
|
| | int src_offset = (i / 4) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint32_t elems;
|
| | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
|
| | elems ^= xor_mask;
|
| | memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
|
| | }
|
| | } else {
|
| | GGML_ASSERT(false);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
|
| | block_q4_0x8 out;
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end = QK4_0 * 4 / blck_size_interleave;
|
| | const uint64_t xor_mask = 0x8888888888888888ULL;
|
| |
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint64_t elems;
|
| | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| | elems ^= xor_mask;
|
| | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
|
| | block_q4_Kx8 out;
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
| | }
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
| | }
|
| |
|
| | const int end = QK_K * 4 / blck_size_interleave;
|
| |
|
| |
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| |
|
| | uint64_t elems;
|
| | memcpy(&elems, &in[src_id].qs[src_offset], blck_size_interleave);
|
| | memcpy(&out.qs[dst_offset], &elems, blck_size_interleave);
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | uint8_t s[8], m[8];
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | for (int j = 0; j < 8; j++) {
|
| | s[j] = in[j].scales[i] & 63;
|
| | m[j] = in[j].scales[i + 4] & 63;
|
| | }
|
| |
|
| | out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
|
| | out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
|
| | out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
|
| | out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
|
| | out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
|
| | out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
|
| | out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
|
| | out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
|
| | out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
|
| | out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
| | out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
| | out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
| |
|
| | }
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | for (int j = 0; j < 8; j++) {
|
| | s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
|
| | m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
|
| | }
|
| |
|
| | out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
| | out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
|
| | out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
|
| | out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
|
| | out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
|
| | out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
|
| | out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
|
| | out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
|
| | out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
|
| | out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
| | out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
| | out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
| |
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static block_q2_Kx8 make_block_q2_Kx8(block_q2_K * in, unsigned int blck_size_interleave) {
|
| | block_q2_Kx8 out;
|
| |
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
| | }
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
| | }
|
| |
|
| | const int end = QK_K * 2 / blck_size_interleave;
|
| |
|
| |
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint64_t elems;
|
| | memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| | memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | for (int i = 0; i < 128; i++) {
|
| |
|
| | int src1 = (i % 16) / 2;
|
| |
|
| | int src2 = ((i / 16) * 2) + (i % 2);
|
| |
|
| | out.scales[i] = in[src1].scales[src2];
|
| | }
|
| | return out;
|
| | }
|
| |
|
| | static block_q5_Kx8 make_block_q5_Kx8(block_q5_K * in, unsigned int blck_size_interleave) {
|
| | block_q5_Kx8 out;
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.d[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
|
| | }
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.dmin[i] = in[i].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
|
| | }
|
| |
|
| | const int end = QK_K * 4 / blck_size_interleave;
|
| |
|
| |
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], blck_size_interleave);
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | for (int i = 0; i < end / 4; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qh[dst_offset], &in[src_id].qh[src_offset], blck_size_interleave);
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | uint8_t s[8], m[8];
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | for (int j = 0; j < 8; j++) {
|
| | s[j] = in[j].scales[i] & 63;
|
| | m[j] = in[j].scales[i + 4] & 63;
|
| | }
|
| |
|
| | out.scales[i * 12] = (s[0] & 63) + ((s[4] & 48) << 2);
|
| | out.scales[i * 12 + 1] = (s[1] & 63) + ((s[5] & 48) << 2);
|
| | out.scales[i * 12 + 2] = (s[2] & 63) + ((s[6] & 48) << 2);
|
| | out.scales[i * 12 + 3] = (s[3] & 63) + ((s[7] & 48) << 2);
|
| | out.scales[i * 12 + 4] = (m[0] & 63) + ((m[4] & 48) << 2);
|
| | out.scales[i * 12 + 5] = (m[1] & 63) + ((m[5] & 48) << 2);
|
| | out.scales[i * 12 + 6] = (m[2] & 63) + ((m[6] & 48) << 2);
|
| | out.scales[i * 12 + 7] = (m[3] & 63) + ((m[7] & 48) << 2);
|
| | out.scales[i * 12 + 8] = (s[4] & 15) + ((m[4] & 15) << 4);
|
| | out.scales[i * 12 + 9] = (s[5] & 15) + ((m[5] & 15) << 4);
|
| | out.scales[i * 12 + 10] = (s[6] & 15) + ((m[6] & 15) << 4);
|
| | out.scales[i * 12 + 11] = (s[7] & 15) + ((m[7] & 15) << 4);
|
| | }
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | for (int j = 0; j < 8; j++) {
|
| | s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i + 8] & 15);
|
| | m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i + 8] & 240) >> 4);
|
| | }
|
| |
|
| | out.scales[i * 12 + 48] = (s[0] & 63) + ((s[4] & 48) << 2);
|
| | out.scales[i * 12 + 49] = (s[1] & 63) + ((s[5] & 48) << 2);
|
| | out.scales[i * 12 + 50] = (s[2] & 63) + ((s[6] & 48) << 2);
|
| | out.scales[i * 12 + 51] = (s[3] & 63) + ((s[7] & 48) << 2);
|
| | out.scales[i * 12 + 52] = (m[0] & 63) + ((m[4] & 48) << 2);
|
| | out.scales[i * 12 + 53] = (m[1] & 63) + ((m[5] & 48) << 2);
|
| | out.scales[i * 12 + 54] = (m[2] & 63) + ((m[6] & 48) << 2);
|
| | out.scales[i * 12 + 55] = (m[3] & 63) + ((m[7] & 48) << 2);
|
| | out.scales[i * 12 + 56] = (s[4] & 15) + ((m[4] & 15) << 4);
|
| | out.scales[i * 12 + 57] = (s[5] & 15) + ((m[5] & 15) << 4);
|
| | out.scales[i * 12 + 58] = (s[6] & 15) + ((m[6] & 15) << 4);
|
| | out.scales[i * 12 + 59] = (s[7] & 15) + ((m[7] & 15) << 4);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static block_q6_Kx8 make_block_q6_Kx8(block_q6_K * in, unsigned int blck_size_interleave) {
|
| | block_q6_Kx8 out;
|
| | constexpr int n_blocks = 8;
|
| | for (int i = 0; i < n_blocks; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end_ls = QK_K * 4 / blck_size_interleave;
|
| |
|
| | for (int i = 0; i < end_ls; ++i) {
|
| | int src_id = i % n_blocks;
|
| | int src_offset = (i / n_blocks) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint64_t elem_ls;
|
| | memcpy(&elem_ls, &in[src_id].ql[src_offset], blck_size_interleave);
|
| | memcpy(&out.ql[dst_offset], &elem_ls, blck_size_interleave);
|
| | }
|
| |
|
| |
|
| | const int end_hs = end_ls / 2;
|
| | for (int i = 0; i < end_hs; ++i) {
|
| | int src_id = i % n_blocks;
|
| | int src_offset = (i / n_blocks) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | uint64_t elem_hs;
|
| | memcpy(&elem_hs, &in[src_id].qh[src_offset], blck_size_interleave);
|
| | memcpy(&out.qh[dst_offset], &elem_hs, blck_size_interleave);
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | constexpr int n_scales = QK_K / 16;
|
| |
|
| | for (int i = 0; i < n_blocks; i++) {
|
| | for (int j = 0; j < n_scales; j++) {
|
| | out.scales[j * n_blocks + i] = in[i].scales[j];
|
| | }
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
| | GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
| | constexpr int nrows_interleaved = 4;
|
| |
|
| | block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
|
| | const block_q4_0 * src = (const block_q4_0 *)data;
|
| | block_q4_0 dst_tmp[4];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK4_0;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q4_0x4(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q4_K);
|
| | GGML_ASSERT(interleave_block == 8 || interleave_block == 4);
|
| | constexpr int nrows_interleaved = 8;
|
| |
|
| | block_q4_Kx8 * dst = (block_q4_Kx8*)t->data;
|
| | const block_q4_K * src = (const block_q4_K*) data;
|
| | block_q4_K dst_tmp[8];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK_K;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++ ) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q4_Kx8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static int repack_q2_K_to_q2_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q2_K);
|
| | GGML_ASSERT(interleave_block == 8);
|
| | constexpr int nrows_interleaved = 8;
|
| |
|
| | block_q2_Kx8 * dst = (block_q2_Kx8*)t->data;
|
| | const block_q2_K * src = (const block_q2_K*) data;
|
| | block_q2_K dst_tmp[8];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK_K;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q2_K));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q2_Kx8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static int repack_q5_K_to_q5_K_8_bl(struct ggml_tensor * t,
|
| | int interleave_block,
|
| | const void * GGML_RESTRICT data,
|
| | size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q5_K);
|
| | GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
| | constexpr int nrows_interleaved = 8;
|
| |
|
| | block_q5_Kx8 * dst = (block_q5_Kx8 *) t->data;
|
| | const block_q5_K * src = (const block_q5_K *) data;
|
| | block_q5_K dst_tmp[8];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK_K;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q5_K));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q5_Kx8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| | }
|
| |
|
| | static int repack_q6_K_to_q6_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q6_K);
|
| | GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
| | constexpr int nrows_interleaved = 8;
|
| |
|
| | block_q6_Kx8 * dst = (block_q6_Kx8 *)t->data;
|
| | const block_q6_K * src = (const block_q6_K *) data;
|
| | block_q6_K dst_tmp[8];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK_K;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q6_K));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q6_Kx8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| | }
|
| |
|
| | static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
|
| | GGML_ASSERT(interleave_block == 8);
|
| | constexpr int nrows_interleaved = 8;
|
| |
|
| | block_q4_0x8 * dst = (block_q4_0x8*)t->data;
|
| | const block_q4_0 * src = (const block_q4_0*) data;
|
| | block_q4_0 dst_tmp[8];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK4_0;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++ ) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q4_0x8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static int repack_q8_0_to_q8_0_4_bl(struct ggml_tensor * t,
|
| | int interleave_block,
|
| | const void * GGML_RESTRICT data,
|
| | size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_Q8_0);
|
| | GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
|
| | constexpr int nrows_interleaved = 4;
|
| |
|
| | block_q8_0x4 * dst = (block_q8_0x4 *) t->data;
|
| | const block_q8_0 * src = (const block_q8_0 *) data;
|
| | block_q8_0 dst_tmp[4];
|
| | int nrow = ggml_nrows(t);
|
| | int nblocks = t->ne[0] / QK8_0;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q8_0));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_q8_0x4(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| | }
|
| |
|
| | static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
| | block_iq4_nlx4 out;
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end = QK4_NL * 2 / blck_size_interleave;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | if (blck_size_interleave == 4) {
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 4;
|
| | int src_offset = (i / 4) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
|
| | }
|
| | } else {
|
| | GGML_ASSERT(false);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
| | GGML_ASSERT(interleave_block == 4);
|
| |
|
| | const block_iq4_nl * src = (const block_iq4_nl *)data;
|
| | block_iq4_nlx4 * dst = ( block_iq4_nlx4 *)t->data;
|
| |
|
| | block_iq4_nl dst_tmp[4];
|
| |
|
| | int nrow = ggml_nrows(t);
|
| | int nrows_interleaved = 4;
|
| | int nblocks = t->ne[0] / QK4_NL;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static block_iq4_nlx8 make_block_iq4_nlx8(block_iq4_nl * in, unsigned int blck_size_interleave) {
|
| | block_iq4_nlx8 out;
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.d[i] = in[i].d;
|
| | }
|
| |
|
| | const int end = QK4_NL * 4 / blck_size_interleave;
|
| |
|
| | if (blck_size_interleave == 8) {
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| | }
|
| | } else {
|
| | GGML_ASSERT(false);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static int repack_iq4_nl_to_iq4_nl_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL);
|
| | GGML_ASSERT(interleave_block == 8);
|
| |
|
| | const block_iq4_nl * src = (const block_iq4_nl *)data;
|
| | block_iq4_nlx8 * dst = ( block_iq4_nlx8 *)t->data;
|
| |
|
| | block_iq4_nl dst_tmp[8];
|
| |
|
| | int nrow = ggml_nrows(t);
|
| | int nrows_interleaved = 8;
|
| | int nblocks = t->ne[0] / QK4_NL;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_iq4_nlx8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| |
|
| | static block_mxfp4x4 make_block_mxfp4x4(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
| | block_mxfp4x4 out;
|
| |
|
| | for (int i = 0; i < 4; i++) {
|
| | out.e[i] = in[i].e;
|
| | }
|
| |
|
| | const int end = QK_MXFP4 * 2 / blck_size_interleave;
|
| |
|
| | if (blck_size_interleave == 4) {
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 4;
|
| | int src_offset = (i / 4) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint32_t));
|
| | }
|
| | } else {
|
| | GGML_ASSERT(false);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static int repack_mxfp4_to_mxfp4_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
| | GGML_ASSERT(interleave_block == 4);
|
| |
|
| | const block_mxfp4 * src = (const block_mxfp4 *)data;
|
| | block_mxfp4x4 * dst = ( block_mxfp4x4 *)t->data;
|
| |
|
| | block_mxfp4 dst_tmp[4];
|
| |
|
| | int nrow = ggml_nrows(t);
|
| | int nrows_interleaved = 4;
|
| | int nblocks = t->ne[0] / QK_MXFP4;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_mxfp4x4(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | static block_mxfp4x8 make_block_mxfp4x8(block_mxfp4 * in, unsigned int blck_size_interleave) {
|
| | block_mxfp4x8 out;
|
| |
|
| | for (int i = 0; i < 8; i++) {
|
| | out.e[i] = in[i].e;
|
| | }
|
| |
|
| | const int end = QK_MXFP4 * 4 / blck_size_interleave;
|
| |
|
| | if (blck_size_interleave == 8) {
|
| | for (int i = 0; i < end; ++i) {
|
| | int src_id = i % 8;
|
| | int src_offset = (i / 8) * blck_size_interleave;
|
| | int dst_offset = i * blck_size_interleave;
|
| |
|
| | memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
|
| | }
|
| | } else {
|
| | GGML_ASSERT(false);
|
| | }
|
| |
|
| | return out;
|
| | }
|
| |
|
| | static int repack_mxfp4_to_mxfp4_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) {
|
| | GGML_ASSERT(t->type == GGML_TYPE_MXFP4);
|
| | GGML_ASSERT(interleave_block == 8);
|
| |
|
| | const block_mxfp4 * src = (const block_mxfp4 *)data;
|
| | block_mxfp4x8 * dst = ( block_mxfp4x8 *)t->data;
|
| |
|
| | block_mxfp4 dst_tmp[8];
|
| |
|
| | int nrow = ggml_nrows(t);
|
| | int nrows_interleaved = 8;
|
| | int nblocks = t->ne[0] / QK_MXFP4;
|
| |
|
| | GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_mxfp4));
|
| |
|
| | if (t->ne[1] % nrows_interleaved != 0) {
|
| | return -1;
|
| | }
|
| |
|
| | for (int b = 0; b < nrow; b += nrows_interleaved) {
|
| | for (int64_t x = 0; x < nblocks; x++) {
|
| | for (int i = 0; i < nrows_interleaved; i++) {
|
| | dst_tmp[i] = src[x + i * nblocks];
|
| | }
|
| | *dst++ = make_block_mxfp4x8(dst_tmp, interleave_block);
|
| | }
|
| | src += nrows_interleaved * nblocks;
|
| | }
|
| | return 0;
|
| |
|
| | GGML_UNUSED(data_size);
|
| | }
|
| |
|
| | namespace ggml::cpu::repack {
|
| |
|
| | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS>
|
| | int repack(struct ggml_tensor *, const void *, size_t);
|
| |
|
| |
|
| | template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q4_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q4_K_to_q4_K_8_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q2_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q2_K_to_q2_K_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q5_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q5_K_to_q5_K_8_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q5_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q5_K_to_q5_K_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q6_K, 4, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q6_K_to_q6_K_8_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q6_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q6_K_to_q6_K_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | template <> int repack<block_iq4_nl, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_iq4_nl_to_iq4_nl_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_mxfp4, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_mxfp4_to_mxfp4_4_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_mxfp4, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_mxfp4_to_mxfp4_8_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q8_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q8_0_to_q8_0_4_bl(t, 4, data, data_size);
|
| | }
|
| |
|
| | template <> int repack<block_q8_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) {
|
| | return repack_q8_0_to_q8_0_4_bl(t, 8, data, data_size);
|
| | }
|
| |
|
| |
|
| | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
| | void gemv(int, float *, size_t, const void *, const void *, int, int);
|
| |
|
| | template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <>
|
| | void gemv<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n,
|
| | float * s,
|
| | size_t bs,
|
| | const void * vx,
|
| | const void * vy,
|
| | int nr,
|
| | int nc) {
|
| | ggml_gemv_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemv<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemv_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| |
|
| | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE>
|
| | void gemm(int, float *, size_t, const void *, const void *, int, int);
|
| |
|
| | template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <>
|
| | void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n,
|
| | float * s,
|
| | size_t bs,
|
| | const void * vx,
|
| | const void * vy,
|
| | int nr,
|
| | int nc) {
|
| | ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q2_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q2_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q4_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q4_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q5_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q5_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q5_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q5_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q6_K, 4, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q6_K_8x4_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q6_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q6_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_iq4_nl_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_mxfp4, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_mxfp4_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_mxfp4, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_mxfp4_8x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q8_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q8_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | template <> void gemm<block_q8_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) {
|
| | ggml_gemm_q8_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc);
|
| | }
|
| |
|
| | class tensor_traits_base : public ggml::cpu::tensor_traits {
|
| | public:
|
| | virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
|
| | };
|
| |
|
| | template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {
|
| |
|
| | bool work_size(int , const struct ggml_tensor * op, size_t & size) override {
|
| |
|
| | switch (op->op) {
|
| | case GGML_OP_MUL_MAT:
|
| | {
|
| | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
|
| | return true;
|
| | }
|
| | case GGML_OP_MUL_MAT_ID:
|
| | {
|
| | size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
|
| | size = GGML_PAD(size, sizeof(int64_t));
|
| |
|
| | const int64_t ne02 = op->src[0]->ne[2];
|
| | const int64_t ne12 = op->src[1]->ne[2];
|
| |
|
| | const size_t sizeof_mmid_row_mapping = sizeof(int64_t);
|
| |
|
| | size += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
|
| |
|
| | return true;
|
| | }
|
| | default:
|
| |
|
| | break;
|
| | }
|
| | return false;
|
| | }
|
| |
|
| | bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
|
| | switch (op->op) {
|
| | case GGML_OP_MUL_MAT:
|
| | forward_mul_mat(params, op);
|
| | return true;
|
| | case GGML_OP_MUL_MAT_ID:
|
| | forward_mul_mat_id(params, op);
|
| | return true;
|
| | default:
|
| |
|
| | break;
|
| | }
|
| | return false;
|
| | }
|
| |
|
| | void forward_mul_mat_one_chunk(ggml_compute_params * params,
|
| | ggml_tensor * op,
|
| | int64_t src0_start,
|
| | int64_t src0_end,
|
| | int64_t src1_start,
|
| | int64_t src1_end) {
|
| | const ggml_tensor * src0 = op->src[0];
|
| | const ggml_tensor * src1 = op->src[1];
|
| | ggml_tensor * dst = op;
|
| |
|
| | GGML_TENSOR_BINARY_OP_LOCALS
|
| |
|
| | const size_t src1_col_stride = ggml_row_size(PARAM_TYPE, ne10);
|
| |
|
| | GGML_ASSERT(ne03 == 1 && ne13 == 1);
|
| | GGML_ASSERT(ne12 % ne02 == 0);
|
| | const int64_t r2 = ne12 / ne02;
|
| |
|
| | const int64_t i12 = src1_start / ne1;
|
| | const int64_t i11 = src1_start - i12 * ne1;
|
| |
|
| |
|
| | const int64_t i02 = i12 / r2;
|
| |
|
| | const int64_t i1 = i11;
|
| | const int64_t i2 = i12;
|
| |
|
| | const char * src0_ptr = (const char *) src0->data + i02 * nb02;
|
| | const char * src1_ptr = (const char *) params->wdata + (i11 + i12 * ne11) * src1_col_stride;
|
| | char * dst_ptr = ((char *) dst->data + (i1 * nb1 + i2 * nb2));
|
| |
|
| | const int64_t nrows = src1_end - src1_start;
|
| | const int64_t ncols = src0_end - src0_start;
|
| |
|
| | GGML_ASSERT(src1_ptr + src1_col_stride * nrows <= (const char *) params->wdata + params->wsize);
|
| |
|
| |
|
| | if (nrows > 3) {
|
| | gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr) + src0_start, nb1 / nb0,
|
| | src0_ptr + src0_start * nb01, src1_ptr,
|
| | nrows - (nrows % 4), ncols);
|
| | }
|
| | for (int iter = nrows - (nrows % 4); iter < nrows; iter++) {
|
| | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00, (float *) (dst_ptr + (iter * nb1)) + src0_start,
|
| | ne01, src0_ptr + src0_start * nb01,
|
| | src1_ptr + (src1_col_stride * iter), 1 , ncols);
|
| | }
|
| | }
|
| |
|
| | void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
|
| | const ggml_tensor * src0 = op->src[0];
|
| | const ggml_tensor * src1 = op->src[1];
|
| | ggml_tensor * dst = op;
|
| |
|
| | GGML_TENSOR_BINARY_OP_LOCALS
|
| |
|
| | const int ith = params->ith;
|
| | const int nth = params->nth;
|
| |
|
| | GGML_ASSERT(ne0 == ne01);
|
| | GGML_ASSERT(ne1 == ne11);
|
| | GGML_ASSERT(ne2 == ne12);
|
| | GGML_ASSERT(ne3 == ne13);
|
| |
|
| |
|
| | GGML_ASSERT(nb0 == sizeof(float));
|
| | GGML_ASSERT(nb0 <= nb1);
|
| | GGML_ASSERT(nb1 <= nb2);
|
| | GGML_ASSERT(nb2 <= nb3);
|
| |
|
| |
|
| |
|
| | GGML_ASSERT(ne03 == 1);
|
| | GGML_ASSERT(ne13 == 1);
|
| | GGML_ASSERT(ne3 == 1);
|
| |
|
| | GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| |
|
| | GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);
|
| |
|
| |
|
| | char * wdata = static_cast<char *>(params->wdata);
|
| | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
|
| | const size_t nbw2 = nbw1 * ne11;
|
| |
|
| | assert(params->wsize >= nbw2 * ne12);
|
| |
|
| | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
|
| |
|
| |
|
| |
|
| |
|
| | for (int64_t i12 = 0; i12 < ne12; i12++) {
|
| | char * data_ptr = (char *) src1->data + i12 * nb12;
|
| | char * wdata_ptr = wdata + i12 * nbw2;
|
| |
|
| | for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
| | ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) (data_ptr + i11 * nb11),
|
| | (void *) (wdata_ptr + i11 * nbw1), 4, ne10);
|
| | }
|
| |
|
| | const int64_t i11_processed = ne11 - ne11 % 4;
|
| | for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
| | from_float((float *) (data_ptr + i11 * nb11), (void *) (wdata_ptr + i11 * nbw1), ne10);
|
| | }
|
| | }
|
| |
|
| |
|
| | const bool disable_chunking = ggml_is_numa();
|
| |
|
| |
|
| | const int64_t nr0 = ggml_nrows(op->src[0]);
|
| |
|
| | int nth_scaled = nth * 4;
|
| | int64_t chunk_size0 = (nr0 + nth_scaled - 1) / nth_scaled;
|
| | int64_t nchunk0 = (nr0 + chunk_size0 - 1) / chunk_size0;
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | int64_t nchunk1 = ne12;
|
| |
|
| |
|
| |
|
| | const int64_t min_chunk_size = NB_COLS;
|
| | if (nchunk0 > 0 && (nr0 / nchunk0) < min_chunk_size && nr0 >= min_chunk_size) {
|
| | nchunk0 = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
| | }
|
| |
|
| | int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
| |
|
| | if (nth == 1 || ((nchunk0 < nth || disable_chunking) && (nr0 + nth - 1) / nth >= min_chunk_size)) {
|
| | nchunk0 = nth;
|
| | dr0 = (nr0 + nchunk0 - 1) / nchunk0;
|
| | }
|
| |
|
| |
|
| |
|
| | const int64_t max_nchunk = (nr0 + min_chunk_size - 1) / min_chunk_size;
|
| | nchunk0 = MIN(nchunk0, max_nchunk);
|
| |
|
| | if (ith == 0) {
|
| |
|
| | ggml_threadpool_chunk_set(params->threadpool, nth);
|
| | }
|
| |
|
| | ggml_barrier(params->threadpool);
|
| |
|
| |
|
| | int current_chunk = ith;
|
| |
|
| | while (current_chunk < nchunk0 * nchunk1) {
|
| | const int64_t ith0 = current_chunk % nchunk0;
|
| | const int64_t ith1 = current_chunk / nchunk0;
|
| |
|
| | int64_t src0_start = dr0 * ith0;
|
| | int64_t src0_end = MIN(src0_start + dr0, nr0);
|
| |
|
| |
|
| | int64_t src1_start = ith1 * ne11;
|
| | int64_t src1_end = (ith1 + 1) * ne11;
|
| |
|
| |
|
| |
|
| | src0_start = (src0_start % NB_COLS) ? src0_start + NB_COLS - (src0_start % NB_COLS) : src0_start;
|
| | src0_end = (src0_end % NB_COLS) ? src0_end + NB_COLS - (src0_end % NB_COLS) : src0_end;
|
| | src0_end = MIN(src0_end, ne01);
|
| |
|
| |
|
| | if (src0_start >= src0_end) {
|
| | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
| | continue;
|
| | }
|
| |
|
| | forward_mul_mat_one_chunk(params, dst, src0_start, src0_end, src1_start, src1_end);
|
| |
|
| | current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1);
|
| | }
|
| | }
|
| |
|
| | void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
|
| | const ggml_tensor * src0 = op->src[0];
|
| | const ggml_tensor * src1 = op->src[1];
|
| | const ggml_tensor * ids = op->src[2];
|
| | ggml_tensor * dst = op;
|
| |
|
| | GGML_TENSOR_BINARY_OP_LOCALS
|
| |
|
| | const int ith = params->ith;
|
| | const int nth = params->nth;
|
| |
|
| | const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;
|
| |
|
| |
|
| | GGML_ASSERT(nb00 == ggml_type_size(src0->type));
|
| | GGML_ASSERT(nb10 == ggml_type_size(src1->type));
|
| |
|
| |
|
| | GGML_ASSERT(nb0 == sizeof(float));
|
| | GGML_ASSERT(nb0 <= nb1);
|
| | GGML_ASSERT(nb1 <= nb2);
|
| | GGML_ASSERT(nb2 <= nb3);
|
| |
|
| | GGML_ASSERT(ne03 == 1);
|
| | GGML_ASSERT(ne13 == 1);
|
| | GGML_ASSERT(ne3 == 1);
|
| |
|
| | GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
| |
|
| |
|
| | const int n_ids = ids->ne[0];
|
| | const int n_as = ne02;
|
| |
|
| | const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
|
| | const size_t nbw2 = nbw1*ne11;
|
| | const size_t nbw3 = nbw2*ne12;
|
| |
|
| | struct mmid_row_mapping {
|
| | int32_t i1;
|
| | int32_t i2;
|
| | };
|
| |
|
| | GGML_ASSERT(params->wsize >=
|
| | (GGML_PAD(nbw3, sizeof(int64_t)) +
|
| | n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
|
| | );
|
| |
|
| | auto * wdata = (char *)params->wdata;
|
| | auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
|
| |
|
| |
|
| | auto * matrix_row_counts = (int64_t *) (wdata_src1_end);
|
| | struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as);
|
| |
|
| |
|
| | for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
| | for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
| | from_float((float *)((char *) src1->data + i12 * nb12 + i11 * nb11),
|
| | (void *) (wdata + i12 * nbw2 + i11 * nbw1),
|
| | ne10);
|
| | }
|
| | }
|
| |
|
| | #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ne12 + (i1)]
|
| |
|
| | if (ith == 0) {
|
| |
|
| | memset(matrix_row_counts, 0, n_as * sizeof(int64_t));
|
| |
|
| |
|
| | for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
|
| | for (int32_t id = 0; id < n_ids; ++id) {
|
| | const int32_t i02 =
|
| | *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);
|
| |
|
| | GGML_ASSERT(i02 >= 0 && i02 < n_as);
|
| |
|
| | MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = { id, iid1 };
|
| | matrix_row_counts[i02] += 1;
|
| | }
|
| | }
|
| | }
|
| |
|
| | ggml_barrier(params->threadpool);
|
| |
|
| |
|
| | for (int cur_a = 0; cur_a < n_as; ++cur_a) {
|
| | const int64_t cne1 = matrix_row_counts[cur_a];
|
| |
|
| | if (cne1 == 0) {
|
| | continue;
|
| | }
|
| |
|
| | const auto * src0_cur = (const char *) src0->data + cur_a*nb02;
|
| |
|
| |
|
| | const int64_t nr1 = cne1;
|
| |
|
| | int64_t src0_cur_start = (ith * ne01) / nth;
|
| | int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
|
| |
|
| |
|
| | src0_cur_start = (src0_cur_start % NB_COLS) ? src0_cur_start + NB_COLS - (src0_cur_start % NB_COLS) : src0_cur_start;
|
| | src0_cur_end = (src0_cur_end % NB_COLS) ? src0_cur_end + NB_COLS - (src0_cur_end % NB_COLS) : src0_cur_end;
|
| | if (src0_cur_end > ne01) {
|
| | src0_cur_end = ne01;
|
| | }
|
| |
|
| | if (src0_cur_start >= src0_cur_end) {
|
| | return;
|
| | }
|
| |
|
| | for (int ir1 = 0; ir1 < nr1; ir1++) {
|
| | struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
|
| |
|
| | const int id = row_mapping.i1;
|
| |
|
| | const int64_t i11 = id % ne11;
|
| | const int64_t i12 = row_mapping.i2;
|
| |
|
| | const int64_t i1 = id;
|
| | const int64_t i2 = i12;
|
| |
|
| | const auto * src1_col = (const char *) wdata + (i11 * nbw1 + i12 * nbw2);
|
| |
|
| | gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(
|
| | ne00, (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
|
| | src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
|
| | }
|
| | }
|
| | #undef MMID_MATRIX_ROW
|
| | }
|
| |
|
| | int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
|
| | GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
|
| | (int) NB_COLS, (int) INTER_SIZE);
|
| | return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
|
| | }
|
| | };
|
| |
|
| | }
|
| |
|
| | static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
|
| | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
|
| | static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q4_K, 4, 8, GGML_TYPE_Q8_K> q4_K_8x4_q8_K;
|
| | static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q5_K, 4, 8, GGML_TYPE_Q8_K> q5_K_8x4_q8_K;
|
| | static const ggml::cpu::repack::tensor_traits<block_q5_K, 8, 8, GGML_TYPE_Q8_K> q5_K_8x8_q8_K;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q6_K, 4, 8, GGML_TYPE_Q8_K> q6_K_8x4_q8_K;
|
| | static const ggml::cpu::repack::tensor_traits<block_q6_K, 8, 8, GGML_TYPE_Q8_K> q6_K_8x8_q8_K;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q2_K, 8, 8, GGML_TYPE_Q8_K> q2_K_8x8_q8_K;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;
|
| | static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 8, 8, GGML_TYPE_Q8_0> iq4_nl_8x8_q8_0;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_mxfp4, 4, 4, GGML_TYPE_Q8_0> mxfp4_4x4_q8_0;
|
| | static const ggml::cpu::repack::tensor_traits<block_mxfp4, 8, 8, GGML_TYPE_Q8_0> mxfp4_8x8_q8_0;
|
| |
|
| |
|
| | static const ggml::cpu::repack::tensor_traits<block_q8_0, 4, 4, GGML_TYPE_Q8_0> q8_0_4x4_q8_0;
|
| | static const ggml::cpu::repack::tensor_traits<block_q8_0, 8, 4, GGML_TYPE_Q8_0> q8_0_4x8_q8_0;
|
| |
|
| | if (cur->type == GGML_TYPE_Q4_0) {
|
| | if (ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)
|
| | || (ggml_cpu_has_riscv_v() && (ggml_cpu_get_rvv_vlen() >= QK4_0))) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q4_0_8x8_q8_0;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &q4_0_4x8_q8_0;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &q4_0_4x4_q8_0;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_Q4_K) {
|
| | if (ggml_cpu_has_avx2()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q4_K_8x8_q8_K;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q4_K_8x8_q8_K;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q4_K_8x4_q8_K;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_Q2_K) {
|
| | if (ggml_cpu_has_avx512()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q2_K_8x8_q8_K;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_Q5_K) {
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q5_K_8x8_q8_K;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q5_K_8x4_q8_K;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_Q6_K) {
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q6_K_8x8_q8_K;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &q6_K_8x4_q8_K;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_IQ4_NL) {
|
| | if (ggml_cpu_has_avx2()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &iq4_nl_8x8_q8_0;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &iq4_nl_4x4_q8_0;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_MXFP4) {
|
| | if (ggml_cpu_has_avx2()) {
|
| | if (cur->ne[1] % 8 == 0) {
|
| | return &mxfp4_8x8_q8_0;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &mxfp4_4x4_q8_0;
|
| | }
|
| | }
|
| | } else if (cur->type == GGML_TYPE_Q8_0) {
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &q8_0_4x8_q8_0;
|
| | }
|
| | }
|
| | if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) {
|
| | if (cur->ne[1] % 4 == 0) {
|
| | return &q8_0_4x4_q8_0;
|
| | }
|
| | }
|
| | }
|
| |
|
| | return nullptr;
|
| | }
|
| |
|
| | static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
| | tensor->extra = (void *) const_cast<ggml::cpu::tensor_traits *>(ggml_repack_get_optimal_repack_type(tensor));
|
| |
|
| | GGML_UNUSED(buffer);
|
| | return GGML_STATUS_SUCCESS;
|
| | }
|
| |
|
| | static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
|
| | const void * data, size_t offset, size_t size) {
|
| | GGML_ASSERT(offset == 0);
|
| | GGML_ASSERT(size == ggml_nbytes(tensor));
|
| |
|
| | auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
|
| | auto OK = tensor_traits->repack(tensor, data, size);
|
| |
|
| | GGML_ASSERT(OK == 0);
|
| | GGML_UNUSED(buffer);
|
| | }
|
| |
|
| | static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
| | return "CPU_REPACK";
|
| |
|
| | GGML_UNUSED(buft);
|
| | }
|
| |
|
| | static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
| | ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
| |
|
| | if (buffer == nullptr) {
|
| | return nullptr;
|
| | }
|
| |
|
| | buffer->buft = buft;
|
| | buffer->iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
|
| | buffer->iface.set_tensor = ggml_backend_cpu_repack_buffer_set_tensor;
|
| | buffer->iface.get_tensor = nullptr;
|
| | buffer->iface.cpy_tensor = nullptr;
|
| | return buffer;
|
| | }
|
| |
|
| | static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
| | return TENSOR_ALIGNMENT;
|
| |
|
| | GGML_UNUSED(buft);
|
| | }
|
| |
|
| | namespace ggml::cpu::repack {
|
| | class extra_buffer_type : ggml::cpu::extra_buffer_type {
|
| | bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override {
|
| | if ( op->op == GGML_OP_MUL_MAT &&
|
| | op->src[0]->buffer &&
|
| | (ggml_n_dims(op->src[0]) == 2) &&
|
| | op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() &&
|
| | ggml_repack_get_optimal_repack_type(op->src[0])
|
| | ) {
|
| | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
| | return false;
|
| | }
|
| | if (op->src[1]->type == GGML_TYPE_F32) {
|
| | return true;
|
| | }
|
| |
|
| |
|
| |
|
| |
|
| | } else if (op->op == GGML_OP_MUL_MAT_ID
|
| | && op->src[0]->buffer
|
| | && (ggml_n_dims(op->src[0]) == 3)
|
| | && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()
|
| | && ggml_repack_get_optimal_repack_type(op->src[0])
|
| | ) {
|
| | if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
|
| | return false;
|
| | }
|
| | if (op->src[1]->type == GGML_TYPE_F32) {
|
| | return true;
|
| | }
|
| |
|
| |
|
| |
|
| | }
|
| | return false;
|
| | }
|
| |
|
| | ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
|
| | if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) {
|
| | if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) {
|
| | return (ggml::cpu::tensor_traits *) op->src[0]->extra;
|
| | }
|
| | }
|
| | return nullptr;
|
| | }
|
| | };
|
| | }
|
| |
|
| | ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
|
| | static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_repack = {
|
| | {
|
| | ggml_backend_cpu_repack_buffer_type_get_name,
|
| | ggml_backend_cpu_repack_buffer_type_alloc_buffer,
|
| | ggml_backend_cpu_repack_buffer_type_get_alignment,
|
| | nullptr,
|
| | nullptr,
|
| | nullptr,
|
| | },
|
| | ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
| | new ggml::cpu::repack::extra_buffer_type(),
|
| | };
|
| |
|
| | return &ggml_backend_cpu_buffer_type_repack;
|
| | }
|
| |
|