|
|
#define GGML_COMMON_IMPL_CPP |
|
|
#define GGML_COMMON_DECL_CPP |
|
|
#include "ggml-common.h" |
|
|
#include "ggml-backend-impl.h" |
|
|
|
|
|
#include "ggml-impl.h" |
|
|
#include "ggml-cpu.h" |
|
|
#include "ggml-cpu-impl.h" |
|
|
#include "simd-mappings.h" |
|
|
#include "traits.h" |
|
|
|
|
|
#include "arch-fallback.h" |
|
|
|
|
|
#include <cmath> |
|
|
#include <cstring> |
|
|
#include <cassert> |
|
|
#include <cstdlib> |
|
|
#include <cstdio> |
|
|
|
|
|
#include "repack.h" |
|
|
|
|
|
#if defined(__GNUC__) |
|
|
#pragma GCC diagnostic ignored "-Woverlength-strings" |
|
|
#endif |
|
|
|
|
|
#define UNUSED GGML_UNUSED |
|
|
|
|
|
// Converts a float to an integer by adding the constant 12582912.0f
// (1.5 * 2^23) and reading the rounded result out of the low 23 mantissa bits.
// The input magnitude must not exceed 4194303 (checked by assert).
static inline int nearest_int(float fval) {
    assert(fabsf(fval) <= 4194303.f);

    const float shifted = fval + 12582912.f;

    // Reinterpret the float's bit pattern without violating aliasing rules.
    int bits;
    memcpy(&bits, &shifted, sizeof(int));

    const int mantissa = bits & 0x007fffff;
    return mantissa - 0x00400000;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
extern "C" { |
|
|
|
|
|
void ggml_quantize_mat_q8_0_4x4_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
|
|
assert(QK8_0 == 32); |
|
|
assert(k % QK8_0 == 0); |
|
|
const int nb = k / QK8_0; |
|
|
|
|
|
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
|
|
|
|
|
|
|
|
const int blck_size_interleave = 4; |
|
|
float srcv[4][QK8_0]; |
|
|
float id[4]; |
|
|
|
|
|
for (int i = 0; i < nb; i++) { |
|
|
for (int row_iter = 0; row_iter < 4; row_iter++) { |
|
|
float amax = 0.0f; |
|
|
|
|
|
for (int j = 0; j < QK8_0; j++) { |
|
|
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
|
|
amax = MAX(amax, fabsf(srcv[row_iter][j])); |
|
|
} |
|
|
|
|
|
const float d = amax / ((1 << 7) - 1); |
|
|
id[row_iter] = d ? 1.0f / d : 0.0f; |
|
|
|
|
|
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
|
|
} |
|
|
|
|
|
for (int j = 0; j < QK8_0 * 4; j++) { |
|
|
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
|
|
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
|
|
src_offset += (j % blck_size_interleave); |
|
|
|
|
|
float x0 = srcv[src_id][src_offset] * id[src_id]; |
|
|
y[i].qs[j] = roundf(x0); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_quantize_mat_q8_0_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
|
|
assert(QK8_0 == 32); |
|
|
assert(k % QK8_0 == 0); |
|
|
const int nb = k / QK8_0; |
|
|
|
|
|
block_q8_0x4 * GGML_RESTRICT y = (block_q8_0x4 *) vy; |
|
|
|
|
|
|
|
|
const int blck_size_interleave = 8; |
|
|
float srcv[4][QK8_0]; |
|
|
float id[4]; |
|
|
|
|
|
for (int i = 0; i < nb; i++) { |
|
|
for (int row_iter = 0; row_iter < 4; row_iter++) { |
|
|
float amax = 0.0f; |
|
|
|
|
|
for (int j = 0; j < QK8_0; j++) { |
|
|
srcv[row_iter][j] = x[row_iter * k + i * QK8_0 + j]; |
|
|
amax = MAX(amax, fabsf(srcv[row_iter][j])); |
|
|
} |
|
|
|
|
|
const float d = amax / ((1 << 7) - 1); |
|
|
id[row_iter] = d ? 1.0f / d : 0.0f; |
|
|
|
|
|
y[i].d[row_iter] = GGML_CPU_FP32_TO_FP16(d); |
|
|
} |
|
|
|
|
|
for (int j = 0; j < QK8_0 * 4; j++) { |
|
|
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
|
|
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
|
|
src_offset += (j % blck_size_interleave); |
|
|
|
|
|
float x0 = srcv[src_id][src_offset] * id[src_id]; |
|
|
y[i].qs[j] = roundf(x0); |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_quantize_mat_q8_K_4x8_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { |
|
|
assert(QK_K == 256); |
|
|
assert(k % QK_K == 0); |
|
|
const int nb = k / QK_K; |
|
|
|
|
|
block_q8_Kx4 * GGML_RESTRICT y = (block_q8_Kx4 *) vy; |
|
|
|
|
|
|
|
|
const int blck_size_interleave = 8; |
|
|
float srcv[4][QK_K]; |
|
|
float iscale[4]; |
|
|
|
|
|
for (int i = 0; i < nb; i++) { |
|
|
for (int row_iter = 0; row_iter < 4; row_iter++) { |
|
|
float amax = 0.0f; |
|
|
float max = 0; |
|
|
|
|
|
for (int j = 0; j < QK_K; j++) { |
|
|
srcv[row_iter][j] = x[row_iter * k + i * QK_K + j]; |
|
|
|
|
|
if(amax < fabsf(srcv[row_iter][j])) { |
|
|
amax = fabsf(srcv[row_iter][j]); |
|
|
max = srcv[row_iter][j]; |
|
|
} |
|
|
} |
|
|
|
|
|
iscale[row_iter] = amax ? -127.f/max : 0; |
|
|
|
|
|
y[i].d[row_iter] = amax ? 1/iscale[row_iter] : 0; |
|
|
} |
|
|
|
|
|
for (int j = 0; j < QK_K / 4; j++) { |
|
|
y[i].bsums[j] = 0; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
for (int j = 0; j < QK_K * 4; j++) { |
|
|
int src_offset = (j / (4 * blck_size_interleave)) * blck_size_interleave; |
|
|
int src_id = (j % (4 * blck_size_interleave)) / blck_size_interleave; |
|
|
src_offset += (j % blck_size_interleave); |
|
|
int index = (((j & 31) >> 3) << 2) + ((j >> 8) << 4) + ((j >> 6) & 3); |
|
|
|
|
|
float x0 = srcv[src_id][src_offset] * iscale[src_id]; |
|
|
y[i].qs[j] = nearest_int(x0); |
|
|
y[i].bsums[index] += y[i].qs[j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
template <int64_t INTER_SIZE, ggml_type PARAM_TYPE> |
|
|
void ggml_quantize_mat_t(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row); |
|
|
|
|
|
// Q8_0 with interleave size 4: forwards to ggml_quantize_mat_q8_0_4x4.
// Exactly 4 rows are expected (asserted).
template <> void ggml_quantize_mat_t<4, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    UNUSED(nrow); // nrow is otherwise referenced only by the assert below
    assert(nrow == 4);
    ggml_quantize_mat_q8_0_4x4(x, vy, n_per_row);
}
|
|
|
|
|
// Q8_0 with interleave size 8: forwards to ggml_quantize_mat_q8_0_4x8.
// Exactly 4 rows are expected (asserted).
template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_0>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    UNUSED(nrow); // nrow is otherwise referenced only by the assert below
    assert(nrow == 4);
    ggml_quantize_mat_q8_0_4x8(x, vy, n_per_row);
}
|
|
|
|
|
// Q8_K with interleave size 8: forwards to ggml_quantize_mat_q8_K_4x8.
// Exactly 4 rows are expected (asserted).
template <> void ggml_quantize_mat_t<8, GGML_TYPE_Q8_K>(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t nrow, int64_t n_per_row) {
    UNUSED(nrow); // nrow is otherwise referenced only by the assert below
    assert(nrow == 4);
    ggml_quantize_mat_q8_K_4x8(x, vy, n_per_row);
}
|
|
|
|
|
extern "C" { |
|
|
|
|
|
void ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 4; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[4]; |
|
|
int sumi; |
|
|
|
|
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
|
|
|
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
|
|
} |
|
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 8; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[4]; |
|
|
int sumi; |
|
|
|
|
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
|
|
|
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
|
|
} |
|
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 8; |
|
|
const int blocklen = 8; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
{ |
|
|
float sumf[8]; |
|
|
int sumi; |
|
|
|
|
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
|
|
|
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])) >> 4; |
|
|
} |
|
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK_K; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 8; |
|
|
const int blocklen = 8; |
|
|
static const uint32_t kmask1 = 0x3f3f3f3f; |
|
|
static const uint32_t kmask2 = 0x0f0f0f0f; |
|
|
static const uint32_t kmask3 = 0x03030303; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[8]; |
|
|
float sum_minf[8]; |
|
|
uint32_t utmp[32]; |
|
|
int sumi1; |
|
|
int sumi2; |
|
|
int sumi; |
|
|
|
|
|
const block_q8_K * a_ptr = (const block_q8_K *) vy; |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
|
|
|
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumf[j] = 0.0; |
|
|
sum_minf[j] = 0.0; |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int sb = 0; sb < 8; sb++) { |
|
|
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
|
|
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
|
|
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
|
|
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
|
|
utmp[sb * 4 + 2] = uaux_0; |
|
|
utmp[sb * 4 + 0] &= kmask1; |
|
|
} |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
|
|
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi1 = 0; |
|
|
sumi2 = 0; |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
|
|
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i]); |
|
|
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 64 + (k % 4) * blocklen + i + 32]); |
|
|
sumi1 = sumi1 * scales_0[j]; |
|
|
sumi2 = sumi2 * scales_1[j]; |
|
|
sumi += sumi1 + sumi2; |
|
|
} |
|
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d; |
|
|
} |
|
|
} |
|
|
for (int sb = 0; sb < 8; sb++) { |
|
|
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sum_minf[j] += mins[j] * (a_ptr[l].bsums[sb * 2] + a_ptr[l].bsums[sb * 2 + 1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d; |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
s[x * ncols_interleaved + j] = sumf[j] - sum_minf[j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 4; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
{ |
|
|
float sumf[4]; |
|
|
int sumi; |
|
|
|
|
|
const block_q8_0 * a_ptr = (const block_q8_0 *) vy; |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
|
|
|
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[j] = 0.0; |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
|
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
|
|
sumi += ((v0 * a_ptr[l].qs[k * blocklen + i]) + (v1 * a_ptr[l].qs[k * blocklen + i + qk / 2])); |
|
|
} |
|
|
sumf[j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d); |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int j = 0; j < ncols_interleaved; j++) s[x * ncols_interleaved + j] = sumf[j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 4; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nr % 4 == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
{ |
|
|
float sumf[4][4]; |
|
|
int sumi; |
|
|
|
|
|
for (int y = 0; y < nr / 4; y++) { |
|
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
|
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
|
|
} |
|
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) |
|
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 8; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nr % 4 == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[4][4]; |
|
|
int sumi; |
|
|
|
|
|
for (int y = 0; y < nr / 4; y++) { |
|
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx + (x * nb); |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
|
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
|
|
} |
|
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) |
|
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 8; |
|
|
const int blocklen = 8; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nr % 4 == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[4][8]; |
|
|
int sumi; |
|
|
|
|
|
for (int y = 0; y < nr / 4; y++) { |
|
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_0x8 * b_ptr = (const block_q4_0x8 *) vx + (x * nb); |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] << 4); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF0); |
|
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
|
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])) >> 4; |
|
|
} |
|
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) |
|
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK_K; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 8; |
|
|
const int blocklen = 8; |
|
|
static const uint32_t kmask1 = 0x3f3f3f3f; |
|
|
static const uint32_t kmask2 = 0x0f0f0f0f; |
|
|
static const uint32_t kmask3 = 0x03030303; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nr % 4 == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
float sumf[4][8]; |
|
|
float sum_minf[4][8]; |
|
|
uint32_t utmp[32]; |
|
|
int sumi1; |
|
|
int sumi2; |
|
|
int sumi; |
|
|
|
|
|
for (int y = 0; y < nr / 4; y++) { |
|
|
const block_q8_Kx4 * a_ptr = (const block_q8_Kx4 *) vy + (y * nb); |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_q4_Kx8 * b_ptr = (const block_q4_Kx8 *) vx + (x * nb); |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumf[m][j] = 0.0; |
|
|
sum_minf[m][j] = 0.0; |
|
|
} |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int sb = 0; sb < 8; sb++) { |
|
|
memcpy(utmp + sb * 4, b_ptr[l].scales + sb * 12, 12); |
|
|
utmp[sb * 4 + 3] = ((utmp[sb * 4 + 2] >> 4) & kmask2) | (((utmp[sb * 4 + 1] >> 6) & kmask3) << 4); |
|
|
const uint32_t uaux_0 = utmp[sb * 4 + 1] & kmask1; |
|
|
utmp[sb * 4 + 1] = (utmp[sb * 4 + 2] & kmask2) | (((utmp[sb * 4 + 0] >> 6) & kmask3) << 4); |
|
|
utmp[sb * 4 + 2] = uaux_0; |
|
|
utmp[sb * 4 + 0] &= kmask1; |
|
|
} |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
uint8_t *scales_0 = (uint8_t*) utmp + (k / 4) * 32; |
|
|
uint8_t *scales_1 = (uint8_t*) utmp + (k / 4) * 32 + 16; |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi1 = 0; |
|
|
sumi2 = 0; |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0xF); |
|
|
const int v1 = (int8_t) (b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4); |
|
|
sumi1 = (v0 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i]); |
|
|
sumi2 = (v1 * a_ptr[l].qs[(k >> 2) * 256 + (k % 4) * 4 * blocklen + m * blocklen + i + 128]); |
|
|
sumi1 = sumi1 * scales_0[j]; |
|
|
sumi2 = sumi2 * scales_1[j]; |
|
|
sumi += sumi1 + sumi2; |
|
|
} |
|
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * a_ptr[l].d[m]; |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int sb = 0; sb < 8; sb++) { |
|
|
uint8_t *mins = (uint8_t*) utmp + 8 + sb * 16; |
|
|
for(int m = 0; m < 4; m++) { |
|
|
const int16_t *bsums = a_ptr[l].bsums + (sb * 8) + (m * 4) - ((sb % 2) * 6); |
|
|
for(int j = 0; j < ncols_interleaved; j++) { |
|
|
sum_minf[m][j] += mins[j] * (bsums[0] + bsums[1]) * GGML_CPU_FP16_TO_FP32(b_ptr[l].dmin[j]) * a_ptr[l].d[m]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j] - sum_minf[m][j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
void ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc) { |
|
|
const int qk = QK8_0; |
|
|
const int nb = n / qk; |
|
|
const int ncols_interleaved = 4; |
|
|
const int blocklen = 4; |
|
|
|
|
|
assert (n % qk == 0); |
|
|
assert (nr % 4 == 0); |
|
|
assert (nc % ncols_interleaved == 0); |
|
|
|
|
|
UNUSED(s); |
|
|
UNUSED(bs); |
|
|
UNUSED(vx); |
|
|
UNUSED(vy); |
|
|
UNUSED(nr); |
|
|
UNUSED(nc); |
|
|
UNUSED(nb); |
|
|
UNUSED(ncols_interleaved); |
|
|
UNUSED(blocklen); |
|
|
|
|
|
{ |
|
|
float sumf[4][4]; |
|
|
int sumi; |
|
|
|
|
|
for (int y = 0; y < nr / 4; y++) { |
|
|
const block_q8_0x4 * a_ptr = (const block_q8_0x4 *) vy + (y * nb); |
|
|
for (int x = 0; x < nc / ncols_interleaved; x++) { |
|
|
const block_iq4_nlx4 * b_ptr = (const block_iq4_nlx4 *) vx + (x * nb); |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) sumf[m][j] = 0.0; |
|
|
} |
|
|
for (int l = 0; l < nb; l++) { |
|
|
for (int k = 0; k < (qk / (2 * blocklen)); k++) { |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) { |
|
|
sumi = 0; |
|
|
for (int i = 0; i < blocklen; ++i) { |
|
|
const int v0 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] & 0x0F]; |
|
|
const int v1 = kvalues_iq4nl[b_ptr[l].qs[k * ncols_interleaved * blocklen + j * blocklen + i] >> 4]; |
|
|
sumi += ((v0 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i]) + |
|
|
(v1 * a_ptr[l].qs[k * 4 * blocklen + m * blocklen + i + qk / 2 * 4])); |
|
|
} |
|
|
sumf[m][j] += sumi * GGML_CPU_FP16_TO_FP32(b_ptr[l].d[j]) * GGML_CPU_FP16_TO_FP32(a_ptr[l].d[m]); |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
for (int m = 0; m < 4; m++) { |
|
|
for (int j = 0; j < ncols_interleaved; j++) |
|
|
s[(y * 4 + m) * bs + x * ncols_interleaved + j] = sumf[m][j]; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
// Builds one block_q4_0x4 from 4 consecutive block_q4_0 inputs.
//
// The 4 scales are copied as-is. The quant bytes are interleaved in runs of
// blck_size_interleave bytes (4 or 8), cycling through the 4 inputs, and every
// byte is XORed with 0x88, which flips the top bit of each 4-bit nibble.
// Any other interleave size trips GGML_ASSERT(false).
static block_q4_0x4 make_block_q4_0x4(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x4 out;

    for (int r = 0; r < 4; r++) {
        out.d[r] = in[r].d;
    }

    const int end = QK4_0 * 2 / blck_size_interleave; // number of runs to copy

    switch (blck_size_interleave) {
        case 8: {
            const uint64_t xor_mask = 0x8888888888888888ULL;
            for (int i = 0; i < end; ++i) {
                const int src_id     = i % 4;
                const int src_offset = (i / 4) * blck_size_interleave;
                const int dst_offset = i * blck_size_interleave;

                // memcpy keeps the 8-byte accesses alignment-safe.
                uint64_t elems;
                memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
                elems ^= xor_mask;
                memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
            }
        } break;
        case 4: {
            const uint32_t xor_mask = 0x88888888;
            for (int i = 0; i < end; ++i) {
                const int src_id     = i % 4;
                const int src_offset = (i / 4) * blck_size_interleave;
                const int dst_offset = i * blck_size_interleave;

                uint32_t elems;
                memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint32_t));
                elems ^= xor_mask;
                memcpy(&out.qs[dst_offset], &elems, sizeof(uint32_t));
            }
        } break;
        default:
            GGML_ASSERT(false);
    }

    return out;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Builds one block_q4_0x8 from 8 consecutive block_q4_0 inputs.
//
// The 8 scales are copied as-is. The quant bytes are copied 8 bytes at a time,
// cycling through the 8 inputs with a stride of blck_size_interleave, and every
// byte is XORed with 0x88, which flips the top bit of each 4-bit nibble.
static block_q4_0x8 make_block_q4_0x8(block_q4_0 * in, unsigned int blck_size_interleave) {
    block_q4_0x8 out;

    for (int r = 0; r < 8; r++) {
        out.d[r] = in[r].d;
    }

    const uint64_t xor_mask = 0x8888888888888888ULL;
    const int end = QK4_0 * 4 / blck_size_interleave; // number of runs to copy

    for (int i = 0; i < end; ++i) {
        const int src_id     = i % 8;
        const int src_offset = (i / 8) * blck_size_interleave;
        const int dst_offset = i * blck_size_interleave;

        // memcpy keeps the 8-byte accesses alignment-safe.
        uint64_t elems;
        memcpy(&elems, &in[src_id].qs[src_offset], sizeof(uint64_t));
        elems ^= xor_mask;
        memcpy(&out.qs[dst_offset], &elems, sizeof(uint64_t));
    }

    return out;
}
|
|
|
|
|
// Builds one block_q4_Kx8 from 8 consecutive block_q4_K inputs.
//
//  - d and dmin of each input are copied as-is.
//  - quant bytes are copied 8 bytes at a time, cycling through the 8 inputs
//    with a stride of blck_size_interleave.
//  - scales are repacked into 8 groups of 12 bytes. For each group, one 6-bit
//    value s[j] and one 6-bit value m[j] is extracted per input j and the 16
//    values are packed as:
//        bytes 0-3  : s[0..3] in the low 6 bits, bits 4-5 of s[4..7] on top
//        bytes 4-7  : m[0..3] in the low 6 bits, bits 4-5 of m[4..7] on top
//        bytes 8-11 : low nibble of s[4..7] | low nibble of m[4..7] << 4
static block_q4_Kx8 make_block_q4_Kx8(block_q4_K * in, unsigned int blck_size_interleave) {
    block_q4_Kx8 out;

    for (int r = 0; r < 8; r++) {
        out.d[r]    = in[r].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.d;
        out.dmin[r] = in[r].GGML_COMMON_AGGR_U.GGML_COMMON_AGGR_S.dmin;
    }

    const int end = QK_K * 4 / blck_size_interleave; // number of runs to copy

    for (int i = 0; i < end; ++i) {
        const int src_id     = i % 8;
        const int src_offset = (i / 8) * blck_size_interleave;
        const int dst_offset = i * blck_size_interleave;

        memcpy(&out.qs[dst_offset], &in[src_id].qs[src_offset], sizeof(uint64_t));
    }

    uint8_t s[8], m[8];

    // Groups 0-3: values stored directly in the low 6 bits of
    // scales[0..3] (s) and scales[4..7] (m).
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 8; j++) {
            s[j] = in[j].scales[i] & 63;
            m[j] = in[j].scales[i + 4] & 63;
        }

        const int base = i * 12;
        for (int n = 0; n < 4; n++) {
            out.scales[base + n]     = (s[n] & 63) + ((s[n + 4] & 48) << 2);
            out.scales[base + 4 + n] = (m[n] & 63) + ((m[n + 4] & 48) << 2);
            out.scales[base + 8 + n] = (s[n + 4] & 15) + ((m[n + 4] & 15) << 4);
        }
    }

    // Groups 4-7: values reassembled from the top 2 bits of
    // scales[0..7] and the nibbles of scales[8..11].
    for (int i = 0; i < 4; i++) {
        for (int j = 0; j < 8; j++) {
            s[j] = ((in[j].scales[i] & 192) >> 2) | (in[j].scales[i+8] & 15);
            m[j] = ((in[j].scales[i + 4] & 192) >> 2) | ((in[j].scales[i+8] & 240) >> 4);
        }

        const int base = i * 12 + 48;
        for (int n = 0; n < 4; n++) {
            out.scales[base + n]     = (s[n] & 63) + ((s[n + 4] & 48) << 2);
            out.scales[base + 4 + n] = (m[n] & 63) + ((m[n + 4] & 48) << 2);
            out.scales[base + 8 + n] = (s[n + 4] & 15) + ((m[n + 4] & 15) << 4);
        }
    }

    return out;
}
|
|
|
|
|
static int repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
|
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
|
|
GGML_ASSERT(interleave_block == 4 || interleave_block == 8); |
|
|
constexpr int nrows_interleaved = 4; |
|
|
|
|
|
block_q4_0x4 * dst = (block_q4_0x4 *)t->data; |
|
|
const block_q4_0 * src = (const block_q4_0 *)data; |
|
|
block_q4_0 dst_tmp[4]; |
|
|
int nrow = ggml_nrows(t); |
|
|
int nblocks = t->ne[0] / QK4_0; |
|
|
|
|
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
|
|
|
|
|
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
|
|
return -1; |
|
|
} |
|
|
|
|
|
for (int b = 0; b < nrow; b += nrows_interleaved) { |
|
|
for (int64_t x = 0; x < nblocks; x++) { |
|
|
for (int i = 0; i < nrows_interleaved; i++) { |
|
|
dst_tmp[i] = src[x + i * nblocks]; |
|
|
} |
|
|
*dst++ = make_block_q4_0x4(dst_tmp, interleave_block); |
|
|
} |
|
|
src += nrows_interleaved * nblocks; |
|
|
} |
|
|
return 0; |
|
|
|
|
|
GGML_UNUSED(data_size); |
|
|
} |
|
|
static int repack_q4_K_to_q4_K_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
|
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_K); |
|
|
GGML_ASSERT(interleave_block == 8); |
|
|
constexpr int nrows_interleaved = 8; |
|
|
|
|
|
block_q4_Kx8 * dst = (block_q4_Kx8*)t->data; |
|
|
const block_q4_K * src = (const block_q4_K*) data; |
|
|
block_q4_K dst_tmp[8]; |
|
|
int nrow = ggml_nrows(t); |
|
|
int nblocks = t->ne[0] / QK_K; |
|
|
|
|
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_K)); |
|
|
|
|
|
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
|
|
return -1; |
|
|
} |
|
|
|
|
|
for (int b = 0; b < nrow; b += nrows_interleaved) { |
|
|
for (int64_t x = 0; x < nblocks; x++) { |
|
|
for (int i = 0; i < nrows_interleaved; i++ ) { |
|
|
dst_tmp[i] = src[x + i * nblocks]; |
|
|
} |
|
|
*dst++ = make_block_q4_Kx8(dst_tmp, interleave_block); |
|
|
} |
|
|
src += nrows_interleaved * nblocks; |
|
|
} |
|
|
return 0; |
|
|
|
|
|
GGML_UNUSED(data_size); |
|
|
} |
|
|
|
|
|
static int repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
|
|
GGML_ASSERT(t->type == GGML_TYPE_Q4_0); |
|
|
GGML_ASSERT(interleave_block == 8); |
|
|
constexpr int nrows_interleaved = 8; |
|
|
|
|
|
block_q4_0x8 * dst = (block_q4_0x8*)t->data; |
|
|
const block_q4_0 * src = (const block_q4_0*) data; |
|
|
block_q4_0 dst_tmp[8]; |
|
|
int nrow = ggml_nrows(t); |
|
|
int nblocks = t->ne[0] / QK4_0; |
|
|
|
|
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0)); |
|
|
|
|
|
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
|
|
return -1; |
|
|
} |
|
|
|
|
|
for (int b = 0; b < nrow; b += nrows_interleaved) { |
|
|
for (int64_t x = 0; x < nblocks; x++) { |
|
|
for (int i = 0; i < nrows_interleaved; i++ ) { |
|
|
dst_tmp[i] = src[x + i * nblocks]; |
|
|
} |
|
|
*dst++ = make_block_q4_0x8(dst_tmp, interleave_block); |
|
|
} |
|
|
src += nrows_interleaved * nblocks; |
|
|
} |
|
|
return 0; |
|
|
|
|
|
GGML_UNUSED(data_size); |
|
|
} |
|
|
|
|
|
// Merges 4 IQ4_NL blocks (one per row) into a single block_iq4_nlx4.
//
//   in                   - array of 4 source blocks
//   blck_size_interleave - number of bytes taken from one row before switching
//                          to the next; only 4 is supported, any other value aborts
//
// The 4 per-row `d` values are copied in row order. The quant bytes are
// interleaved round-robin: 4 bytes from row 0, 4 from row 1, ..., 4 from row 3,
// then the next 4 bytes of row 0, and so on.
static block_iq4_nlx4 make_block_iq4_nlx4(block_iq4_nl * in, unsigned int blck_size_interleave) {
    block_iq4_nlx4 out;

    for (int row = 0; row < 4; row++) {
        out.d[row] = in[row].d;
    }

    // Number of interleave-sized chunks to copy in total.
    const int end = QK4_NL * 2 / blck_size_interleave;

    if (blck_size_interleave != 4) {
        GGML_ASSERT(false);
    }

    for (int chunk = 0; chunk < end; ++chunk) {
        const int row   = chunk % 4;  // source block this chunk comes from
        const int round = chunk / 4;  // how many chunks were already taken from that block

        const int src_offset = round * blck_size_interleave;
        const int dst_offset = chunk * blck_size_interleave;

        memcpy(out.qs + dst_offset, in[row].qs + src_offset, sizeof(uint32_t));
    }

    return out;
}
|
|
|
|
|
static int repack_iq4_nl_to_iq4_nl_4_bl(struct ggml_tensor * t, int interleave_block, const void * GGML_RESTRICT data, size_t data_size) { |
|
|
GGML_ASSERT(t->type == GGML_TYPE_IQ4_NL); |
|
|
|
|
|
GGML_ASSERT(interleave_block == 4); |
|
|
|
|
|
block_iq4_nlx4 * dst = (block_iq4_nlx4 *)t->data; |
|
|
const block_iq4_nl * src = (const block_iq4_nl *)data; |
|
|
block_iq4_nl dst_tmp[4]; |
|
|
int nrow = ggml_nrows(t); |
|
|
int nrows_interleaved = 4; |
|
|
int nblocks = t->ne[0] / QK4_0; |
|
|
|
|
|
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_iq4_nl)); |
|
|
|
|
|
if (t->ne[1] % nrows_interleaved != 0 || t->ne[0] % 8 != 0) { |
|
|
return -1; |
|
|
} |
|
|
|
|
|
for (int b = 0; b < nrow; b += nrows_interleaved) { |
|
|
for (int64_t x = 0; x < nblocks; x++) { |
|
|
for (int i = 0; i < nrows_interleaved; i++) { |
|
|
dst_tmp[i] = src[x + i * nblocks]; |
|
|
} |
|
|
*dst++ = make_block_iq4_nlx4(dst_tmp, interleave_block); |
|
|
} |
|
|
src += nrows_interleaved * nblocks; |
|
|
} |
|
|
return 0; |
|
|
|
|
|
GGML_UNUSED(data_size); |
|
|
} |
|
|
|
|
|
namespace ggml::cpu::repack { |
|
|
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> |
|
|
int repack(struct ggml_tensor *, const void *, size_t); |
|
|
|
|
|
|
|
|
template <> int repack<block_q4_0, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
|
|
return repack_q4_0_to_q4_0_4_bl(t, 4, data, data_size); |
|
|
} |
|
|
|
|
|
template <> int repack<block_q4_0, 8, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
|
|
return repack_q4_0_to_q4_0_4_bl(t, 8, data, data_size); |
|
|
} |
|
|
|
|
|
template <> int repack<block_q4_0, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
|
|
return repack_q4_0_to_q4_0_8_bl(t, 8, data, data_size); |
|
|
} |
|
|
|
|
|
template <> int repack<block_q4_K, 8, 8>(struct ggml_tensor * t, const void * data, size_t data_size) { |
|
|
return repack_q4_K_to_q4_K_8_bl(t, 8, data, data_size); |
|
|
} |
|
|
|
|
|
template <> int repack<block_iq4_nl, 4, 4>(struct ggml_tensor * t, const void * data, size_t data_size) { |
|
|
return repack_iq4_nl_to_iq4_nl_4_bl(t, 4, data, data_size); |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
|
|
void gemv(int, float *, size_t, const void *, const void *, int, int); |
|
|
|
|
|
template <> void gemv<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemv_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemv<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemv_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemv<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemv_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemv<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemv_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemv<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemv_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
|
|
|
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> |
|
|
void gemm(int, float *, size_t, const void *, const void *, int, int); |
|
|
|
|
|
template <> void gemm<block_q4_0, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemm_q4_0_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemm<block_q4_0, 8, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemm_q4_0_4x8_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemm<block_q4_0, 8, 8, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemm_q4_0_8x8_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemm<block_q4_K, 8, 8, GGML_TYPE_Q8_K>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemm_q4_K_8x8_q8_K(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
template <> void gemm<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0>(int n, float * s, size_t bs, const void * vx, const void * vy, int nr, int nc) { |
|
|
ggml_gemm_iq4_nl_4x4_q8_0(n, s, bs, vx, vy, nr, nc); |
|
|
} |
|
|
|
|
|
// Common base of all repack traits: extends ggml::cpu::tensor_traits with the
// ability to repack a tensor's data.
class tensor_traits_base : public ggml::cpu::tensor_traits {
  public:
    // Repacks `data` (`data_size` bytes) into tensor `t`.
    // The implementations in this file return 0 on success and -1 when the
    // tensor shape is not supported.
    virtual int repack(struct ggml_tensor * t, const void * data, size_t data_size) = 0;
};
|
|
|
|
|
// Traits implementation for one repacked weight layout.
//
//   BLOC_TYPE  - source block type; together with INTER_SIZE / NB_COLS it selects
//                the repack / gemv / gemm specialisations used below
//   INTER_SIZE - interleave size of the repacked layout
//   NB_COLS    - granularity to which each thread's src0 row range is rounded up
//   PARAM_TYPE - ggml type src1 is quantized to before the multiplication
template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PARAM_TYPE> class tensor_traits : public tensor_traits_base {

    // Rounds `v` up to the next multiple of NB_COLS; multiples are returned as is.
    static int64_t align_to_nb_cols(int64_t v) {
        const int64_t rem = v % NB_COLS;
        return rem ? v + NB_COLS - rem : v;
    }

    // Computes the scratch-buffer size required by `op`.
    // Returns true and stores the size for MUL_MAT / MUL_MAT_ID; returns false
    // and leaves `size` untouched for every other op.
    bool work_size(int , const struct ggml_tensor * op, size_t & size) override {
        const bool is_mul_mat    = op->op == GGML_OP_MUL_MAT;
        const bool is_mul_mat_id = op->op == GGML_OP_MUL_MAT_ID;

        if (!is_mul_mat && !is_mul_mat_id) {
            return false;
        }

        // Room for src1 converted to PARAM_TYPE.
        size_t bytes = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));

        if (is_mul_mat_id) {
            bytes = GGML_PAD(bytes, sizeof(int64_t));

            const int64_t ne02 = op->src[0]->ne[2];
            const int64_t ne12 = op->src[1]->ne[2];

            const size_t sizeof_mmid_row_mapping = sizeof(int64_t);

            // Extra room for the per-matrix row counts and row mappings.
            bytes += sizeof_mmid_row_mapping*ne02*(ne12 + 1);
        }

        size = bytes;
        return true;
    }

    // Runs `op` if it is a MUL_MAT or MUL_MAT_ID and reports whether it was handled.
    bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override {
        if (op->op == GGML_OP_MUL_MAT) {
            forward_mul_mat(params, op);
            return true;
        }
        if (op->op == GGML_OP_MUL_MAT_ID) {
            forward_mul_mat_id(params, op);
            return true;
        }
        return false;
    }

    // MUL_MAT with repacked src0: every thread quantizes a share of src1 into the
    // work buffer, then multiplies its own slice of src0 rows.
    void forward_mul_mat(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        GGML_ASSERT(ne0 == ne01);
        GGML_ASSERT(ne1 == ne11);
        GGML_ASSERT(ne2 == ne12);
        GGML_ASSERT(ne3 == ne13);

        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        GGML_ASSERT(ggml_n_dims(op->src[0]) == 2);

        char * const wdata = static_cast<char *>(params->wdata);
        // Byte size of one src1 row once converted to PARAM_TYPE.
        const size_t nbw1  = ggml_row_size(PARAM_TYPE, ne10);

        assert(params->wsize >= nbw1 * ne11);

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        // src1 rows [0, n_grouped) are quantized 4 at a time, the remaining
        // (at most 3) rows one at a time.
        const int64_t n_grouped = ne11 - ne11 % 4;

        for (int64_t i11 = ith * 4; i11 < n_grouped; i11 += nth * 4) {
            const char * src_rows = (const char *) src1->data + i11 * nb11;
            ggml_quantize_mat_t<INTER_SIZE, PARAM_TYPE>((float *) src_rows, (void *) (wdata + i11 * nbw1), 4, ne10);
        }

        for (int64_t i11 = n_grouped + ith; i11 < ne11; i11 += nth) {
            const char * src_row = (const char *) src1->data + i11 * nb11;
            from_float((float *) src_row, (void *) (wdata + i11 * nbw1), ne10);
        }

        // The whole of src1 must be quantized before any thread reads it.
        ggml_barrier(params->threadpool);

        // Slice of src0 rows handled by this thread, with both ends rounded up
        // to a multiple of NB_COLS.
        const int64_t src0_start = align_to_nb_cols((ith * ne01) / nth);
        const int64_t src0_end   = align_to_nb_cols(((ith + 1) * ne01) / nth);
        if (src0_start >= src0_end) {
            return;
        }

        const char * src0_rows = (const char *) src0->data + src0_start * nb01;

        // Grouped src1 rows go through gemm ...
        if (ne11 > 3) {
            gemm<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    static_cast<float *>(dst->data) + src0_start, ne01,
                    src0_rows,
                    wdata, n_grouped, src0_end - src0_start);
        }
        // ... and the leftover src1 rows go through gemv, one row per call.
        for (int iter = static_cast<int>(n_grouped); iter < ne11; iter++) {
            float * dst_row = (float *) ((char *) dst->data + (iter * nb1));
            gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                    dst_row + src0_start, ne01,
                    src0_rows,
                    wdata + (nbw1 * iter), 1,
                    src0_end - src0_start);
        }
    }

    // MUL_MAT_ID with repacked src0: quantizes src1, groups the requested rows by
    // the src0 matrix selected in `ids`, then runs one gemv per (matrix, row) pair.
    void forward_mul_mat_id(ggml_compute_params * params, ggml_tensor * op) {
        const ggml_tensor * src0 = op->src[0];
        const ggml_tensor * src1 = op->src[1];
        const ggml_tensor * ids  = op->src[2];
        ggml_tensor *       dst  = op;

        GGML_TENSOR_BINARY_OP_LOCALS

        const int ith = params->ith;
        const int nth = params->nth;

        const ggml_from_float_t from_float = ggml_get_type_traits_cpu(PARAM_TYPE)->from_float;

        GGML_ASSERT(nb00 == ggml_type_size(src0->type));
        GGML_ASSERT(nb10 == ggml_type_size(src1->type));

        GGML_ASSERT(nb0 == sizeof(float));
        GGML_ASSERT(nb0 <= nb1);
        GGML_ASSERT(nb1 <= nb2);
        GGML_ASSERT(nb2 <= nb3);

        GGML_ASSERT(ne03 == 1);
        GGML_ASSERT(ne13 == 1);
        GGML_ASSERT(ne3 == 1);

        GGML_ASSERT(src1->type == GGML_TYPE_F32);

        const int n_ids = ids->ne[0];  // ids per row of `ids`
        const int n_as  = ne02;        // number of src0 matrices

        // Strides of the quantized copy of src1 in the work buffer.
        const size_t nbw1 = ggml_row_size(PARAM_TYPE, ne10);
        const size_t nbw2 = nbw1*ne11;
        const size_t nbw3 = nbw2*ne12;

        // One routed row: i1 is the position inside the ids row, i2 the ids row index.
        struct mmid_row_mapping {
            int32_t i1;
            int32_t i2;
        };

        GGML_ASSERT(params->wsize >=
                (GGML_PAD(nbw3, sizeof(int64_t)) +
                 n_as*(ne12 + 1)*sizeof(mmid_row_mapping))
                );

        // Work buffer layout: [quantized src1 | padding | row counts | row mappings].
        char * const wdata          = static_cast<char *>(params->wdata);
        char * const wdata_src1_end = wdata + GGML_PAD(nbw3, sizeof(int64_t));

        int64_t * const          matrix_row_counts = reinterpret_cast<int64_t *>(wdata_src1_end);
        mmid_row_mapping * const matrix_rows       = reinterpret_cast<mmid_row_mapping *>(matrix_row_counts + n_as);

        // Slot `slot` in the mapping list of src0 matrix `matrix` (ne12 slots per matrix).
        const auto row_slot = [&](int64_t matrix, int64_t slot) -> mmid_row_mapping & {
            return matrix_rows[matrix * ne12 + slot];
        };

        // Quantize src1; rows of every plane are split between the threads.
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const char * src_plane = (const char *) src1->data + i12 * nb12;
            char *       dst_plane = wdata + i12 * nbw2;
            for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
                from_float((float *) (src_plane + i11 * nb11),
                           (void *) (dst_plane + i11 * nbw1),
                           ne10);
            }
        }

        // Thread 0 alone builds the per-matrix row lists.
        if (ith == 0) {
            memset(matrix_row_counts, 0, n_as * sizeof(int64_t));

            for (int32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) {
                for (int32_t id = 0; id < n_ids; ++id) {
                    const int32_t i02 =
                        *(const int32_t *) ((const char *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]);

                    GGML_ASSERT(i02 >= 0 && i02 < n_as);

                    row_slot(i02, matrix_row_counts[i02]) = { id, iid1 };
                    matrix_row_counts[i02] += 1;
                }
            }
        }

        ggml_barrier(params->threadpool);

        // The src0 row slice of this thread is the same for every matrix, so it is
        // computed once; both ends are rounded up to a multiple of NB_COLS.
        const int64_t src0_cur_start = align_to_nb_cols((ith * ne01) / nth);
        const int64_t src0_cur_end   = align_to_nb_cols(((ith + 1) * ne01) / nth);
        if (src0_cur_start >= src0_cur_end) {
            return;
        }

        for (int cur_a = 0; cur_a < n_as; ++cur_a) {
            const int64_t cne1 = matrix_row_counts[cur_a];
            if (cne1 == 0) {
                continue;  // nothing was routed to this matrix
            }

            const char * src0_cur = (const char *) src0->data + cur_a*nb02;

            for (int ir1 = 0; ir1 < cne1; ir1++) {
                const mmid_row_mapping row_mapping = row_slot(cur_a, ir1);

                const int id = row_mapping.i1;

                // Position in quantized src1 ...
                const int64_t i11 = id % ne11;
                const int64_t i12 = row_mapping.i2;

                // ... and position in dst.
                const int64_t i1 = id;
                const int64_t i2 = i12;

                const char * src1_col = wdata + (i11 * nbw1 + i12 * nbw2);
                float *      dst_row  = (float *) ((char *) dst->data + (i1 * nb1 + i2 * nb2));

                gemv<BLOC_TYPE, INTER_SIZE, NB_COLS, PARAM_TYPE>(ne00,
                        dst_row + src0_cur_start, ne01,
                        src0_cur + src0_cur_start * nb01,
                        src1_col, 1, src0_cur_end - src0_cur_start);
            }
        }
    }

    // Logs the chosen layout, then repacks `data` into `t` through the
    // specialisation selected by the template arguments.
    int repack(struct ggml_tensor * t, const void * data, size_t data_size) override {
        GGML_LOG_DEBUG("%s: repack tensor %s with %s_%dx%d\n", __func__, t->name, ggml_type_name(t->type),
                       static_cast<int>(NB_COLS), static_cast<int>(INTER_SIZE));
        return ggml::cpu::repack::repack<BLOC_TYPE, INTER_SIZE, NB_COLS>(t, data, data_size);
    }
};
|
|
|
|
|
} |
|
|
|
|
|
// Selects the repack traits to use for tensor `cur`, based on its type, the
// CPU features reported at run time and the divisibility of ne[1].
// Returns nullptr when no repacked layout applies.
static const ggml::cpu::tensor_traits * ggml_repack_get_optimal_repack_type(const struct ggml_tensor * cur) {
    // One shared instance per supported layout.
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 4, 4, GGML_TYPE_Q8_0> q4_0_4x4_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 4, GGML_TYPE_Q8_0> q4_0_4x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_0, 8, 8, GGML_TYPE_Q8_0> q4_0_8x8_q8_0;
    static const ggml::cpu::repack::tensor_traits<block_q4_K, 8, 8, GGML_TYPE_Q8_K> q4_K_8x8_q8_K;

    static const ggml::cpu::repack::tensor_traits<block_iq4_nl, 4, 4, GGML_TYPE_Q8_0> iq4_nl_4x4_q8_0;

    switch (cur->type) {
        case GGML_TYPE_Q4_0:
            // Candidates are tried in this order; the first match wins.
            if ((ggml_cpu_has_avx2() || (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) &&
                cur->ne[1] % 8 == 0) {
                return &q4_0_8x8_q8_0;
            }
            if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8() && cur->ne[1] % 4 == 0) {
                return &q4_0_4x8_q8_0;
            }
            if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod() && cur->ne[1] % 4 == 0) {
                return &q4_0_4x4_q8_0;
            }
            break;

        case GGML_TYPE_Q4_K:
            if (ggml_cpu_has_avx2() && cur->ne[1] % 8 == 0) {
                return &q4_K_8x8_q8_K;
            }
            break;

        case GGML_TYPE_IQ4_NL:
            if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod() && cur->ne[1] % 4 == 0) {
                return &iq4_nl_4x4_q8_0;
            }
            break;

        default:
            break;
    }

    return nullptr;
}
|
|
|
|
|
// init_tensor callback of the repack buffer: stores the traits chosen by
// ggml_repack_get_optimal_repack_type() in tensor->extra (nullptr when no
// repacked layout applies). Always returns GGML_STATUS_SUCCESS.
static enum ggml_status ggml_backend_cpu_repack_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    GGML_UNUSED(buffer);

    const ggml::cpu::tensor_traits * traits = ggml_repack_get_optimal_repack_type(tensor);
    tensor->extra = const_cast<ggml::cpu::tensor_traits *>(traits);

    return GGML_STATUS_SUCCESS;
}
|
|
|
|
|
// set_tensor callback of the repack buffer: repacks `data` into `tensor` using
// the traits stored in tensor->extra.
//
// Only whole-tensor uploads are accepted: `offset` must be 0 and `size` must
// equal ggml_nbytes(tensor). Aborts if the tensor has no repack traits or if
// repacking reports a failure.
static void ggml_backend_cpu_repack_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                       const void * data, size_t offset, size_t size) {
    GGML_ASSERT(offset == 0);
    GGML_ASSERT(size == ggml_nbytes(tensor));

    auto tensor_traits = (ggml::cpu::repack::tensor_traits_base *) tensor->extra;
    // ggml_backend_cpu_repack_buffer_init_tensor() stores nullptr when no
    // repacked layout is available; fail loudly instead of calling through it.
    GGML_ASSERT(tensor_traits != nullptr);

    auto OK = tensor_traits->repack(tensor, data, size);

    GGML_ASSERT(OK == 0);
    GGML_UNUSED(buffer);
}
|
|
|
|
|
// get_name callback: the repack buffer type is always called "CPU_REPACK".
static const char * ggml_backend_cpu_repack_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    static const char * const name = "CPU_REPACK";
    return name;
}
|
|
|
|
|
// alloc_buffer callback: allocates a buffer of the plain CPU buffer type, then
// re-labels it as belonging to `buft` and installs the repack-specific tensor
// callbacks. get_tensor and cpy_tensor are cleared.
// Returns nullptr when the underlying allocation fails.
static ggml_backend_buffer_t ggml_backend_cpu_repack_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);

    if (buffer != nullptr) {
        buffer->buft = buft;

        auto & iface = buffer->iface;
        iface.init_tensor = ggml_backend_cpu_repack_buffer_init_tensor;
        iface.set_tensor  = ggml_backend_cpu_repack_buffer_set_tensor;
        iface.get_tensor  = nullptr;
        iface.cpy_tensor  = nullptr;
    }

    return buffer;
}
|
|
|
|
|
// get_alignment callback: repack buffers use TENSOR_ALIGNMENT.
static size_t ggml_backend_cpu_repack_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const size_t alignment = TENSOR_ALIGNMENT;
    return alignment;
}
|
|
|
|
|
namespace ggml::cpu::repack { |
|
|
class extra_buffer_type : ggml::cpu::extra_buffer_type { |
|
|
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { |
|
|
if ( op->op == GGML_OP_MUL_MAT && |
|
|
op->src[0]->buffer && |
|
|
(ggml_n_dims(op->src[0]) == 2) && |
|
|
op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() && |
|
|
ggml_repack_get_optimal_repack_type(op->src[0]) |
|
|
) { |
|
|
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
|
|
return false; |
|
|
} |
|
|
if (op->src[1]->type == GGML_TYPE_F32) { |
|
|
return true; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} else if (op->op == GGML_OP_MUL_MAT_ID |
|
|
&& op->src[0]->buffer |
|
|
&& (ggml_n_dims(op->src[0]) == 3) |
|
|
&& op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type() |
|
|
&& ggml_repack_get_optimal_repack_type(op->src[0]) |
|
|
) { |
|
|
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
|
|
return false; |
|
|
} |
|
|
if (op->src[1]->type == GGML_TYPE_F32) { |
|
|
return true; |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
} |
|
|
return false; |
|
|
} |
|
|
|
|
|
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { |
|
|
if (op->op == GGML_OP_MUL_MAT || op->op == GGML_OP_MUL_MAT_ID) { |
|
|
if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_repack_buffer_type()) { |
|
|
return (ggml::cpu::tensor_traits *) op->src[0]->extra; |
|
|
} |
|
|
} |
|
|
return nullptr; |
|
|
} |
|
|
}; |
|
|
} |
|
|
|
|
|
// Returns the repack buffer type singleton.
//
// The descriptor is a function-local static, so it is built on the first call
// and the same address is returned afterwards; the extra_buffer_type object it
// refers to is allocated once and never released.
ggml_backend_buffer_type_t ggml_backend_cpu_repack_buffer_type(void) {
    static struct ggml_backend_buffer_type repack_buft = {
        // Callback table: name, allocation and alignment are provided, the
        // three remaining entries are left unset.
        {
            ggml_backend_cpu_repack_buffer_type_get_name,
            ggml_backend_cpu_repack_buffer_type_alloc_buffer,
            ggml_backend_cpu_repack_buffer_type_get_alignment,
            nullptr,
            nullptr,
            nullptr,
        },
        // Device 0 of the CPU backend registry.
        ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        // Hooks used to query op support and tensor traits.
        new ggml::cpu::repack::extra_buffer_type(),
    };

    return &repack_buft;
}
|
|
|