|
|
#include "amx.h" |
|
|
#include "common.h" |
|
|
#include "mmq.h" |
|
|
#include "ggml-backend-impl.h" |
|
|
#include "ggml-backend.h" |
|
|
#include "ggml-impl.h" |
|
|
#include "ggml-cpu.h" |
|
|
#include "traits.h" |
|
|
|
|
|
#if defined(__gnu_linux__) |
|
|
#include <sys/syscall.h> |
|
|
#include <unistd.h> |
|
|
#endif |
|
|
|
|
|
#include <cstdlib> |
|
|
#include <cstring> |
|
|
#include <memory> |
|
|
|
|
|
#if defined(__AMX_INT8__) && defined(__AVX512VNNI__) |
|
|
|
|
|
|
|
|
namespace ggml::cpu::amx { |
|
|
class tensor_traits : public ggml::cpu::tensor_traits { |
|
|
bool work_size(int , const struct ggml_tensor * op, size_t & size) override { |
|
|
size = ggml_backend_amx_desired_wsize(op); |
|
|
return true; |
|
|
} |
|
|
|
|
|
bool compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) override { |
|
|
if (op->op == GGML_OP_MUL_MAT) { |
|
|
ggml_backend_amx_mul_mat(params, op); |
|
|
return true; |
|
|
} |
|
|
return false; |
|
|
} |
|
|
}; |
|
|
|
|
|
static ggml::cpu::tensor_traits * get_tensor_traits(ggml_backend_buffer_t, struct ggml_tensor *) { |
|
|
static tensor_traits traits; |
|
|
return &traits; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// Release the buffer's backing allocation.
// The memory comes from ggml_aligned_malloc() in
// ggml_backend_amx_buffer_type_alloc_buffer(), so it must be released with
// the matching ggml_aligned_free() — plain free() is the wrong deallocator
// for an aligned allocation (on Windows the aligned allocator is
// _aligned_malloc(), which free() cannot handle).
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    ggml_aligned_free(buffer->context, buffer->size);
}
|
|
|
|
|
// The buffer's context pointer IS the allocation, so it doubles as the base
// address of the buffer.
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->context;
}
|
|
|
|
|
// Attach the shared AMX tensor-traits object to every tensor placed in this
// buffer, so the CPU backend dispatches its mul_mat through the AMX kernels.
static enum ggml_status ggml_backend_amx_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    ggml::cpu::tensor_traits * traits = ggml::cpu::amx::get_tensor_traits(buffer, tensor);
    tensor->extra = (void *) traits;

    return GGML_STATUS_SUCCESS;
}
|
|
|
|
|
// Fill `size` bytes of the tensor's data, starting at byte `offset`, with
// `value`.
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                                  uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memset(dst, value, size);
}
|
|
|
|
|
// Upload host data into the tensor. Types with an AMX kernel are repacked
// into the AMX weight layout on the way in; everything else is copied
// verbatim.
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                               const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    if (!qtype_has_amx_kernels(tensor->type)) {
        // no AMX layout for this type: plain byte copy
        memcpy((char *) tensor->data + offset, data, size);
        return;
    }

    GGML_LOG_DEBUG("%s: amx repack tensor %s of type %s\n", __func__, tensor->name, ggml_type_name(tensor->type));
    ggml_backend_amx_convert_weight(tensor, data, offset, size);
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Set every byte of the whole buffer to `value`.
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    void * base      = buffer->context;
    size_t num_bytes = buffer->size;

    memset(base, value, num_bytes);
}
|
|
|
|
|
// v-table of buffer operations for AMX buffers.
// NOTE(review): field names below are inferred from the functions being
// assigned — confirm the slot order against ggml_backend_buffer_i in
// ggml-backend-impl.h.
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
    /* .free_buffer   = */ ggml_backend_amx_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_amx_buffer_get_base,
    /* .init_tensor   = */ ggml_backend_amx_buffer_init_tensor,
    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
    /* .get_tensor    = */ nullptr,
    /* .cpy_tensor    = */ nullptr,
    /* .clear         = */ ggml_backend_amx_buffer_clear,
    /* .reset         = */ nullptr,
};
|
|
|
|
|
// Human-readable name of this buffer type.
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "AMX";
}
|
|
|
|
|
// Allocate a new AMX buffer of `size` bytes.
// Returns NULL on allocation failure. The allocation is released by
// ggml_backend_amx_buffer_free_buffer().
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = ggml_aligned_malloc(size);
    if (data == NULL) {
        // use the ggml logging macros for consistency with the rest of the
        // file (see the GGML_LOG_DEBUG call in set_tensor) instead of a bare
        // fprintf(stderr, ...)
        GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
|
|
|
|
|
// Required base-address alignment for AMX buffers.
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return TENSOR_ALIGNMENT;
}
|
|
|
|
|
namespace ggml::cpu::amx { |
|
|
class extra_buffer_type : ggml::cpu::extra_buffer_type { |
|
|
bool supports_op(ggml_backend_dev_t, const struct ggml_tensor * op) override { |
|
|
|
|
|
auto is_contiguous_2d = [](const struct ggml_tensor * t) { |
|
|
return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1; |
|
|
}; |
|
|
|
|
|
if (op->op == GGML_OP_MUL_MAT && is_contiguous_2d(op->src[0]) && |
|
|
is_contiguous_2d(op->src[1]) && |
|
|
op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_amx_buffer_type() && |
|
|
op->ne[0] % (TILE_N * 2) == 0 && |
|
|
(qtype_has_amx_kernels(op->src[0]->type) || (op->src[0]->type == GGML_TYPE_F16))) { |
|
|
|
|
|
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) { |
|
|
return false; |
|
|
} |
|
|
|
|
|
if (op->src[1]->type == GGML_TYPE_F32) { |
|
|
return true; |
|
|
} |
|
|
} |
|
|
return false; |
|
|
} |
|
|
|
|
|
ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override { |
|
|
if (op->op == GGML_OP_MUL_MAT && op->src[0]->buffer && |
|
|
op->src[0]->buffer->buft == ggml_backend_amx_buffer_type()) { |
|
|
return (ggml::cpu::tensor_traits *) op->src[0]->extra; |
|
|
} |
|
|
|
|
|
return nullptr; |
|
|
} |
|
|
}; |
|
|
} |
|
|
|
|
|
// Size in bytes actually needed to store `tensor` in an AMX buffer
// (the repacked AMX layout may need more than the default tensor size).
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
    GGML_UNUSED(buft);

    return ggml_backend_amx_get_alloc_size(tensor);
}
|
|
|
|
|
#define ARCH_GET_XCOMP_PERM 0x1022 |
|
|
#define ARCH_REQ_XCOMP_PERM 0x1023 |
|
|
#define XFEATURE_XTILECFG 17 |
|
|
#define XFEATURE_XTILEDATA 18 |
|
|
|
|
|
static bool ggml_amx_init() { |
|
|
#if defined(__gnu_linux__) |
|
|
if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) { |
|
|
fprintf(stderr, "AMX is not ready to be used!\n"); |
|
|
return false; |
|
|
} |
|
|
return true; |
|
|
#elif defined(_WIN32) |
|
|
return true; |
|
|
#endif |
|
|
} |
|
|
|
|
|
// Returns the singleton AMX buffer type, or nullptr when AMX could not be
// enabled for this process (see ggml_amx_init()).
// The static below is constructed lazily on the first call; the init check
// deliberately comes after it so the object exists even on the failure path.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    // NOTE(review): slot names inferred from the functions assigned —
    // confirm field order against ggml-backend-impl.h.
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ nullptr,
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ nullptr,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        // extra_buffer_type instance lives for the whole process; never freed.
        /* .context = */ new ggml::cpu::amx::extra_buffer_type(),
    };

    if (!ggml_amx_init()) {
        return nullptr;
    }

    return &ggml_backend_buffer_type_amx;
}
|
|
|
|
|
#endif |
|
|
|