| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| #ifndef GGML_SYCL_QUANTS_HPP |
| #define GGML_SYCL_QUANTS_HPP |
|
|
| #include <utility> |
|
|
| #include "ggml-common.h" |
| #include "ggml.h" |
|
|
| namespace ggml_sycl_reordered { |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| template <ggml_type type> struct block_q_t; |
|
|
| |
| |
| |
| |
| |
| |
| template <> struct block_q_t<GGML_TYPE_Q4_0> { |
| struct traits { |
| static constexpr uint32_t qk = QK4_0; |
| static constexpr uint32_t qi = QI4_0; |
| static constexpr uint32_t qr = QR4_0; |
| static constexpr uint32_t vdr_mmvq = 2; |
| }; |
|
|
| static constexpr std::pair<int, int> get_block_offset(const int block_index, const int ) { |
| return { block_index * (QK4_0 / QR4_0), 0 }; |
| } |
|
|
| static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { |
| return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 }; |
| } |
|
|
| static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } |
| }; |
|
|
| template <> struct block_q_t<GGML_TYPE_Q4_K> { |
| struct traits { |
| static constexpr uint32_t qk = QK_K; |
| static constexpr uint32_t qi = QI4_K; |
| static constexpr uint32_t qr = QR4_K; |
| static constexpr uint32_t vdr_mmvq = 2; |
| }; |
|
|
| static constexpr std::pair<int, int> get_block_offset(const int block_index, const int ) { |
| return { block_index * (traits::qk / traits::qr), 0 }; |
| } |
|
|
| static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { |
| auto nblocks = (nrows * (ncols / QK_K)); |
| return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE), |
| (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) }; |
| } |
|
|
| static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } |
| }; |
|
|
| template <> struct block_q_t<GGML_TYPE_Q6_K> { |
| struct traits { |
| static constexpr uint32_t qk = QK_K; |
| static constexpr uint32_t qi = QI6_K; |
| static constexpr uint32_t qr = QR6_K; |
| static constexpr uint32_t vdr_mmvq = 1; |
| }; |
|
|
| static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) { |
| auto low_bits_index = block_index * (QK_K / QR6_K); |
| |
| auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4)); |
| return { low_bits_index, high_bits_index }; |
| } |
|
|
| static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { |
| auto nblocks = (nrows * (ncols / QK_K)); |
| auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4); |
| auto block_scales = total_qs_bytes + block_index * (QK_K / 16); |
| auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half); |
| return { block_scales, sb_scale }; |
| } |
|
|
| static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } |
| }; |
|
|
| } |
|
|
| #endif |
|
|