Spaces:
Running
Running
| # Under development | |
| (Work in progress: this implementation has not yet been verified to produce correct output.) | |
| --- | |
| # IQ5_NL Vulkan Dequantization Shader | |
| ```glsl | |
| #version 450 | |
| #include "dequant_head.comp" | |
| layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; | |
| layout (binding = 0) readonly buffer A {block_iq5_nl data_a[];}; | |
| layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; | |
| void main() { | |
| const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; | |
| init_iq_shmem(gl_WorkGroupSize); | |
| const uint tid = gl_LocalInvocationID.x % 64; | |
| const uint il = tid/32; | |
| const uint ir = tid%32; | |
| const uint ib = 32*i + ir; | |
| if (ib >= p.nel / 32) { | |
| return; | |
| } | |
| const uint b_idx = 1024*i + 32*ir; | |
| const float d = float(data_a[ib].d); | |
| // IQ5_NL: 32 values in 20 bytes, each value is 5 bits. | |
| // Split the block between the two il lanes (16 values each); previously il was | |
| // computed but unused, so both lanes redundantly unpacked and wrote all 32 values. | |
| [[unroll]] for (uint l = 16*il; l < 16*il + 16; ++l) { | |
| const uint bit_offset = l * 5; | |
| const uint byte_idx = bit_offset / 8; | |
| const uint bit_in_byte = bit_offset % 8; | |
| uint val; | |
| if (bit_in_byte <= 3) { | |
| // bit_in_byte + 5 <= 8: the whole 5-bit value lies inside the current byte | |
| val = (uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & 0x1F; | |
| } else { | |
| // Value spans two bytes: high bits of qs[byte_idx], low bits of qs[byte_idx + 1] | |
| const uint low_bits = 8 - bit_in_byte; | |
| const uint low_mask = (1 << low_bits) - 1; | |
| const uint high_bits = 5 - low_bits; | |
| val = ((uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & low_mask) | | |
| ((uint(data_a[ib].qs[byte_idx + 1]) & ((1 << high_bits) - 1)) << low_bits); | |
| } | |
| data_b[b_idx + l] = D_TYPE(d * kvalues_iq5nl[val]); | |
| } | |
| } | |
| ``` | |
| # Full IQ5_NL Implementation Steps for llama.cpp | |
| ## Step 1: Define Block Structure | |
| Add to `ggml/src/ggml-common.h` after the IQ4_NL definition: | |
| ```c | |
| // Non-linear 5-bit quantization | |
| #define QK5_NL 32 | |
| typedef struct { | |
| ggml_half d; // per-block fp16 scale | |
| uint8_t qs[QK5_NL * 5 / 8]; // 20 bytes for 32 5-bit values (LSB-first bit stream of codebook indices) | |
| } block_iq5_nl; | |
| static_assert(sizeof(block_iq5_nl) == sizeof(ggml_half) + QK5_NL * 5 / 8, "wrong iq5_nl block size/padding"); | |
| ``` | |
| ## Step 2: Define Lookup Table | |
| Add after `kvalues_iq4nl` in `ggml/src/ggml-common.h`: | |
| ```c | |
| GGML_TABLE_BEGIN(int8_t, kvalues_iq5nl, 32) | |
| -127, -113, -99, -87, -76, -65, -56, -47, -39, -32, -25, -19, -13, -8, -3, 0, | |
| 3, 8, 13, 19, 25, 32, 39, 47, 56, 65, 76, 87, 99, 113, 120, 127, // Note: placeholder values; adjust based on training. All 32 entries must be distinct, otherwise one of the 5-bit code points is wasted. | |
| GGML_TABLE_END() | |
| ``` | |
| ## Step 3: Add Enum Value | |
| Add to `ggml/include/ggml.h` in the `ggml_type` enum after `GGML_TYPE_BF16`: | |
| Use the next unused enum value — `31` below is an example; check the current `ggml_type` enum in `ggml.h` so the new value does not collide with an existing type: | |
| ```c | |
| GGML_TYPE_IQ5_NL = 31, | |
| ``` | |
| ## Step 4: Implement Dequantization Function | |
| Add to `ggml/src/ggml-quants.c` after `dequantize_row_iq4_nl`: | |
| ```c | |
| // Dequantize k values from IQ5_NL blocks: each block carries an fp16 scale d | |
| // and QK5_NL (32) 5-bit codebook indices packed LSB-first into 20 bytes. | |
| void dequantize_row_iq5_nl(const block_iq5_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { | |
| assert(k % QK5_NL == 0); | |
| const int64_t nb = k / QK5_NL; | |
| for (int64_t i = 0; i < nb; i++) { // int64_t, not int: k may exceed INT_MAX | |
| const uint8_t * qs = x[i].qs; | |
| const float d = GGML_FP16_TO_FP32(x[i].d); | |
| for (int j = 0; j < QK5_NL; ++j) { | |
| const int bit_offset = j * 5; | |
| const int byte_idx = bit_offset / 8; | |
| const int bit_in_byte = bit_offset % 8; | |
| uint8_t val; | |
| if (bit_in_byte <= 3) { | |
| // bit_in_byte + 5 <= 8: the 5-bit index lies entirely within one byte | |
| val = (qs[byte_idx] >> bit_in_byte) & 0x1F; | |
| } else { | |
| // index straddles a byte boundary: low part from qs[byte_idx], | |
| // remaining high part from qs[byte_idx + 1] | |
| const int low_bits = 8 - bit_in_byte; | |
| const int low_mask = (1 << low_bits) - 1; | |
| const int high_bits = 5 - low_bits; | |
| val = ((qs[byte_idx] >> bit_in_byte) & low_mask) | | |
| ((qs[byte_idx + 1] & ((1 << high_bits) - 1)) << low_bits); | |
| } | |
| y[j] = d * kvalues_iq5nl[val]; | |
| } | |
| y += QK5_NL; // advance to the next block's 32 outputs | |
| } | |
| } | |
| ``` | |
| ## Step 5: Implement Quantization Function | |
| Add to `ggml/src/ggml-quants.c` using the existing quantization implementation pattern: | |
| ```c | |
| // Quantize nrow rows of n_per_row floats to IQ5_NL, optionally guided by | |
| // per-value importance weights. Returns the total number of bytes written. | |
| size_t quantize_iq5_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { | |
| GGML_ASSERT(n_per_row % QK5_NL == 0); | |
| int64_t nblock = n_per_row / QK5_NL; | |
| char * qrow = (char *)dst; | |
| uint8_t L[QK5_NL]; // per-value codebook indices produced by the impl, 0..31 | |
| float weight[QK5_NL]; | |
| // Placeholders only needed to satisfy the iq4_nl impl signature; not used here. | |
| uint16_t unused_h; | |
| uint8_t * unused_l = NULL; | |
| float scale; | |
| for (int64_t row = 0; row < nrow; ++row) { | |
| block_iq5_nl * iq5 = (block_iq5_nl *)qrow; | |
| for (int ibl = 0; ibl < nblock; ++ibl) { | |
| const float * qw = quant_weights ? quant_weights + QK5_NL * ibl : NULL; | |
| // NOTE(review): the impl presumably writes 4-bit-packed data into iq5[ibl].qs, | |
| // which the memset below discards — only the scale d and the indices in L[] are | |
| // kept. Confirm quantize_row_iq4_nl_impl can be used this way with a 32-entry | |
| // table (see the note after this listing about best_index_int8). | |
| quantize_row_iq4_nl_impl(QK5_NL, 32, src + QK5_NL * ibl, &iq5[ibl].d, iq5[ibl].qs, | |
| &unused_h, unused_l, &scale, weight, L, kvalues_iq5nl, qw, 7); | |
| // Pack 5-bit values into bytes (LSB-first bit stream) | |
| memset(iq5[ibl].qs, 0, QK5_NL * 5 / 8); | |
| for (int j = 0; j < QK5_NL; ++j) { | |
| const int bit_offset = j * 5; | |
| const int byte_idx = bit_offset / 8; | |
| const int bit_in_byte = bit_offset % 8; | |
| if (bit_in_byte <= 3) { | |
| // bit_in_byte + 5 <= 8: the index fits in the current byte | |
| iq5[ibl].qs[byte_idx] |= (L[j] & 0x1F) << bit_in_byte; | |
| } else { | |
| // index straddles a byte boundary: low part here, high part in the next byte | |
| const int low_bits = 8 - bit_in_byte; | |
| const int high_bits = 5 - low_bits; | |
| iq5[ibl].qs[byte_idx] |= (L[j] & ((1 << low_bits) - 1)) << bit_in_byte; | |
| iq5[ibl].qs[byte_idx + 1] |= (L[j] >> low_bits) & ((1 << high_bits) - 1); | |
| } | |
| } | |
| } | |
| src += n_per_row; | |
| qrow += nblock * sizeof(block_iq5_nl); | |
| } | |
| return nrow * nblock * sizeof(block_iq5_nl); | |
| } | |
| // Reference entry point: quantize a single row of k values without importance weights. | |
| void quantize_row_iq5_nl_ref(const float * GGML_RESTRICT x, block_iq5_nl * GGML_RESTRICT y, int64_t k) { | |
| assert(k % QK5_NL == 0); | |
| quantize_iq5_nl(x, y, 1, k, NULL); | |
| } | |
| ``` | |
| Note: Modify `quantize_row_iq4_nl_impl` to accept 32 values in the lookup table, or call `best_index_int8(32, values, ...)` instead of `best_index_int8(16, values, ...)`. | |
| ## Step 6: Register Type Traits | |
| Register the new type in the type traits table in `ggml/src/ggml.c`: add an entry modeled on the existing `GGML_TYPE_IQ4_NL` entry (type name, block size, type size, and the to-float conversion callback defined in Step 4). | |
| ## Step 7: Update Vulkan Shader Support | |
| 1. Add the `block_iq5_nl` structure definition to `ggml/src/ggml-vulkan/vulkan-shaders/types.glsl` | |
| 2. Add the `kvalues_iq5nl` lookup table initialization similar to IQ4_NL: | |
| 3. Create the dequantization shader file `dequant_iq5_nl.comp` with the shader code provided above | |
| 4. Register the shader in the Vulkan backend build system | |
| ## Step 8: Add CPU Backend Support | |
| Add to `ggml/src/ggml-cpu/quants.c`: | |
| ```c | |
| // CPU backend shim: forwards to the reference row quantizer (no importance weights). | |
| void quantize_row_iq5_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { | |
| quantize_row_iq5_nl_ref(x, (block_iq5_nl*)y, k); | |
| } | |
| ``` | |
| Register in CPU backend type traits in `ggml/src/ggml-cpu/ggml-cpu.c`, adding vec_dot function pointer and other necessary traits. | |
| ## Step 9: Add Additional Backend Support | |
| Implement quantization/dequantization kernels for: | |
| - **CUDA**: Add to `ggml/src/ggml-cuda/convert.cu` and `ggml/src/ggml-cuda/cpy-utils.cuh` | |
| - **Metal**: Add to `ggml/src/ggml-metal/ggml-metal.metal` | |
| - **SYCL**: Add to `ggml/src/ggml-sycl/` appropriate files | |
| - **WebGPU**: Add to `ggml/src/ggml-webgpu/wgsl-shaders/` | |
| Follow the patterns established for IQ4_NL in each backend. | |
| ## Notes | |
| **Lookup Table Optimization**: The provided lookup table values are a starting point. For optimal quality, generate the `kvalues_iq5nl` table through training on representative model weights using Lloyd-Max quantization or k-means clustering with 32 centroids. The non-uniform spacing should concentrate more values near zero where weight distributions peak. | |
| **Best Index Function**: The `best_index_int8` helper function needs to be called with `32` as the first parameter instead of `16` for IQ5_NL. | |
| Change the call from `best_index_int8(16, values, al)` to `best_index_int8(32, kvalues_iq5nl, al)`. | |
| **Testing**: Add comprehensive tests in `tests/test-backend-ops.cpp` for quantization/dequantization accuracy and add to `tests/test-quantize-fns.cpp` for bit-exactness verification. | |
| **Performance Trade-offs**: IQ5_NL at 5.5 bpw provides better quality than IQ4_NL (4.5 bpw) but increases model size by ~22%. Benchmark inference speed across different hardware to validate the trade-off is worthwhile for your use case. |