Spaces:
Running
Running
# Under development
(I could not get this working yet)
---
# IQ5_NL Vulkan Dequantization Shader
```glsl
#version 450
#include "dequant_head.comp"

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {block_iq5_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
    // 256 threads per workgroup, 64 threads per "i" slice -> 4 slices per workgroup.
    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;

    init_iq_shmem(gl_WorkGroupSize);

    const uint tid = gl_LocalInvocationID.x % 64;
    const uint il = tid / 32;  // which half of the block this thread unpacks (0 or 1)
    const uint ir = tid % 32;  // block index within this slice
    const uint ib = 32 * i + ir;
    if (ib >= p.nel / 32) {
        return;
    }

    // Output base for block ib: 32 values per block, so 32*ib = 1024*i + 32*ir.
    const uint b_idx = 1024 * i + 32 * ir;

    const float d = float(data_a[ib].d);

    // IQ5_NL: 32 5-bit indices packed LSB-first into 20 bytes.
    // The two threads sharing a block (il = 0 / il = 1) each unpack 16 of the
    // 32 values; previously `il` was unused and both threads redundantly wrote
    // all 32 outputs, doubling global-memory traffic.
    [[unroll]] for (uint l = 16 * il; l < 16 * il + 16; ++l) {
        const uint bit_offset = l * 5;
        const uint byte_idx = bit_offset / 8;
        const uint bit_in_byte = bit_offset % 8;
        uint val;
        if (bit_in_byte <= 3) {
            // All 5 bits fall inside one byte (bits bit_in_byte .. bit_in_byte+4 <= 7).
            val = (uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & 0x1F;
        } else {
            // Value straddles two bytes: low part from qs[byte_idx],
            // high part from qs[byte_idx + 1].
            const uint low_bits = 8 - bit_in_byte;
            const uint low_mask = (1 << low_bits) - 1;
            const uint high_bits = 5 - low_bits;
            val = ((uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & low_mask) |
                  ((uint(data_a[ib].qs[byte_idx + 1]) & ((1 << high_bits) - 1)) << low_bits);
        }
        data_b[b_idx + l] = D_TYPE(d * kvalues_iq5nl[val]);
    }
}
```
# Full IQ5_NL Implementation Steps for llama.cpp
## Step 1: Define Block Structure
Add to `ggml/src/ggml-common.h` after the IQ4_NL definition:
```c
// Non-linear 5-bit quantization: each weight is a 5-bit index into the
// kvalues_iq5nl lookup table, scaled by a per-block fp16 scale `d`.
#define QK5_NL 32
typedef struct {
    ggml_half d;                 // per-block scale (fp16)
    uint8_t qs[QK5_NL * 5 / 8];  // 20 bytes: 32 x 5-bit indices, packed LSB-first
} block_iq5_nl;
// 2 (fp16) + 20 (qs) = 22 bytes; alignment of ggml_half is 2, so no padding.
static_assert(sizeof(block_iq5_nl) == sizeof(ggml_half) + QK5_NL * 5 / 8, "wrong iq5_nl block size/padding");
```
## Step 2: Define Lookup Table
Add after `kvalues_iq4nl` in `ggml/src/ggml-common.h`:
```c
// 32-entry non-linear codebook for IQ5_NL. Values must be strictly monotonic:
// the original draft duplicated 127 in the last two slots, wasting one of the
// 32 available quantization levels; the duplicate is replaced with 120 here.
// These are placeholder values — for best quality, refit the table (e.g.
// Lloyd-Max / k-means with 32 centroids) on representative model weights.
GGML_TABLE_BEGIN(int8_t, kvalues_iq5nl, 32)
    -127, -113, -99, -87, -76, -65, -56, -47, -39, -32, -25, -19, -13, -8, -3, 0,
       3,    8,  13,  19,  25,  32,  39,  47,  56,  65,  76,  87,  99, 113, 120, 127,
GGML_TABLE_END()
```
## Step 3: Add Enum Value
Add to `ggml/include/ggml.h` in the `ggml_type` enum after `GGML_TYPE_BF16`:
Use the next unused value in the `ggml_type` enum in your checkout (do not hardcode 31 without checking — the enum tail moves between releases), and keep `GGML_TYPE_COUNT` as the last entry:
```c
GGML_TYPE_IQ5_NL = 31,
```
## Step 4: Implement Dequantization Function
Add to `ggml/src/ggml-quants.c` after `dequantize_row_iq4_nl`:
```c
// Dequantize k values (k must be a multiple of QK5_NL) from x into y.
// Each block contributes 32 values: y = d * kvalues_iq5nl[5-bit index].
void dequantize_row_iq5_nl(const block_iq5_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK5_NL == 0);
    const int64_t nb = k / QK5_NL;

    // Loop counter must be int64_t: nb is derived from an int64_t element
    // count, and an `int` counter would truncate for very large rows.
    for (int64_t i = 0; i < nb; i++) {
        const uint8_t * qs = x[i].qs;
        const float d = GGML_FP16_TO_FP32(x[i].d);
        for (int j = 0; j < QK5_NL; ++j) {
            // 5-bit fields are packed LSB-first across the 20-byte qs array.
            const int bit_offset  = j * 5;
            const int byte_idx    = bit_offset / 8;
            const int bit_in_byte = bit_offset % 8;
            uint8_t val;
            if (bit_in_byte <= 3) {
                // All 5 bits inside a single byte.
                val = (qs[byte_idx] >> bit_in_byte) & 0x1F;
            } else {
                // Field straddles two bytes: low part from qs[byte_idx],
                // high part from qs[byte_idx + 1].
                const int low_bits  = 8 - bit_in_byte;
                const int low_mask  = (1 << low_bits) - 1;
                const int high_bits = 5 - low_bits;
                val = ((qs[byte_idx] >> bit_in_byte) & low_mask) |
                      ((qs[byte_idx + 1] & ((1 << high_bits) - 1)) << low_bits);
            }
            y[j] = d * kvalues_iq5nl[val];
        }
        y += QK5_NL;
    }
}
```
## Step 5: Implement Quantization Function
Add to `ggml/src/ggml-quants.c` using the existing quantization implementation pattern:
```c
// Quantize nrow rows of n_per_row floats each into IQ5_NL blocks at dst.
// Returns the total number of bytes written.
// NOTE(review): this reuses quantize_row_iq4_nl_impl only to choose the
// per-block scale d and the 5-bit indices L; whatever 4-bit packing the impl
// writes into qs is discarded by the memset below and re-packed from L.
// This assumes the impl (a) fills L[0..QK5_NL-1] with codebook indices,
// (b) accepts a 32-entry value table, and (c) writes at most 20 bytes to qs —
// TODO confirm against quantize_row_iq4_nl_impl's actual signature/contract.
size_t quantize_iq5_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    GGML_ASSERT(n_per_row % QK5_NL == 0);
    int64_t nblock = n_per_row / QK5_NL;
    char * qrow = (char *)dst;
    uint8_t L[QK5_NL];       // per-value 5-bit codebook indices chosen by the impl
    float weight[QK5_NL];    // scratch importance weights
    uint16_t unused_h;       // scale_h/scale_l outputs are iq4_xs-specific; unused here
    uint8_t * unused_l = NULL;
    float scale;
    for (int64_t row = 0; row < nrow; ++row) {
        block_iq5_nl * iq5 = (block_iq5_nl *)qrow;
        for (int ibl = 0; ibl < nblock; ++ibl) {
            // Optional per-value importance weights for this block.
            const float * qw = quant_weights ? quant_weights + QK5_NL * ibl : NULL;
            quantize_row_iq4_nl_impl(QK5_NL, 32, src + QK5_NL * ibl, &iq5[ibl].d, iq5[ibl].qs,
                                     &unused_h, unused_l, &scale, weight, L, kvalues_iq5nl, qw, 7);
            // Pack 5-bit values into bytes
            // (overwrites whatever packing the impl produced in qs).
            memset(iq5[ibl].qs, 0, QK5_NL * 5 / 8);
            for (int j = 0; j < QK5_NL; ++j) {
                // Mirror of the unpacking in dequantize_row_iq5_nl: LSB-first.
                const int bit_offset = j * 5;
                const int byte_idx = bit_offset / 8;
                const int bit_in_byte = bit_offset % 8;
                if (bit_in_byte <= 3) {
                    // All 5 bits fit in one byte.
                    iq5[ibl].qs[byte_idx] |= (L[j] & 0x1F) << bit_in_byte;
                } else {
                    // Split across two bytes.
                    const int low_bits = 8 - bit_in_byte;
                    const int high_bits = 5 - low_bits;
                    iq5[ibl].qs[byte_idx] |= (L[j] & ((1 << low_bits) - 1)) << bit_in_byte;
                    iq5[ibl].qs[byte_idx + 1] |= (L[j] >> low_bits) & ((1 << high_bits) - 1);
                }
            }
        }
        src += n_per_row;
        qrow += nblock * sizeof(block_iq5_nl);
    }
    return nrow * nblock * sizeof(block_iq5_nl);
}
// Reference (non-optimized) quantization of a single row of k values.
// Thin wrapper over quantize_iq5_nl with nrow = 1 and no importance weights.
void quantize_row_iq5_nl_ref(const float * GGML_RESTRICT x, block_iq5_nl * GGML_RESTRICT y, int64_t k) {
    assert(k % QK5_NL == 0);
    (void)quantize_iq5_nl(x, y, /*nrow=*/1, /*n_per_row=*/k, /*quant_weights=*/NULL);
}
```
Note: Modify `quantize_row_iq4_nl_impl` to accept 32 values in the lookup table, or call `best_index_int8(32, values, ...)` instead of `best_index_int8(16, values, ...)`.
## Step 6: Register Type Traits
Add type registration in `ggml/src/ggml.c` type traits table. Find the section with type trait definitions and add an entry similar to IQ4_NL. The location would be in the type_traits array where other quantization types are registered.
## Step 7: Update Vulkan Shader Support
1. Add the `block_iq5_nl` structure definition to `ggml/src/ggml-vulkan/vulkan-shaders/types.glsl`
2. Add the `kvalues_iq5nl` lookup table initialization similar to IQ4_NL:
3. Create the dequantization shader file `dequant_iq5_nl.comp` with the shader code provided above
4. Register the shader in the Vulkan backend build system
## Step 8: Add CPU Backend Support
Add to `ggml/src/ggml-cpu/quants.c`:
```c
// CPU-backend entry point: forwards to the reference row quantizer.
void quantize_row_iq5_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    block_iq5_nl * blocks = (block_iq5_nl *)y;
    quantize_row_iq5_nl_ref(x, blocks, k);
}
```
Register in CPU backend type traits in `ggml/src/ggml-cpu/ggml-cpu.c`, adding vec_dot function pointer and other necessary traits.
## Step 9: Add Additional Backend Support
Implement quantization/dequantization kernels for:
- **CUDA**: Add to `ggml/src/ggml-cuda/convert.cu` and `ggml/src/ggml-cuda/cpy-utils.cuh`
- **Metal**: Add to `ggml/src/ggml-metal/ggml-metal.metal`
- **SYCL**: Add to `ggml/src/ggml-sycl/` appropriate files
- **WebGPU**: Add to `ggml/src/ggml-webgpu/wgsl-shaders/`
Follow the patterns established for IQ4_NL in each backend.
## Notes
**Lookup Table Optimization**: The provided lookup table values are a starting point. For optimal quality, generate the `kvalues_iq5nl` table through training on representative model weights using Lloyd-Max quantization or k-means clustering with 32 centroids. The non-uniform spacing should concentrate more values near zero where weight distributions peak.
**Best Index Function**: The `best_index_int8` helper function needs to be called with `32` as the first parameter instead of `16` for IQ5_NL.
Change the call from `best_index_int8(16, values, al)` to `best_index_int8(32, kvalues_iq5nl, al)`.
**Testing**: Add comprehensive tests in `tests/test-backend-ops.cpp` for quantization/dequantization accuracy and add to `tests/test-quantize-fns.cpp` for bit-exactness verification.
**Performance Trade-offs**: IQ5_NL at 5.5 bpw provides better quality than IQ4_NL (4.5 bpw) but increases model size by ~22%. Benchmark inference speed across different hardware to validate the trade-off is worthwhile for your use case. |