Spaces:
Running
Running
Upload IQ5_NL.md
Browse files
IQ5_NL.md
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Under development
|
| 2 |
+
(I could not get this working yet)
|
| 3 |
+
|
| 4 |
+
---
|
| 5 |
+
|
| 6 |
+
# IQ5_NL Vulkan Dequantization Shader
|
| 7 |
+
|
| 8 |
+
```glsl
#version 450

#include "dequant_head.comp"

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {block_iq5_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
    // Each 64-thread group handles one chunk of 32 blocks (1024 values);
    // 256 threads per workgroup -> 4 chunks per workgroup.
    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;

    init_iq_shmem(gl_WorkGroupSize);

    const uint tid = gl_LocalInvocationID.x % 64;
    const uint il = tid/32;   // which 16-value half of the block this thread unpacks
    const uint ir = tid%32;   // which block within the chunk
    const uint ib = 32*i + ir;
    if (ib >= p.nel / 32) {
        return;
    }

    const uint b_idx = 1024*i + 32*ir;
    const float d = float(data_a[ib].d);

    // IQ5_NL: 32 values in 20 bytes, 5 bits per value.
    // BUGFIX: the original loop ran l = 0..31 in every thread, so the il == 0
    // and il == 1 threads both dequantized the whole block (duplicate writes).
    // Each thread now unpacks only its own 16-value half.
    [[unroll]] for (uint j = 0; j < 16; ++j) {
        const uint l = 16*il + j;
        const uint bit_offset  = l * 5;
        const uint byte_idx    = bit_offset / 8;
        const uint bit_in_byte = bit_offset % 8;

        uint val;
        if (bit_in_byte <= 3) {
            // All 5 bits fit inside the current byte
            val = (uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & 0x1F;
        } else {
            // Value straddles two bytes
            const uint low_bits  = 8 - bit_in_byte;
            const uint low_mask  = (1 << low_bits) - 1;
            const uint high_bits = 5 - low_bits;
            val = ((uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & low_mask) |
                  ((uint(data_a[ib].qs[byte_idx + 1]) & ((1 << high_bits) - 1)) << low_bits);
        }

        data_b[b_idx + l] = D_TYPE(d * kvalues_iq5nl[val]);
    }
}
```
|
| 58 |
+
|
| 59 |
+
# Full IQ5_NL Implementation Steps for llama.cpp
|
| 60 |
+
|
| 61 |
+
## Step 1: Define Block Structure
|
| 62 |
+
|
| 63 |
+
Add to `ggml/src/ggml-common.h` after the IQ4_NL definition:
|
| 64 |
+
|
| 65 |
+
```c
|
| 66 |
+
// Non-linear 5-bit quantization
|
| 67 |
+
#define QK5_NL 32
|
| 68 |
+
typedef struct {
|
| 69 |
+
ggml_half d;
|
| 70 |
+
uint8_t qs[QK5_NL * 5 / 8]; // 20 bytes for 32 5-bit values
|
| 71 |
+
} block_iq5_nl;
|
| 72 |
+
static_assert(sizeof(block_iq5_nl) == sizeof(ggml_half) + QK5_NL * 5 / 8, "wrong iq5_nl block size/padding");
|
| 73 |
+
```
|
| 74 |
+
|
| 75 |
+
## Step 2: Define Lookup Table
|
| 76 |
+
|
| 77 |
+
Add after `kvalues_iq4nl` in `ggml/src/ggml-common.h`:
|
| 78 |
+
|
| 79 |
+
```c
|
| 80 |
+
GGML_TABLE_BEGIN(int8_t, kvalues_iq5nl, 32)
|
| 81 |
+
-127, -113, -99, -87, -76, -65, -56, -47, -39, -32, -25, -19, -13, -8, -3, 0,
|
| 82 |
+
3, 8, 13, 19, 25, 32, 39, 47, 56, 65, 76, 87, 99, 113, 127, 127, // Note: adjust values based on training
|
| 83 |
+
GGML_TABLE_END()
|
| 84 |
+
```
|
| 85 |
+
|
| 86 |
+
## Step 3: Add Enum Value
|
| 87 |
+
|
| 88 |
+
Add to `ggml/include/ggml.h` in the `ggml_type` enum after `GGML_TYPE_BF16`:
|
| 89 |
+
|
| 90 |
+
Use the next unused enum value — never reuse or renumber an existing entry, since the type id is serialized into GGUF model files (check the current end of the `ggml_type` enum; 31 is only an example):
|
| 91 |
+
```c
|
| 92 |
+
GGML_TYPE_IQ5_NL = 31,
|
| 93 |
+
```
|
| 94 |
+
|
| 95 |
+
## Step 4: Implement Dequantization Function
|
| 96 |
+
|
| 97 |
+
Add to `ggml/src/ggml-quants.c` after `dequantize_row_iq4_nl`:
|
| 98 |
+
|
| 99 |
+
```c
|
| 100 |
+
void dequantize_row_iq5_nl(const block_iq5_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
|
| 101 |
+
assert(k % QK5_NL == 0);
|
| 102 |
+
const int64_t nb = k / QK5_NL;
|
| 103 |
+
|
| 104 |
+
for (int i = 0; i < nb; i++) {
|
| 105 |
+
const uint8_t * qs = x[i].qs;
|
| 106 |
+
const float d = GGML_FP16_TO_FP32(x[i].d);
|
| 107 |
+
|
| 108 |
+
for (int j = 0; j < QK5_NL; ++j) {
|
| 109 |
+
const int bit_offset = j * 5;
|
| 110 |
+
const int byte_idx = bit_offset / 8;
|
| 111 |
+
const int bit_in_byte = bit_offset % 8;
|
| 112 |
+
|
| 113 |
+
uint8_t val;
|
| 114 |
+
if (bit_in_byte <= 3) {
|
| 115 |
+
val = (qs[byte_idx] >> bit_in_byte) & 0x1F;
|
| 116 |
+
} else {
|
| 117 |
+
const int low_bits = 8 - bit_in_byte;
|
| 118 |
+
const int low_mask = (1 << low_bits) - 1;
|
| 119 |
+
const int high_bits = 5 - low_bits;
|
| 120 |
+
val = ((qs[byte_idx] >> bit_in_byte) & low_mask) |
|
| 121 |
+
((qs[byte_idx + 1] & ((1 << high_bits) - 1)) << low_bits);
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
y[j] = d * kvalues_iq5nl[val];
|
| 125 |
+
}
|
| 126 |
+
y += QK5_NL;
|
| 127 |
+
}
|
| 128 |
+
}
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
## Step 5: Implement Quantization Function
|
| 132 |
+
|
| 133 |
+
Add to `ggml/src/ggml-quants.c` using the existing quantization implementation pattern:
|
| 134 |
+
|
| 135 |
+
```c
|
| 136 |
+
size_t quantize_iq5_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 137 |
+
GGML_ASSERT(n_per_row % QK5_NL == 0);
|
| 138 |
+
int64_t nblock = n_per_row / QK5_NL;
|
| 139 |
+
char * qrow = (char *)dst;
|
| 140 |
+
uint8_t L[QK5_NL];
|
| 141 |
+
float weight[QK5_NL];
|
| 142 |
+
uint16_t unused_h;
|
| 143 |
+
uint8_t * unused_l = NULL;
|
| 144 |
+
float scale;
|
| 145 |
+
|
| 146 |
+
for (int64_t row = 0; row < nrow; ++row) {
|
| 147 |
+
block_iq5_nl * iq5 = (block_iq5_nl *)qrow;
|
| 148 |
+
for (int ibl = 0; ibl < nblock; ++ibl) {
|
| 149 |
+
const float * qw = quant_weights ? quant_weights + QK5_NL * ibl : NULL;
|
| 150 |
+
quantize_row_iq4_nl_impl(QK5_NL, 32, src + QK5_NL * ibl, &iq5[ibl].d, iq5[ibl].qs,
|
| 151 |
+
&unused_h, unused_l, &scale, weight, L, kvalues_iq5nl, qw, 7);
|
| 152 |
+
|
| 153 |
+
// Pack 5-bit values into bytes
|
| 154 |
+
memset(iq5[ibl].qs, 0, QK5_NL * 5 / 8);
|
| 155 |
+
for (int j = 0; j < QK5_NL; ++j) {
|
| 156 |
+
const int bit_offset = j * 5;
|
| 157 |
+
const int byte_idx = bit_offset / 8;
|
| 158 |
+
const int bit_in_byte = bit_offset % 8;
|
| 159 |
+
|
| 160 |
+
if (bit_in_byte <= 3) {
|
| 161 |
+
iq5[ibl].qs[byte_idx] |= (L[j] & 0x1F) << bit_in_byte;
|
| 162 |
+
} else {
|
| 163 |
+
const int low_bits = 8 - bit_in_byte;
|
| 164 |
+
const int high_bits = 5 - low_bits;
|
| 165 |
+
iq5[ibl].qs[byte_idx] |= (L[j] & ((1 << low_bits) - 1)) << bit_in_byte;
|
| 166 |
+
iq5[ibl].qs[byte_idx + 1] |= (L[j] >> low_bits) & ((1 << high_bits) - 1);
|
| 167 |
+
}
|
| 168 |
+
}
|
| 169 |
+
}
|
| 170 |
+
src += n_per_row;
|
| 171 |
+
qrow += nblock * sizeof(block_iq5_nl);
|
| 172 |
+
}
|
| 173 |
+
return nrow * nblock * sizeof(block_iq5_nl);
|
| 174 |
+
}
|
| 175 |
+
|
| 176 |
+
void quantize_row_iq5_nl_ref(const float * GGML_RESTRICT x, block_iq5_nl * GGML_RESTRICT y, int64_t k) {
|
| 177 |
+
assert(k % QK5_NL == 0);
|
| 178 |
+
quantize_iq5_nl(x, y, 1, k, NULL);
|
| 179 |
+
}
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
Note: Modify `quantize_row_iq4_nl_impl` to accept 32 values in the lookup table, or call `best_index_int8(32, values, ...)` instead of `best_index_int8(16, values, ...)`.
|
| 183 |
+
|
| 184 |
+
## Step 6: Register Type Traits
|
| 185 |
+
|
| 186 |
+
Add type registration in `ggml/src/ggml.c` type traits table. Find the section with type trait definitions and add an entry similar to IQ4_NL. The location would be in the type_traits array where other quantization types are registered.
|
| 187 |
+
|
| 188 |
+
## Step 7: Update Vulkan Shader Support
|
| 189 |
+
|
| 190 |
+
1. Add the `block_iq5_nl` structure definition to `ggml/src/ggml-vulkan/vulkan-shaders/types.glsl`
|
| 191 |
+
2. Add the `kvalues_iq5nl` lookup table initialization similar to IQ4_NL:
|
| 192 |
+
|
| 193 |
+
3. Create the dequantization shader file `dequant_iq5_nl.comp` with the shader code provided above
|
| 194 |
+
4. Register the shader in the Vulkan backend build system
|
| 195 |
+
|
| 196 |
+
## Step 8: Add CPU Backend Support
|
| 197 |
+
|
| 198 |
+
Add to `ggml/src/ggml-cpu/quants.c`:
|
| 199 |
+
|
| 200 |
+
```c
|
| 201 |
+
void quantize_row_iq5_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
|
| 202 |
+
quantize_row_iq5_nl_ref(x, (block_iq5_nl*)y, k);
|
| 203 |
+
}
|
| 204 |
+
```
|
| 205 |
+
|
| 206 |
+
Register in CPU backend type traits in `ggml/src/ggml-cpu/ggml-cpu.c`, adding vec_dot function pointer and other necessary traits.
|
| 207 |
+
|
| 208 |
+
## Step 9: Add Additional Backend Support
|
| 209 |
+
|
| 210 |
+
Implement quantization/dequantization kernels for:
|
| 211 |
+
- **CUDA**: Add to `ggml/src/ggml-cuda/convert.cu` and `ggml/src/ggml-cuda/cpy-utils.cuh`
|
| 212 |
+
- **Metal**: Add to `ggml/src/ggml-metal/ggml-metal.metal`
|
| 213 |
+
- **SYCL**: Add to `ggml/src/ggml-sycl/` appropriate files
|
| 214 |
+
- **WebGPU**: Add to `ggml/src/ggml-webgpu/wgsl-shaders/`
|
| 215 |
+
|
| 216 |
+
Follow the patterns established for IQ4_NL in each backend.
|
| 217 |
+
|
| 218 |
+
## Notes
|
| 219 |
+
|
| 220 |
+
**Lookup Table Optimization**: The provided lookup table values are a starting point. For optimal quality, generate the `kvalues_iq5nl` table through training on representative model weights using Lloyd-Max quantization or k-means clustering with 32 centroids. The non-uniform spacing should concentrate more values near zero where weight distributions peak.
|
| 221 |
+
|
| 222 |
+
**Best Index Function**: The `best_index_int8` helper function needs to be called with `32` as the first parameter instead of `16` for IQ5_NL.
|
| 223 |
+
|
| 224 |
+
Inside `quantize_row_iq4_nl_impl`, change the call from `best_index_int8(16, values, al)` to `best_index_int8(32, values, al)` — the `values` parameter already receives `kvalues_iq5nl` from the caller, so only the table size needs to change (or pass the table size in as a parameter).
|
| 225 |
+
|
| 226 |
+
**Testing**: Add comprehensive tests in `tests/test-backend-ops.cpp` for quantization/dequantization accuracy and add to `tests/test-quantize-fns.cpp` for bit-exactness verification.
|
| 227 |
+
|
| 228 |
+
**Performance Trade-offs**: IQ5_NL at 5.5 bpw provides better quality than IQ4_NL (4.5 bpw) but increases model size by ~22%. Benchmark inference speed across different hardware to validate the trade-off is worthwhile for your use case.
|