Naphula committed on
Commit
9371685
·
verified ·
1 Parent(s): b37ea51

Upload IQ5_NL.md

Browse files
Files changed (1) hide show
  1. IQ5_NL.md +228 -0
IQ5_NL.md ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Under development
2
+ (I could not get this working yet)
3
+
4
+ ---
5
+
6
+ # IQ5_NL Vulkan Dequantization Shader
7
+
8
+ ```glsl
9
+ #version 450
10
+
11
+ #include "dequant_head.comp"
12
+
13
+ layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
14
+
15
+ layout (binding = 0) readonly buffer A {block_iq5_nl data_a[];};
16
+ layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
17
+
18
+ void main() {
19
+ const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
20
+
21
+ init_iq_shmem(gl_WorkGroupSize);
22
+
23
+ const uint tid = gl_LocalInvocationID.x % 64;
24
+ const uint il = tid/32;
25
+ const uint ir = tid%32;
26
+ const uint ib = 32*i + ir;
27
+ if (ib >= p.nel / 32) {
28
+ return;
29
+ }
30
+
31
+ const uint b_idx = 1024*i + 32*ir;
32
+ const float d = float(data_a[ib].d);
33
+
34
+ // IQ5_NL: 32 values in 20 bytes, each value is 5 bits
35
+ // Unpack 5-bit values from byte array
36
+ // Split the block between the two threads mapped to it (il = 0 and il = 1);
+ // without this split `il` is unused and both threads redundantly write all 32 values.
+ [[unroll]] for (uint l = 16 * il; l < 16 * (il + 1); ++l) {
37
+ const uint bit_offset = l * 5;
38
+ const uint byte_idx = bit_offset / 8;
39
+ const uint bit_in_byte = bit_offset % 8;
40
+
41
+ uint val;
42
+ if (bit_in_byte <= 3) {
43
+ // Value fits entirely within the current byte (bits bit_in_byte .. bit_in_byte+4 <= 7)
44
+ val = (uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & 0x1F;
45
+ } else {
46
+ // Value spans two bytes
47
+ const uint low_bits = 8 - bit_in_byte;
48
+ const uint low_mask = (1 << low_bits) - 1;
49
+ const uint high_bits = 5 - low_bits;
50
+ val = ((uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & low_mask) |
51
+ ((uint(data_a[ib].qs[byte_idx + 1]) & ((1 << high_bits) - 1)) << low_bits);
52
+ }
53
+
54
+ data_b[b_idx + l] = D_TYPE(d * kvalues_iq5nl[val]);
55
+ }
56
+ }
57
+ ```
58
+
59
+ # Full IQ5_NL Implementation Steps for llama.cpp
60
+
61
+ ## Step 1: Define Block Structure
62
+
63
+ Add to `ggml/src/ggml-common.h` after the IQ4_NL definition:
64
+
65
+ ```c
66
+ // Non-linear 5-bit quantization
67
+ #define QK5_NL 32
68
+ typedef struct {
69
+ ggml_half d;
70
+ uint8_t qs[QK5_NL * 5 / 8]; // 20 bytes for 32 5-bit values
71
+ } block_iq5_nl;
72
+ static_assert(sizeof(block_iq5_nl) == sizeof(ggml_half) + QK5_NL * 5 / 8, "wrong iq5_nl block size/padding");
73
+ ```
74
+
75
+ ## Step 2: Define Lookup Table
76
+
77
+ Add after `kvalues_iq4nl` in `ggml/src/ggml-common.h`:
78
+
79
+ ```c
80
+ GGML_TABLE_BEGIN(int8_t, kvalues_iq5nl, 32)
81
+ -127, -113, -99, -87, -76, -65, -56, -47, -39, -32, -25, -19, -13, -8, -3, 0,
82
+ 3, 8, 13, 19, 25, 32, 39, 47, 56, 65, 76, 87, 99, 113, 127, 127, // NOTE: placeholder — the final entry duplicates 127 and wastes one of the 32 code points; regenerate this table from training (see Notes below)
83
+ GGML_TABLE_END()
84
+ ```
85
+
86
+ ## Step 3: Add Enum Value
87
+
88
+ Add to `ggml/include/ggml.h` in the `ggml_type` enum after `GGML_TYPE_BF16`:
89
+
90
+ Use the next unused enum value — do not assume 31 is free, since slots after `GGML_TYPE_BF16` are already occupied in current ggml; the safest choice is to append the new entry just before `GGML_TYPE_COUNT`:
91
+ ```c
92
+ GGML_TYPE_IQ5_NL = 31,
93
+ ```
94
+
95
+ ## Step 4: Implement Dequantization Function
96
+
97
+ Add to `ggml/src/ggml-quants.c` after `dequantize_row_iq4_nl`:
98
+
99
+ ```c
100
+ void dequantize_row_iq5_nl(const block_iq5_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
101
+ assert(k % QK5_NL == 0);
102
+ const int64_t nb = k / QK5_NL;
103
+
104
+ for (int i = 0; i < nb; i++) {
105
+ const uint8_t * qs = x[i].qs;
106
+ const float d = GGML_FP16_TO_FP32(x[i].d);
107
+
108
+ for (int j = 0; j < QK5_NL; ++j) {
109
+ const int bit_offset = j * 5;
110
+ const int byte_idx = bit_offset / 8;
111
+ const int bit_in_byte = bit_offset % 8;
112
+
113
+ uint8_t val;
114
+ if (bit_in_byte <= 3) {
115
+ val = (qs[byte_idx] >> bit_in_byte) & 0x1F;
116
+ } else {
117
+ const int low_bits = 8 - bit_in_byte;
118
+ const int low_mask = (1 << low_bits) - 1;
119
+ const int high_bits = 5 - low_bits;
120
+ val = ((qs[byte_idx] >> bit_in_byte) & low_mask) |
121
+ ((qs[byte_idx + 1] & ((1 << high_bits) - 1)) << low_bits);
122
+ }
123
+
124
+ y[j] = d * kvalues_iq5nl[val];
125
+ }
126
+ y += QK5_NL;
127
+ }
128
+ }
129
+ ```
130
+
131
+ ## Step 5: Implement Quantization Function
132
+
133
+ Add to `ggml/src/ggml-quants.c` using the existing quantization implementation pattern:
134
+
135
+ ```c
136
+ size_t quantize_iq5_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
137
+ GGML_ASSERT(n_per_row % QK5_NL == 0);
138
+ int64_t nblock = n_per_row / QK5_NL;
139
+ char * qrow = (char *)dst;
140
+ uint8_t L[QK5_NL];
141
+ float weight[QK5_NL];
142
+ uint16_t unused_h;
143
+ uint8_t * unused_l = NULL;
144
+ float scale;
145
+
146
+ for (int64_t row = 0; row < nrow; ++row) {
147
+ block_iq5_nl * iq5 = (block_iq5_nl *)qrow;
148
+ for (int ibl = 0; ibl < nblock; ++ibl) {
149
+ const float * qw = quant_weights ? quant_weights + QK5_NL * ibl : NULL;
150
+ quantize_row_iq4_nl_impl(QK5_NL, 32, src + QK5_NL * ibl, &iq5[ibl].d, iq5[ibl].qs,
151
+ &unused_h, unused_l, &scale, weight, L, kvalues_iq5nl, qw, 7);
152
+
153
+ // Pack 5-bit values into bytes
154
+ memset(iq5[ibl].qs, 0, QK5_NL * 5 / 8);
155
+ for (int j = 0; j < QK5_NL; ++j) {
156
+ const int bit_offset = j * 5;
157
+ const int byte_idx = bit_offset / 8;
158
+ const int bit_in_byte = bit_offset % 8;
159
+
160
+ if (bit_in_byte <= 3) {
161
+ iq5[ibl].qs[byte_idx] |= (L[j] & 0x1F) << bit_in_byte;
162
+ } else {
163
+ const int low_bits = 8 - bit_in_byte;
164
+ const int high_bits = 5 - low_bits;
165
+ iq5[ibl].qs[byte_idx] |= (L[j] & ((1 << low_bits) - 1)) << bit_in_byte;
166
+ iq5[ibl].qs[byte_idx + 1] |= (L[j] >> low_bits) & ((1 << high_bits) - 1);
167
+ }
168
+ }
169
+ }
170
+ src += n_per_row;
171
+ qrow += nblock * sizeof(block_iq5_nl);
172
+ }
173
+ return nrow * nblock * sizeof(block_iq5_nl);
174
+ }
175
+
176
+ void quantize_row_iq5_nl_ref(const float * GGML_RESTRICT x, block_iq5_nl * GGML_RESTRICT y, int64_t k) {
177
+ assert(k % QK5_NL == 0);
178
+ quantize_iq5_nl(x, y, 1, k, NULL);
179
+ }
180
+ ```
181
+
182
+ Note: Modify `quantize_row_iq4_nl_impl` to accept 32 values in the lookup table, or call `best_index_int8(32, values, ...)` instead of `best_index_int8(16, values, ...)`.
183
+
184
+ ## Step 6: Register Type Traits
185
+
186
+ Add type registration in `ggml/src/ggml.c` type traits table. Find the section with type trait definitions and add an entry similar to IQ4_NL. The location would be in the type_traits array where other quantization types are registered.
187
+
188
+ ## Step 7: Update Vulkan Shader Support
189
+
190
+ 1. Add the `block_iq5_nl` structure definition to `ggml/src/ggml-vulkan/vulkan-shaders/types.glsl`
191
+ 2. Add the `kvalues_iq5nl` lookup table initialization similar to IQ4_NL:
192
+
193
+ 3. Create the dequantization shader file `dequant_iq5_nl.comp` with the shader code provided above
194
+ 4. Register the shader in the Vulkan backend build system
195
+
196
+ ## Step 8: Add CPU Backend Support
197
+
198
+ Add to `ggml/src/ggml-cpu/quants.c`:
199
+
200
+ ```c
201
+ void quantize_row_iq5_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
202
+ quantize_row_iq5_nl_ref(x, (block_iq5_nl*)y, k);
203
+ }
204
+ ```
205
+
206
+ Register in CPU backend type traits in `ggml/src/ggml-cpu/ggml-cpu.c`, adding vec_dot function pointer and other necessary traits.
207
+
208
+ ## Step 9: Add Additional Backend Support
209
+
210
+ Implement quantization/dequantization kernels for:
211
+ - **CUDA**: Add to `ggml/src/ggml-cuda/convert.cu` and `ggml/src/ggml-cuda/cpy-utils.cuh`
212
+ - **Metal**: Add to `ggml/src/ggml-metal/ggml-metal.metal`
213
+ - **SYCL**: Add to `ggml/src/ggml-sycl/` appropriate files
214
+ - **WebGPU**: Add to `ggml/src/ggml-webgpu/wgsl-shaders/`
215
+
216
+ Follow the patterns established for IQ4_NL in each backend.
217
+
218
+ ## Notes
219
+
220
+ **Lookup Table Optimization**: The provided lookup table values are a starting point. For optimal quality, generate the `kvalues_iq5nl` table through training on representative model weights using Lloyd-Max quantization or k-means clustering with 32 centroids. The non-uniform spacing should concentrate more values near zero where weight distributions peak.
221
+
222
+ **Best Index Function**: The `best_index_int8` helper function needs to be called with `32` as the first parameter instead of `16` for IQ5_NL.
223
+
224
+ Inside `quantize_row_iq4_nl_impl` the lookup table arrives through the `values` parameter, so change the call from `best_index_int8(16, values, al)` to `best_index_int8(32, values, al)` (or, better, pass the table size in as a new parameter).
225
+
226
+ **Testing**: Add comprehensive tests in `tests/test-backend-ops.cpp` for quantization/dequantization accuracy and add to `tests/test-quantize-fns.cpp` for bit-exactness verification.
227
+
228
+ **Performance Trade-offs**: IQ5_NL at 5.5 bpw provides better quality than IQ4_NL (4.5 bpw) but increases model size by ~22%. Benchmark inference speed across different hardware to validate the trade-off is worthwhile for your use case.