model_tools / IQ5_NL.md
Naphula's picture
Upload IQ5_NL.md
9371685 verified

Status: under development — I have not yet been able to get this implementation working.


IQ5_NL Vulkan Dequantization Shader

#version 450

#include "dequant_head.comp"

layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;

layout (binding = 0) readonly buffer A {block_iq5_nl data_a[];};
layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};

void main() {
    // Each 256-thread workgroup covers 4 super-groups (i); within a super-group,
    // 64 threads cover 32 blocks, i.e. two threads per block (il = 0 or 1).
    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;

    init_iq_shmem(gl_WorkGroupSize);

    const uint tid = gl_LocalInvocationID.x % 64;
    const uint il  = tid/32;  // which half of the block this thread unpacks
    const uint ir  = tid%32;  // which block within the super-group
    const uint ib = 32*i + ir;
    if (ib >= p.nel / 32) {
        return;
    }

    // Output base of this block: 32 dequantized values per block.
    const uint b_idx = 1024*i + 32*ir;
    const float d = float(data_a[ib].d);

    // IQ5_NL: 32 values packed into 20 bytes, 5 bits each.
    // Previously both il=0 and il=1 unpacked all 32 values (duplicate work and
    // duplicate writes); now thread il handles values [16*il, 16*il + 16).
    [[unroll]] for (uint l = 16*il; l < 16*il + 16; ++l) {
        const uint bit_offset = l * 5;
        const uint byte_idx = bit_offset / 8;
        const uint bit_in_byte = bit_offset % 8;

        uint val;
        if (bit_in_byte <= 3) {
            // The 5-bit field fits entirely within one byte (bit_in_byte + 5 <= 8).
            val = (uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & 0x1F;
        } else {
            // The field spans two adjacent bytes; never taken for the final
            // value (l=31 starts at bit 3 of byte 19), so byte_idx + 1 stays in range.
            const uint low_bits = 8 - bit_in_byte;
            const uint low_mask = (1 << low_bits) - 1;
            const uint high_bits = 5 - low_bits;
            val = ((uint(data_a[ib].qs[byte_idx]) >> bit_in_byte) & low_mask) |
                  ((uint(data_a[ib].qs[byte_idx + 1]) & ((1 << high_bits) - 1)) << low_bits);
        }

        data_b[b_idx + l] = D_TYPE(d * kvalues_iq5nl[val]);
    }
}

Full IQ5_NL Implementation Steps for llama.cpp

Step 1: Define Block Structure

Add to ggml/src/ggml-common.h after the IQ4_NL definition:

// Non-linear 5-bit quantization: one fp16 scale per block of 32 weights,
// each weight stored as a 5-bit index into the kvalues_iq5nl codebook.
// Block size: 2 + 20 = 22 bytes -> 5.5 bits per weight.
#define QK5_NL 32
typedef struct {
    ggml_half d;                // per-block fp16 scale
    uint8_t qs[QK5_NL * 5 / 8]; // 20 bytes for 32 5-bit values
} block_iq5_nl;
// Guards against compiler-inserted padding between d and qs; the packed
// (de)quantization code assumes a tightly packed 22-byte layout.
static_assert(sizeof(block_iq5_nl) == sizeof(ggml_half) + QK5_NL * 5 / 8, "wrong iq5_nl block size/padding");

Step 2: Define Lookup Table

Add after kvalues_iq4nl in ggml/src/ggml-common.h:

// 32-level non-linear codebook for IQ5_NL, denser near zero where weight
// distributions peak. Placeholder values — regenerate from representative
// model weights (Lloyd-Max / k-means) before use.
// NOTE(review): 127 appears twice (indices 30 and 31), so only 31 distinct
// levels are usable — fix this duplicate when the table is retrained.
GGML_TABLE_BEGIN(int8_t, kvalues_iq5nl, 32)
    -127, -113, -99, -87, -76, -65, -56, -47, -39, -32, -25, -19, -13, -8, -3, 0,
    3, 8, 13, 19, 25, 32, 39, 47, 56, 65, 76, 87, 99, 113, 127, 127, // Note: adjust values based on training
GGML_TABLE_END()

Step 3: Add Enum Value

Add to ggml/include/ggml.h in the ggml_type enum after GGML_TYPE_BF16:

Use the next unused value at the tail of the enum (check the current source — it may no longer be 31). Existing enum values must never be renumbered, because they are serialized into model files; only append.

GGML_TYPE_IQ5_NL  = 31,

Step 4: Implement Dequantization Function

Add to ggml/src/ggml-quants.c after dequantize_row_iq4_nl:

// Dequantize k floats from IQ5_NL blocks: each block holds a per-block fp16
// scale d and 32 codebook indices packed 5 bits apiece into 20 bytes.
// y[i] = d * kvalues_iq5nl[index_i].  k must be a multiple of QK5_NL.
void dequantize_row_iq5_nl(const block_iq5_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) {
    assert(k % QK5_NL == 0);
    const int64_t nb = k / QK5_NL;

    for (int64_t ib = 0; ib < nb; ++ib) {
        const float scale = GGML_FP16_TO_FP32(x[ib].d);
        const uint8_t * packed = x[ib].qs;

        for (int j = 0; j < QK5_NL; ++j) {
            const int bit   = 5 * j;
            const int byte  = bit >> 3;
            const int shift = bit & 7;

            // Build a window of up to 16 bits starting at `byte`. The second
            // byte is fetched only when the 5-bit field actually crosses the
            // byte boundary (shift > 3), which also keeps the last field
            // (j == 31, byte 19) from reading past the 20-byte array.
            unsigned window = packed[byte];
            if (shift > 3) {
                window |= (unsigned) packed[byte + 1] << 8;
            }
            const unsigned idx = (window >> shift) & 0x1F;

            y[ib*QK5_NL + j] = scale * kvalues_iq5nl[idx];
        }
    }
}

Step 5: Implement Quantization Function

Add to ggml/src/ggml-quants.c using the existing quantization implementation pattern:

// Quantize nrow rows of n_per_row floats to IQ5_NL, returning the total size
// in bytes written to dst. Reuses quantize_row_iq4_nl_impl to choose the
// per-block scale and the per-value codebook indices (returned in L); the
// packing the impl writes into qs is then discarded (memset) and replaced by
// the 5-bit packing below.
// NOTE(review): as documented in the Step 5 note, the impl must be adapted to
// search all 32 entries of kvalues_iq5nl — it hardcodes a 16-entry search —
// or this produces indices in [0,16) only. TODO confirm the impl's signature
// matches this call.
size_t quantize_iq5_nl(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
    GGML_ASSERT(n_per_row % QK5_NL == 0);
    int64_t nblock = n_per_row / QK5_NL;
    char * qrow = (char *)dst;
    uint8_t L[QK5_NL];        // per-value codebook indices chosen by the impl
    float weight[QK5_NL];     // scratch importance weights for the impl
    uint16_t unused_h;        // scale-bits outputs not used by this format
    uint8_t * unused_l = NULL;
    float scale;
    
    for (int64_t row = 0; row < nrow; ++row) {
        block_iq5_nl * iq5 = (block_iq5_nl *)qrow;
        for (int ibl = 0; ibl < nblock; ++ibl) {
            const float * qw = quant_weights ? quant_weights + QK5_NL * ibl : NULL;
            // Sets iq5[ibl].d and fills L; the data written to iq5[ibl].qs
            // here is in the impl's own packing and is overwritten below.
            quantize_row_iq4_nl_impl(QK5_NL, 32, src + QK5_NL * ibl, &iq5[ibl].d, iq5[ibl].qs, 
                                     &unused_h, unused_l, &scale, weight, L, kvalues_iq5nl, qw, 7);
            
            // Pack 5-bit values into bytes
            memset(iq5[ibl].qs, 0, QK5_NL * 5 / 8);
            for (int j = 0; j < QK5_NL; ++j) {
                const int bit_offset = j * 5;
                const int byte_idx = bit_offset / 8;
                const int bit_in_byte = bit_offset % 8;
                
                if (bit_in_byte <= 3) {
                    // Field fits entirely in one byte.
                    iq5[ibl].qs[byte_idx] |= (L[j] & 0x1F) << bit_in_byte;
                } else {
                    // Field straddles two bytes: low part in byte_idx,
                    // remaining high bits in byte_idx + 1.
                    const int low_bits = 8 - bit_in_byte;
                    const int high_bits = 5 - low_bits;
                    iq5[ibl].qs[byte_idx] |= (L[j] & ((1 << low_bits) - 1)) << bit_in_byte;
                    iq5[ibl].qs[byte_idx + 1] |= (L[j] >> low_bits) & ((1 << high_bits) - 1);
                }
            }
        }
        src += n_per_row;
        qrow += nblock * sizeof(block_iq5_nl);
    }
    return nrow * nblock * sizeof(block_iq5_nl);
}

// Reference single-row quantization entry point: quantizes k floats from x
// into IQ5_NL blocks at y. k must be a multiple of QK5_NL.
void quantize_row_iq5_nl_ref(const float * GGML_RESTRICT x, block_iq5_nl * GGML_RESTRICT y, int64_t k) {
    // GGML_ASSERT (not plain assert) for consistency with quantize_iq5_nl
    // above and the other *_ref functions — it still fires in NDEBUG builds.
    GGML_ASSERT(k % QK5_NL == 0);
    quantize_iq5_nl(x, y, 1, k, NULL);
}

Note: Modify quantize_row_iq4_nl_impl to accept 32 values in the lookup table, or call best_index_int8(32, values, ...) instead of best_index_int8(16, values, ...).

Step 6: Register Type Traits

Add type registration in ggml/src/ggml.c type traits table. Find the section with type trait definitions and add an entry similar to IQ4_NL. The location would be in the type_traits array where other quantization types are registered.

Step 7: Update Vulkan Shader Support

  1. Add the block_iq5_nl structure definition to ggml/src/ggml-vulkan/vulkan-shaders/types.glsl

  2. Add the kvalues_iq5nl lookup table initialization similar to IQ4_NL:

  3. Create the dequantization shader file dequant_iq5_nl.comp with the shader code provided above

  4. Register the shader in the Vulkan backend build system

Step 8: Add CPU Backend Support

Add to ggml/src/ggml-cpu/quants.c:

// CPU-backend entry point: quantize a row of k floats at x into IQ5_NL blocks
// at y. Takes void * to match the generic type-traits signature; simply
// forwards to the reference implementation.
void quantize_row_iq5_nl(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) {
    block_iq5_nl * out = (block_iq5_nl *) y;
    quantize_row_iq5_nl_ref(x, out, k);
}

Register in CPU backend type traits in ggml/src/ggml-cpu/ggml-cpu.c, adding vec_dot function pointer and other necessary traits.

Step 9: Add Additional Backend Support

Implement quantization/dequantization kernels for:

  • CUDA: Add to ggml/src/ggml-cuda/convert.cu and ggml/src/ggml-cuda/cpy-utils.cuh
  • Metal: Add to ggml/src/ggml-metal/ggml-metal.metal
  • SYCL: Add to ggml/src/ggml-sycl/ appropriate files
  • WebGPU: Add to ggml/src/ggml-webgpu/wgsl-shaders/

Follow the patterns established for IQ4_NL in each backend.

Notes

Lookup Table Optimization: The provided lookup table values are a starting point. For optimal quality, generate the kvalues_iq5nl table through training on representative model weights using Lloyd-Max quantization or k-means clustering with 32 centroids. The non-uniform spacing should concentrate more values near zero where weight distributions peak.

Best Index Function: The best_index_int8 helper function needs to be called with 32 as the first parameter instead of 16 for IQ5_NL.

Inside the shared impl the codebook arrives as the `values` parameter, so change the call from best_index_int8(16, values, al) to best_index_int8(32, values, al) — or, better, pass the codebook size through as a new parameter so IQ4_NL keeps searching 16 entries while IQ5_NL searches 32. Do not hardcode kvalues_iq5nl inside the impl, or IQ4_NL quantization breaks.

Testing: Add comprehensive tests in tests/test-backend-ops.cpp for quantization/dequantization accuracy and add to tests/test-quantize-fns.cpp for bit-exactness verification.

Performance Trade-offs: IQ5_NL at 5.5 bpw provides better quality than IQ4_NL (4.5 bpw) but increases model size by ~22%. Benchmark inference speed across different hardware to validate the trade-off is worthwhile for your use case.