/*
 * gguf_format.h — GGUF v3 Binary Format Writer
 *
 * ╔═══════════════════════════════════════════════════════════════╗
 * ║  HExState GGUF Output Module                                 ║
 * ║  Implements the GGUF v3 binary specification for writing     ║
 * ║  quantized LLM weight files compatible with llama.cpp        ║
 * ╚═══════════════════════════════════════════════════════════════╝
 *
 * File Layout:
 *   1. Header:    magic(4) + version(4) + tensor_count(8) + kv_count(8)
 *   2. Metadata:  Key-Value pairs (variable length)
 *   3. Tensor Info: Per-tensor descriptors (name, dims, type, offset)
 *   4. Padding:   Align to GGUF_DEFAULT_ALIGNMENT bytes
 *   5. Tensor Data: Raw quantized weight data
 *
 * All values are little-endian.
 */

#ifndef GGUF_FORMAT_H
#define GGUF_FORMAT_H

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

/* ═══════════════════════════════════════════════════════════════════════
 * GGUF CONSTANTS
 * ═══════════════════════════════════════════════════════════════════════ */

#define GGUF_MAGIC            0x46554747  /* "GGUF" in little-endian    */
#define GGUF_VERSION          3
#define GGUF_DEFAULT_ALIGNMENT 32

/* ═══════════════════════════════════════════════════════════════════════
 * GGML TENSOR TYPES
 * ═══════════════════════════════════════════════════════════════════════ */

/* Tensor element/quantization type identifiers.
 *
 * gguf_write_tensor_info() casts these to u32 and writes them into the
 * per-tensor descriptor, so the numeric values are part of the file format
 * and must not be renumbered. Note the gap: no enumerator has value 4 or 5.
 */
typedef enum {
    /* Plain floating point */
    GGML_TYPE_F32   = 0,
    GGML_TYPE_F16   = 1,
    /* Block quantized types */
    GGML_TYPE_Q4_0  = 2,
    GGML_TYPE_Q4_1  = 3,
    GGML_TYPE_Q5_0  = 6,
    GGML_TYPE_Q5_1  = 7,
    GGML_TYPE_Q8_0  = 8,
    GGML_TYPE_Q8_1  = 9,
    /* K-quant types */
    GGML_TYPE_Q2_K  = 10,
    GGML_TYPE_Q3_K  = 11,
    GGML_TYPE_Q4_K  = 12,
    GGML_TYPE_Q5_K  = 13,
    GGML_TYPE_Q6_K  = 14,
    GGML_TYPE_Q8_K  = 15,
    /* IQ types */
    GGML_TYPE_IQ2_XXS = 16,
    GGML_TYPE_IQ2_XS  = 17,
    GGML_TYPE_IQ3_XXS = 18,
    GGML_TYPE_IQ1_S   = 19,
    GGML_TYPE_IQ4_NL  = 20,
    GGML_TYPE_IQ3_S   = 21,
    GGML_TYPE_IQ2_S   = 22,
    GGML_TYPE_IQ4_XS  = 23,
    /* Plain integer types */
    GGML_TYPE_I8      = 24,
    GGML_TYPE_I16     = 25,
    GGML_TYPE_I32     = 26,
    GGML_TYPE_I64     = 27,
    GGML_TYPE_F64     = 28,
    GGML_TYPE_IQ1_M   = 29,
    GGML_TYPE_BF16    = 30,
    /* One past the last listed type (evaluates to 31); not a real type. */
    GGML_TYPE_COUNT
} GGMLType;

/* ═══════════════════════════════════════════════════════════════════════
 * GGUF METADATA VALUE TYPES
 * ═══════════════════════════════════════════════════════════════════════ */

/* Type tags for metadata values.
 *
 * Each gguf_write_kv_* helper writes one of these as a u32 immediately
 * after the key string; array writers additionally write the element type
 * tag after GGUF_TYPE_ARRAY. The numeric values are part of the file format.
 */
typedef enum {
    GGUF_TYPE_UINT8   = 0,
    GGUF_TYPE_INT8    = 1,
    GGUF_TYPE_UINT16  = 2,
    GGUF_TYPE_INT16   = 3,
    GGUF_TYPE_UINT32  = 4,
    GGUF_TYPE_INT32   = 5,
    GGUF_TYPE_FLOAT32 = 6,
    GGUF_TYPE_BOOL    = 7,   /* written as a single byte, 0 or 1          */
    GGUF_TYPE_STRING  = 8,   /* u64 length followed by the raw bytes      */
    GGUF_TYPE_ARRAY   = 9,   /* u32 element type, u64 count, then elements */
    GGUF_TYPE_UINT64  = 10,
    GGUF_TYPE_INT64   = 11,
    GGUF_TYPE_FLOAT64 = 12
} GGUFValueType;

/* ═══════════════════════════════════════════════════════════════════════
 * Q8_0 BLOCK STRUCTURE
 *
 * The fundamental quantized unit: 32 weights + 1 fp16 scale.
 * Total: 34 bytes per block = 8.5 bits per weight.
 *
 * Dequantization: w_i = qs[i] * d
 * ═══════════════════════════════════════════════════════════════════════ */

#define QK8_0 32  /* Number of weights stored in one Q8_0 block */

/* One Q8_0 block: an fp16 scale followed by 32 signed 8-bit quants.
 * A weight is reconstructed as qs[i] * d (see gguf_dequantize_q8_0_block). */
typedef struct {
    uint16_t d;           /* fp16 scale (delta), raw IEEE binary16 bits  */
    int8_t   qs[QK8_0];   /* quantized values [-127, 127]                */
} BlockQ8_0;

/* ggml_type_bytes_per_block() reports sizeof(BlockQ8_0) as the block size,
 * so the struct must be exactly 2 + 32 = 34 bytes with no padding.
 * Enforced at compile time where the compiler supports C11. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(BlockQ8_0) == 2 + QK8_0,
               "BlockQ8_0 must be 34 bytes (fp16 scale + 32 int8 quants)");
#endif

/* ═══════════════════════════════════════════════════════════════════════
 * Q4_0 BLOCK STRUCTURE
 *
 * 32 weights per block with 4-bit quantization.
 * Layout: 1 fp16 scale + 16 bytes packed quants (2 weights per byte)
 * Total: 18 bytes per block = 4.5 bits per weight.
 *
 * Dequantization: w_i = (q_i - 8) * d
 *   where q_i in {0..15}, stored as nibbles
 * ═══════════════════════════════════════════════════════════════════════ */

#define QK4_0 32  /* Number of weights stored in one Q4_0 block */

/* One Q4_0 block: an fp16 scale followed by 16 bytes holding 32 nibbles. */
typedef struct {
    uint16_t d;             /* fp16 scale (delta), raw IEEE binary16 bits */
    uint8_t  qs[QK4_0/2];   /* 16 bytes: packed 4-bit quants (2 per byte) */
} BlockQ4_0;

/* The on-disk block is 2 + 16 = 18 bytes for 32 weights; make sure the
 * compiler did not insert padding (checked where C11 is available). */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(BlockQ4_0) == 2 + QK4_0 / 2,
               "BlockQ4_0 must be 18 bytes (fp16 scale + 16 packed bytes)");
#endif

/* ═══════════════════════════════════════════════════════════════════════
 * Q2_K BLOCK STRUCTURE (K-Quant, 2-bit)
 *
 * 256-weight superblock divided into 16 sub-blocks of 16 weights.
 *
 * Layout (must match ggml block_q2_K; fields listed in struct order):
 *   scales[16]: Per-sub-block scale (low 4 bits) + min (high 4 bits)
 *   qs[64]:     Packed 2-bit quants (4 weights per byte)
 *   d:          fp16 super-block scale for scales
 *   dmin:       fp16 super-block scale for mins
 *
 * Dequantization: w_i = d * scale_j * q_i - dmin * min_j
 *   where j = sub-block index, q_i in {0, 1, 2, 3}
 *
 * Effective: 2.625 bits per weight (84 bytes / 256 weights)
 * ═══════════════════════════════════════════════════════════════════════ */

#define QK_K 256   /* K-quant superblock size (weights per superblock) */

/* One Q2_K superblock. Field order is the on-disk order: the packed
 * scales/mins and quants come first, the two fp16 super-scales last. */
typedef struct {
    uint8_t  scales[QK_K/16]; /* 16 bytes: low nibble = scale, high = min */
    uint8_t  qs[QK_K/4];      /* 64 bytes: packed 2-bit quants            */
    uint16_t d;               /* fp16 super-block scale for the scales    */
    uint16_t dmin;            /* fp16 super-block scale for the mins      */
} BlockQ2K;

/* sizeof(BlockQ2K) = 16 + 64 + 2 + 2 = 84 bytes for 256 weights.
 * ggml_type_bytes_per_block() relies on this, so reject any padding at
 * compile time where the compiler supports C11. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
_Static_assert(sizeof(BlockQ2K) == QK_K / 16 + QK_K / 4 + 2 + 2,
               "BlockQ2K must be 84 bytes with no padding");
#endif

/* ═══════════════════════════════════════════════════════════════════════
 * FP16 ←→ FP32 CONVERSION
 *
 * IEEE 754 half-precision (binary16):
 *   1 sign bit, 5 exponent bits, 10 mantissa bits
 * ═══════════════════════════════════════════════════════════════════════ */

/* Convert an fp32 value to raw IEEE 754 binary16 bits.
 *
 * Rounding is round-to-nearest, ties-to-even. Special cases:
 *   - finite values too large for fp16 become ±Inf (never NaN);
 *   - NaN stays NaN (a payload bit is forced if truncation would clear it);
 *   - values below half of the smallest fp16 subnormal become ±0;
 *   - fp32 zeros/subnormals become ±0.
 *
 * @param f  value to convert
 * @return   the 16-bit half-precision encoding
 */
static inline uint16_t gguf_fp32_to_fp16(float f)
{
    /* Union type punning to get at the raw fp32 bits. */
    union { float f; uint32_t u; } fu;
    fu.f = f;
    const uint32_t x = fu.u;

    const uint16_t sign = (uint16_t)((x >> 16) & 0x8000u);
    const uint32_t exp32 = (x >> 23) & 0xFFu;
    uint32_t mantissa = x & 0x7FFFFFu;

    if (exp32 == 0xFFu) {
        /* Source is Inf or NaN. */
        if (mantissa == 0) {
            return (uint16_t)(sign | 0x7C00u);
        }
        uint16_t payload = (uint16_t)(mantissa >> 13);
        if (payload == 0) {
            payload = 0x0200u;  /* keep it a NaN after dropping low bits */
        }
        return (uint16_t)(sign | 0x7C00u | payload);
    }

    /* Re-bias the exponent from fp32 (127) to fp16 (15). */
    const int32_t exponent = (int32_t)exp32 - 127 + 15;

    if (exponent >= 0x1F) {
        /* Finite but out of range: saturate to infinity. */
        return (uint16_t)(sign | 0x7C00u);
    }

    if (exponent <= 0) {
        /* Result is an fp16 subnormal or zero. */
        if (exponent < -10) {
            return sign;  /* below half the smallest subnormal → ±0 */
        }
        mantissa |= 0x800000u;                           /* implicit 1 */
        const uint32_t shift = (uint32_t)(14 - exponent); /* 14..24 */
        const uint32_t halfway = 1u << (shift - 1);
        const uint32_t rem = mantissa & ((1u << shift) - 1u);
        uint32_t half = mantissa >> shift;
        if (rem > halfway || (rem == halfway && (half & 1u))) {
            half++;  /* may carry into the smallest normal, which is correct */
        }
        return (uint16_t)(sign | half);
    }

    /* Normalized result. A rounding carry may bump the exponent, and from
     * the largest finite value it correctly yields Inf (0x7C00). */
    uint32_t half = ((uint32_t)exponent << 10) | (mantissa >> 13);
    const uint32_t rem = mantissa & 0x1FFFu;
    if (rem > 0x1000u || (rem == 0x1000u && (half & 1u))) {
        half++;
    }
    return (uint16_t)(sign | half);
}

/* Expand raw IEEE 754 binary16 bits into an fp32 value.
 *
 * Zeros keep their sign, subnormals are renormalized, and Inf/NaN map to
 * the fp32 Inf/NaN encodings with the payload shifted into place.
 *
 * @param h  16-bit half-precision encoding
 * @return   the same value as a float
 */
static inline float gguf_fp16_to_fp32(uint16_t h)
{
    const uint32_t sign_bit = (uint32_t)(h & 0x8000) << 16;
    int32_t exp16 = (h >> 10) & 0x1F;
    uint32_t frac = h & 0x03FF;
    union { uint32_t u; float f; } out;

    if (exp16 == 0x1F) {
        /* Inf/NaN: all-ones fp32 exponent, payload moved up by 13 bits. */
        out.u = sign_bit | 0x7F800000 | (frac << 13);
    } else if (exp16 == 0 && frac == 0) {
        /* Signed zero. */
        out.u = sign_bit;
    } else {
        if (exp16 == 0) {
            /* Subnormal: shift until the implicit bit appears, lowering
             * the exponent once per shift, then drop the implicit bit. */
            for (exp16 = 1; (frac & 0x0400) == 0; exp16--) {
                frac <<= 1;
            }
            frac &= 0x03FF;
        }
        /* Re-bias the exponent from 15 to 127 and widen the fraction. */
        out.u = sign_bit | ((uint32_t)(exp16 + 127 - 15) << 23) | (frac << 13);
    }

    return out.f;
}

/* BFloat16 → Float32.
 * The 16 input bits become the upper half of the fp32 bit pattern and the
 * lower 16 bits are zero, so the conversion is a plain left shift. */
static inline float gguf_bf16_to_fp32(uint16_t bf)
{
    const uint32_t widened = (uint32_t)bf << 16;
    float value;
    memcpy(&value, &widened, sizeof value);
    return value;
}

/* ═══════════════════════════════════════════════════════════════════════
 * GGUF STRING — Length-prefixed UTF-8 (no null terminator in file)
 * ═══════════════════════════════════════════════════════════════════════ */

/* Write a GGUF string: a u64 byte length followed by the bytes themselves,
 * without a terminating NUL.
 *
 * A NULL pointer is written as the empty string (length 0, no payload),
 * matching how gguf_write_kv_string_array() treats NULL elements.
 * The length is written in host byte order, so a little-endian host is
 * assumed for a spec-conformant file. Write errors are not reported.
 *
 * @param fp  destination stream, opened for binary writing
 * @param s   NUL-terminated string, or NULL for an empty string
 */
static inline void gguf_write_string(FILE *fp, const char *s)
{
    const uint64_t len = s ? (uint64_t)strlen(s) : 0;
    fwrite(&len, sizeof len, 1, fp);
    if (len > 0) {
        fwrite(s, 1, (size_t)len, fp);
    }
}

/* ═══════════════════════════════════════════════════════════════════════
 * GGUF METADATA KEY-VALUE WRITERS
 *
 * Each KV entry: key_string + value_type(u32) + value_data
 * ═══════════════════════════════════════════════════════════════════════ */

/* Write one STRING metadata entry: key, u32 type tag, then the value as a
 * length-prefixed string. */
static inline void gguf_write_kv_string(FILE *fp, const char *key, const char *val)
{
    const uint32_t type_tag = GGUF_TYPE_STRING;

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    gguf_write_string(fp, val);
}

/* Write one UINT32 metadata entry: key, u32 type tag, then the 4-byte value. */
static inline void gguf_write_kv_uint32(FILE *fp, const char *key, uint32_t val)
{
    const uint32_t type_tag = GGUF_TYPE_UINT32;

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    fwrite(&val, sizeof val, 1, fp);
}

/* Write one INT32 metadata entry: key, u32 type tag, then the 4-byte value. */
static inline void gguf_write_kv_int32(FILE *fp, const char *key, int32_t val)
{
    const uint32_t type_tag = GGUF_TYPE_INT32;

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    fwrite(&val, sizeof val, 1, fp);
}

/* Write one UINT64 metadata entry: key, u32 type tag, then the 8-byte value. */
static inline void gguf_write_kv_uint64(FILE *fp, const char *key, uint64_t val)
{
    const uint32_t type_tag = GGUF_TYPE_UINT64;

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    fwrite(&val, sizeof val, 1, fp);
}

/* Write one FLOAT32 metadata entry: key, u32 type tag, then the raw bytes
 * of the float. */
static inline void gguf_write_kv_float32(FILE *fp, const char *key, float val)
{
    const uint32_t type_tag = GGUF_TYPE_FLOAT32;

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    fwrite(&val, sizeof val, 1, fp);
}

/* Write one BOOL metadata entry: key, u32 type tag, then a single byte
 * that is 1 when val is non-zero and 0 otherwise. */
static inline void gguf_write_kv_bool(FILE *fp, const char *key, int val)
{
    const uint32_t type_tag = GGUF_TYPE_BOOL;
    const uint8_t flag = (uint8_t)(val != 0);

    gguf_write_string(fp, key);
    fwrite(&type_tag, sizeof type_tag, 1, fp);
    fwrite(&flag, sizeof flag, 1, fp);
}

/* Write an ARRAY-of-FLOAT32 metadata entry.
 * On-disk order: key string, u32 ARRAY tag, u32 element tag, u64 element
 * count, then `count` raw floats taken from `vals`. */
static inline void gguf_write_kv_float32_array(FILE *fp, const char *key,
                                                 const float *vals, uint64_t count)
{
    /* The two type tags are adjacent in the file, so emit them together. */
    const uint32_t tags[2] = { GGUF_TYPE_ARRAY, GGUF_TYPE_FLOAT32 };

    gguf_write_string(fp, key);
    fwrite(tags, sizeof tags[0], 2, fp);
    fwrite(&count, sizeof count, 1, fp);
    fwrite(vals, sizeof *vals, count, fp);
}

/* Write an ARRAY-of-INT32 metadata entry.
 * On-disk order: key string, u32 ARRAY tag, u32 element tag, u64 element
 * count, then `count` raw int32 values taken from `vals`. */
static inline void gguf_write_kv_int32_array(FILE *fp, const char *key,
                                               const int32_t *vals, uint64_t count)
{
    /* The two type tags are adjacent in the file, so emit them together. */
    const uint32_t tags[2] = { GGUF_TYPE_ARRAY, GGUF_TYPE_INT32 };

    gguf_write_string(fp, key);
    fwrite(tags, sizeof tags[0], 2, fp);
    fwrite(&count, sizeof count, 1, fp);
    fwrite(vals, sizeof *vals, count, fp);
}

/* Write an ARRAY-of-STRING metadata entry.
 * On-disk order: key string, u32 ARRAY tag, u32 element tag, u64 element
 * count, then each element as a length-prefixed string. NULL elements are
 * written as empty strings. */
static inline void gguf_write_kv_string_array(FILE *fp, const char *key,
                                                 const char **vals, uint64_t count)
{
    const uint32_t tags[2] = { GGUF_TYPE_ARRAY, GGUF_TYPE_STRING };
    const char **cursor = vals;
    uint64_t remaining = count;

    gguf_write_string(fp, key);
    fwrite(tags, sizeof tags[0], 2, fp);
    fwrite(&count, sizeof count, 1, fp);

    while (remaining > 0) {
        const char *item = *cursor;
        gguf_write_string(fp, item ? item : "");
        cursor++;
        remaining--;
    }
}
/* ═══════════════════════════════════════════════════════════════════════
 * GGUF TENSOR INFO WRITER
 *
 * Per-tensor descriptor in the file:
 *   name_string + n_dims(u32) + dims[n_dims](u64 each) +
 *   type(u32) + offset(u64)
 *
 * Offset is relative to the start of the tensor data section.
 * ═══════════════════════════════════════════════════════════════════════ */

/* Write one tensor descriptor.
 *
 * @param fp      destination stream
 * @param name    tensor name, written as a length-prefixed string
 * @param n_dims  number of entries read from `dims`
 * @param dims    dimension sizes, each written as a u64 in array order
 * @param type    element type, written as a u32
 * @param offset  written verbatim as a u64 after the type
 */
static inline void gguf_write_tensor_info(FILE *fp, const char *name,
                                            uint32_t n_dims, const uint64_t *dims,
                                            GGMLType type, uint64_t offset)
{
    const uint32_t type_id = (uint32_t)type;
    const uint64_t *extent = dims;

    gguf_write_string(fp, name);
    fwrite(&n_dims, sizeof n_dims, 1, fp);

    for (uint32_t left = n_dims; left > 0; left--) {
        fwrite(extent, sizeof *extent, 1, fp);
        extent++;
    }

    fwrite(&type_id, sizeof type_id, 1, fp);
    fwrite(&offset, sizeof offset, 1, fp);
}

/* ═══════════════════════════════════════════════════════════════════════
 * GGUF HEADER WRITER
 * ═══════════════════════════════════════════════════════════════════════ */

/* Write the fixed 24-byte file header: u32 magic, u32 version, u64 tensor
 * count, u64 metadata key-value count, in that order.
 *
 * @param fp                 destination stream (expected to be at offset 0)
 * @param tensor_count       number of tensor descriptors that will follow
 * @param metadata_kv_count  number of metadata entries that will follow
 */
static inline void gguf_write_header(FILE *fp, uint64_t tensor_count,
                                       uint64_t metadata_kv_count)
{
    /* Magic and version are adjacent u32 fields; emit them as a pair. */
    const uint32_t preamble[2] = { GGUF_MAGIC, GGUF_VERSION };

    fwrite(preamble, sizeof preamble[0], 2, fp);
    fwrite(&tensor_count, sizeof tensor_count, 1, fp);
    fwrite(&metadata_kv_count, sizeof metadata_kv_count, 1, fp);
}

/* ═══════════════════════════════════════════════════════════════════════
 * ALIGNMENT PADDING
 * ═══════════════════════════════════════════════════════════════════════ */

/* Append zero bytes until the stream position is a multiple of `alignment`.
 *
 * Nothing is written when the position is already aligned, when
 * `alignment` is 0 (which would otherwise divide by zero), or when the
 * current position cannot be determined (ftell() failure). The position is
 * obtained with ftell(), so it is limited to what a `long` can represent.
 *
 * @param fp         destination stream
 * @param alignment  required alignment in bytes; 0 means "no padding"
 */
static inline void gguf_write_padding(FILE *fp, uint32_t alignment)
{
    static const uint8_t zeros[64] = {0};

    if (alignment == 0) {
        return;
    }

    const long pos = ftell(fp);
    if (pos < 0) {
        return;  /* position unknown: do not guess a padding amount */
    }

    const uint32_t rem = (uint32_t)((unsigned long)pos % alignment);
    uint32_t pad = (rem != 0) ? alignment - rem : 0;

    /* Emit the padding in chunks no larger than the zero buffer. */
    while (pad > 0) {
        const size_t chunk = (pad > sizeof zeros) ? sizeof zeros : (size_t)pad;
        if (fwrite(zeros, 1, chunk, fp) != chunk) {
            return;  /* write error: stop instead of looping */
        }
        pad -= (uint32_t)chunk;
    }
}

/* ═══════════════════════════════════════════════════════════════════════
 * Q8_0 QUANTIZATION — Reference Implementation
 *
 * For each block of 32 floats:
 *   1. Find amax = max(|x_i|)
 *   2. Scale d = amax / 127.0
 *   3. Quantize: qs[i] = round(x_i / d)
 *
 * This is the STANDARD brute-force approach.
 * The HExState MCMC optimizer replaces step 2 with intelligent
 * search for the optimal d that minimizes weighted error.
 * ═══════════════════════════════════════════════════════════════════════ */

/* Quantize floats to Q8_0 blocks using the max-abs scale.
 *
 * Only whole blocks are processed: n_elements / QK8_0 blocks are written
 * and any trailing elements are ignored.
 *
 * @param x           input weights
 * @param y           output blocks, at least n_elements / QK8_0 of them
 * @param n_elements  number of floats available in x
 */
static inline void gguf_quantize_q8_0_reference(const float *x,
                                                   BlockQ8_0 *y,
                                                   int64_t n_elements)
{
    const int64_t block_count = n_elements / QK8_0;

    for (int64_t b = 0; b < block_count; b++) {
        const float *src = x + b * QK8_0;
        BlockQ8_0 *dst = &y[b];

        /* Largest magnitude in the block decides the scale. */
        float peak = 0.0f;
        for (int k = 0; k < QK8_0; k++) {
            const float mag = fabsf(src[k]);
            peak = (mag > peak) ? mag : peak;
        }

        /* An all-zero block gets scale 0 and inverse 0, so every quant is 0. */
        const float step = peak / 127.0f;
        const float inv_step = (step != 0.0f) ? 1.0f / step : 0.0f;

        dst->d = gguf_fp32_to_fp16(step);

        for (int k = 0; k < QK8_0; k++) {
            const float scaled = src[k] * inv_step;
            dst->qs[k] = (int8_t)roundf(scaled);
        }
    }
}

/* Expand one Q8_0 block into QK8_0 floats: out[j] = qs[j] * d, where d is
 * the block's fp16 scale converted to fp32. Used for error measurement. */
static inline void gguf_dequantize_q8_0_block(const BlockQ8_0 *block,
                                                float *out)
{
    const float scale = gguf_fp16_to_fp32(block->d);
    const int8_t *quant = block->qs;
    float *dst = out;

    for (int left = QK8_0; left > 0; left--) {
        *dst = (float)*quant * scale;
        dst++;
        quant++;
    }
}

/* Sum of squared differences between QK8_0 original floats and the values
 * reconstructed from a Q8_0 block (squared L2 error, no square root). */
static inline float gguf_q8_0_block_error(const float *original,
                                            const BlockQ8_0 *block)
{
    float recon[QK8_0];
    float sum_sq = 0.0f;

    gguf_dequantize_q8_0_block(block, recon);

    for (int k = 0; k < QK8_0; k++) {
        const float delta = original[k] - recon[k];
        sum_sq += delta * delta;
    }

    return sum_sq;
}

/* ═══════════════════════════════════════════════════════════════════════
 * Q2_K QUANTIZATION — Reference Implementation
 *
 * For each superblock of 256 floats:
 *   1. Divide into 16 sub-blocks of 16 weights
 *   2. For each sub-block: find optimal (scale, min) → w ≈ min + scale * q
 *   3. Quantize sub-block scales/mins to 4 bits each
 *   4. Re-quantize weights to 2 bits using final scales
 *   5. Pack 4 quants per byte
 *
 * The HExState MCMC optimizer replaces step 2's brute-force grid search
 * with intelligent Boltzmann-guided exploration.
 * ═══════════════════════════════════════════════════════════════════════ */

/* Helper: nearest integer via the float "magic number" trick
 * (ggml-compatible).
 * Adding 1.5 * 2^23 pushes the integer part of fval into the low mantissa
 * bits; masking the 23 mantissa bits and subtracting 2^22 recovers it.
 * Only meaningful while |fval| stays below 2^22. The rounding of ties is
 * whatever the float addition does (to even under the default FP mode). */
static inline int gguf_nearest_int(float fval)
{
    union { float f; int i; } pun;

    pun.f = fval + 12582912.f;  /* 2^23 + 2^22 */
    return (pun.i & 0x007fffff) - 0x00400000;
}

/* Quantize a sub-block of n floats with a scale + min scheme so that
 * x[i] ≈ scale * L[i] - *the_min, with L[i] in {0..nmax}.
 *
 * The offset is clamped to be non-positive BEFORE the constant-block check,
 * so *the_min is always >= 0 (it is stored unsigned in the Q2_K format).
 * An all-positive constant block is therefore quantized normally
 * (scale = value / nmax, min = 0) instead of collapsing to scale 0.
 *
 * @param n        number of values in x and L (n <= 0 yields scale 0, min 0)
 * @param nmax     largest quant level (3 for 2-bit quants)
 * @param x        input values
 * @param L        output quant levels; need not be initialized on entry
 * @param the_min  output: negated offset, i.e. the amount to subtract
 * @return         the scale; 0 when every value equals the clamped minimum
 */
static inline float gguf_make_qkx_quants(int n, int nmax,
                                           const float *x, uint8_t *L,
                                           float *the_min)
{
    if (n <= 0) {
        *the_min = 0.0f;
        return 0.0f;
    }

    float min_val = x[0];
    float max_val = x[0];
    for (int i = 1; i < n; i++) {
        if (x[i] < min_val) min_val = x[i];
        if (x[i] > max_val) max_val = x[i];
    }

    /* The format can only subtract a non-negative min, so the offset may
     * not be positive. Clamp first, then test for a degenerate range. */
    if (min_val > 0) min_val = 0;

    if (max_val == min_val) {
        /* All values equal the offset (all zero, or one negative constant). */
        for (int i = 0; i < n; i++) L[i] = 0;
        *the_min = -min_val;
        return 0.0f;
    }

    float iscale = nmax / (max_val - min_val);
    float scale = 1.0f / iscale;

    /* Iterative refinement (follows ggml's make_qkx1_quants): re-assign
     * levels, refit the scale by least squares, then nudge the offset. */
    for (int itry = 0; itry < 5; itry++) {
        float sumlx = 0;
        int suml2 = 0;
        int did_change = 0;
        for (int i = 0; i < n; i++) {
            int l = gguf_nearest_int(iscale * (x[i] - min_val));
            if (l < 0) l = 0;
            if (l > nmax) l = nmax;
            /* On the first pass L holds no meaningful data yet: always
             * store, and never read the caller's uninitialized buffer. */
            if (itry == 0 || l != (int)L[i]) {
                L[i] = (uint8_t)l;
                did_change = 1;
            }
            sumlx += (x[i] - min_val) * l;
            suml2 += l * l;
        }
        if (suml2 > 0) scale = sumlx / suml2;
        float sum = 0;
        for (int i = 0; i < n; i++) {
            sum += x[i] - scale * L[i];
        }
        /* Blend the old offset with the mean residual, keep it <= 0. */
        min_val = 0.7f * min_val + 0.3f * sum / n;
        if (min_val > 0) min_val = 0;
        if (scale > 1e-15f) iscale = 1.0f / scale;
        if (!did_change) break;
    }

    *the_min = -min_val;
    return scale;
}

/* Quantize floats to Q2_K superblocks.
 *
 * Only whole superblocks are processed: n_elements / QK_K blocks are
 * written and any trailing elements are ignored.
 *
 * @param x           input weights
 * @param y           output superblocks, at least n_elements / QK_K of them
 * @param n_elements  number of floats available in x
 */
static inline void gguf_quantize_q2_k_reference(const float *x,
                                                  BlockQ2K *y,
                                                  int64_t n_elements)
{
    int64_t n_blocks = n_elements / QK_K;
    /* Largest value a 4-bit sub-block scale or min can take. */
    const float q4scale = 15.0f;

    for (int64_t i = 0; i < n_blocks; i++) {
        const float *block_x = x + i * QK_K;
        /* Per-weight 2-bit levels: filled in step 1, recomputed in step 4. */
        uint8_t L[QK_K];
        /* Per-sub-block float min and scale from step 1. */
        float mins[QK_K / 16];
        float scales[QK_K / 16];

        /* Largest sub-block scale / min; they define the super-block
         * scales d and dmin. Both start at 0, so only positive values
         * can raise them. */
        float max_scale = 0.0f;
        float max_min = 0.0f;

        /* Step 1: Find scale and min for each of 16 sub-blocks */
        for (int j = 0; j < QK_K / 16; j++) {
            scales[j] = gguf_make_qkx_quants(16, 3,
                                               block_x + 16 * j,
                                               L + 16 * j, &mins[j]);
            if (scales[j] > max_scale) max_scale = scales[j];
            if (mins[j] > max_min) max_min = mins[j];
        }

        /* Step 2: Quantize the 16 sub-block scales to 4 bits */
        if (max_scale > 0) {
            float iscale = q4scale / max_scale;
            for (int j = 0; j < QK_K / 16; j++) {
                int l = gguf_nearest_int(iscale * scales[j]);
                if (l < 0) l = 0;
                if (l > 15) l = 15;
                /* Plain assignment: this also clears the high nibble that
                 * step 3 ORs the min into. */
                y[i].scales[j] = (uint8_t)l;
            }
            y[i].d = gguf_fp32_to_fp16(max_scale / q4scale);
        } else {
            for (int j = 0; j < QK_K / 16; j++) y[i].scales[j] = 0;
            y[i].d = gguf_fp32_to_fp16(0.0f);
        }

        /* Step 3: Quantize the 16 sub-block mins to 4 bits (packed in high nibble) */
        if (max_min > 0) {
            float iscale = q4scale / max_min;
            for (int j = 0; j < QK_K / 16; j++) {
                int l = gguf_nearest_int(iscale * mins[j]);
                if (l < 0) l = 0;
                if (l > 15) l = 15;
                y[i].scales[j] |= ((uint8_t)l << 4);
            }
            y[i].dmin = gguf_fp32_to_fp16(max_min / q4scale);
        } else {
            /* High nibbles stay 0 from step 2. */
            y[i].dmin = gguf_fp32_to_fp16(0.0f);
        }

        /* Step 4: Re-quantize weights to 2 bits using final rounded scales */
        for (int j = 0; j < QK_K / 16; j++) {
            /* Effective scale as a decoder will see it: fp16-rounded d times
             * the 4-bit sub-block scale. */
            float d = gguf_fp16_to_fp32(y[i].d) * (y[i].scales[j] & 0xF);
            if (d < 1e-15f) {
                /* Scale rounded to zero: the levels are irrelevant, use 0. */
                for (int ii = 0; ii < 16; ii++) L[16 * j + ii] = 0;
                continue;
            }
            /* Effective min as a decoder will see it. */
            float dm = gguf_fp16_to_fp32(y[i].dmin) * (y[i].scales[j] >> 4);
            for (int ii = 0; ii < 16; ii++) {
                int l = gguf_nearest_int((block_x[16 * j + ii] + dm) / d);
                if (l < 0) l = 0;
                if (l > 3) l = 3;
                L[16 * j + ii] = (uint8_t)l;
            }
        }

        /* Step 5: Pack 4 quants per byte (2 bits each)
         * Layout: 2 groups of 128, each packed as 32 bytes holding 4×32 quants.
         * Within a group, byte l holds weights l, l+32, l+64 and l+96 in
         * bit pairs 0-1, 2-3, 4-5 and 6-7 respectively. */
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; l++) {
                y[i].qs[j / 4 + l] = L[j + l]
                                    | (L[j + l + 32] << 2)
                                    | (L[j + l + 64] << 4)
                                    | (L[j + l + 96] << 6);
            }
        }
    }
}

/* Expand one Q2_K superblock into QK_K floats (for error measurement).
 *
 * The superblock is handled as two halves of 128 weights, each backed by
 * 32 packed bytes. Within a half, each 2-bit plane (shift 0, 2, 4, 6)
 * yields 32 weights: 16 from bytes 0..15, then 16 from bytes 16..31.
 * Every run of 16 weights consumes the next entry of block->scales, whose
 * low nibble scales d and whose high nibble scales dmin.
 */
static inline void gguf_dequantize_q2_k_block(const BlockQ2K *block,
                                                float *out)
{
    const float d = gguf_fp16_to_fp32(block->d);
    const float dmin = gguf_fp16_to_fp32(block->dmin);

    float *dst = out;
    int sub = 0;  /* index of the next sub-block scale byte */

    for (int half = 0; half < QK_K / 128; half++) {
        const uint8_t *packed = block->qs + 32 * half;

        for (int shift = 0; shift < 8; shift += 2) {
            for (int run = 0; run < 2; run++) {
                const uint8_t sc = block->scales[sub++];
                const float dl = d * (sc & 0xF);
                const float ml = dmin * (sc >> 4);
                const uint8_t *src = packed + 16 * run;

                for (int l = 0; l < 16; l++) {
                    *dst++ = dl * ((float)((src[l] >> shift) & 3)) - ml;
                }
            }
        }
    }
}

/* Sum of squared differences between QK_K original floats and the values
 * reconstructed from a Q2_K superblock (squared L2 error, no square root). */
static inline float gguf_q2_k_block_error(const float *original,
                                            const BlockQ2K *block)
{
    float recon[QK_K];
    float sum_sq = 0.0f;

    gguf_dequantize_q2_k_block(block, recon);

    for (int k = 0; k < QK_K; k++) {
        const float delta = original[k] - recon[k];
        sum_sq += delta * delta;
    }

    return sum_sq;
}

/* ═══════════════════════════════════════════════════════════════════════
 * GGML TYPE METADATA — Size calculations
 * ═══════════════════════════════════════════════════════════════════════ */

/* Number of weights stored in one block of the given type.
 * Non-quantized types hold one element per "block". Unknown types also
 * report 1, as before. This table and ggml_type_bytes_per_block() must
 * cover the same set of types, otherwise ggml_type_size() is wrong. */
static inline int64_t ggml_type_block_size(GGMLType type)
{
    switch (type) {
        /* 32-weight block formats */
        case GGML_TYPE_Q8_0:
            return QK8_0;
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
        case GGML_TYPE_Q5_0:
        case GGML_TYPE_Q5_1:
        case GGML_TYPE_Q8_1:
        case GGML_TYPE_IQ4_NL:
            return 32;

        /* 256-weight superblock formats */
        case GGML_TYPE_Q2_K:
        case GGML_TYPE_Q3_K:
        case GGML_TYPE_Q4_K:
        case GGML_TYPE_Q5_K:
        case GGML_TYPE_Q6_K:
        case GGML_TYPE_Q8_K:
        case GGML_TYPE_IQ2_XXS:
        case GGML_TYPE_IQ2_XS:
        case GGML_TYPE_IQ3_XXS:
        case GGML_TYPE_IQ1_S:
        case GGML_TYPE_IQ3_S:
        case GGML_TYPE_IQ2_S:
        case GGML_TYPE_IQ4_XS:
        case GGML_TYPE_IQ1_M:
            return QK_K;

        /* F32, F16, BF16, F64 and the integer types: one element each */
        default:
            return 1;
    }
}

/* Bytes occupied by one block of the given type (sizes follow the ggml
 * block struct definitions). Unknown types report 4, as before. */
static inline int64_t ggml_type_bytes_per_block(GGMLType type)
{
    switch (type) {
        /* Scalar types */
        case GGML_TYPE_F32:     return 4;
        case GGML_TYPE_F16:     return 2;
        case GGML_TYPE_BF16:    return 2;
        case GGML_TYPE_F64:     return 8;
        case GGML_TYPE_I8:      return 1;
        case GGML_TYPE_I16:     return 2;
        case GGML_TYPE_I32:     return 4;
        case GGML_TYPE_I64:     return 8;

        /* 32-weight block formats */
        case GGML_TYPE_Q4_0:    return 18;   /* 2 + 16 */
        case GGML_TYPE_Q4_1:    return 20;   /* 2 + 2 + 16 */
        case GGML_TYPE_Q5_0:    return 22;   /* 2 + 4 + 16 */
        case GGML_TYPE_Q5_1:    return 24;   /* 2 + 2 + 4 + 16 */
        case GGML_TYPE_Q8_0:    return sizeof(BlockQ8_0);  /* 34 */
        case GGML_TYPE_Q8_1:    return 36;   /* 2 + 2 + 32 */
        case GGML_TYPE_IQ4_NL:  return 18;   /* 2 + 16 */

        /* 256-weight superblock formats */
        case GGML_TYPE_Q2_K:    return sizeof(BlockQ2K);   /* 84 */
        case GGML_TYPE_Q3_K:    return 110;  /* 32 + 64 + 12 + 2 */
        case GGML_TYPE_Q4_K:    return 144;  /* 2 + 2 + 12 + 128 */
        case GGML_TYPE_Q5_K:    return 176;  /* 2 + 2 + 12 + 32 + 128 */
        case GGML_TYPE_Q6_K:    return 210;  /* 128 + 64 + 16 + 2 */
        case GGML_TYPE_Q8_K:    return 292;  /* 4 + 256 + 32 */
        case GGML_TYPE_IQ2_XXS: return 66;   /* 2 + 64 */
        case GGML_TYPE_IQ2_XS:  return 74;   /* 2 + 64 + 8 */
        case GGML_TYPE_IQ2_S:   return 82;   /* 2 + 64 + 8 + 8 */
        case GGML_TYPE_IQ3_XXS: return 98;   /* 2 + 96 */
        case GGML_TYPE_IQ3_S:   return 110;  /* 2 + 64 + 8 + 32 + 4 */
        case GGML_TYPE_IQ1_S:   return 50;   /* 2 + 32 + 16 */
        case GGML_TYPE_IQ1_M:   return 56;   /* 32 + 16 + 8 */
        case GGML_TYPE_IQ4_XS:  return 136;  /* 2 + 2 + 4 + 128 */

        default:
            return 4;
    }
}

/* Total bytes needed to store n_elements of the given type.
 * The element count is rounded up to a whole number of blocks, so a
 * partial final block is charged as a full one. */
static inline int64_t ggml_type_size(GGMLType type, int64_t n_elements)
{
    const int64_t elems_per_block = ggml_type_block_size(type);
    const int64_t block_bytes = ggml_type_bytes_per_block(type);

    /* Ceiling division: number of blocks that cover n_elements. */
    const int64_t block_count = (n_elements + elems_per_block - 1) / elems_per_block;

    return block_count * block_bytes;
}

#endif /* GGUF_FORMAT_H */