CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 15 days ago

Commit

e0ba36a

verified ·

1 Parent(s): 73e9225

Revert to Alpha 0.1

Browse files

Files changed (1) hide show

hexstate_quantize.c +41 -344

hexstate_quantize.c CHANGED Viewed

@@ -2732,209 +2732,70 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     }
     /* ══════════════════════════════════════════════════════════════════
-     * PHASE 3.9 — HOLOGRAPHIC BOUNDARY RECONSTRUCTION
      *
-     * Proper implementation of:
      *
-     *   W = Σᵢ ( Proj_n̂(Grad(V)) · Quantize(I_boundary) ) ⊗ Trans(Δτ)
      *
-     * W is NOT a correction to an existing reconstruction — it IS the
-     * reconstruction, expressed as a sum over block-boundary contributions.
-     * Each block-boundary i produces a scalar signal that is then lifted
-     * via ⊗ Trans(Δτ) into a vector that decays into the interior of the
-     * blocks on either side of that boundary.
      *
-     * ── Term-by-term derivation ─────────────────────────────────────
      *
-     *  Proj_n̂(Grad(V)) at boundary i
-     *      The gradient of the original weight tensor V projected onto
-     *      n̂, the unit normal to the block boundary.  In the 1-D block-
-     *      sequence space n̂ points in the inter-block direction, so this
-     *      equals the cross-boundary finite difference:
      *
-     *        g_i = w[i·QK_K] − w[i·QK_K − 1]
      *
-     *      (first weight of block i minus last weight of block i-1)
      *
-     *  Quantize(I_boundary) at boundary i
-     *      The boundary information I_boundary = the weight value at the
-     *      boundary element, expressed through the quantizer.  Computed
-     *      from Phase 1 seeds for the boundary sub-block:
      *
-     *        Q_i = d_sub · round((w_boundary + m_sub) / d_sub) − m_sub
-     *
-     *      This is not a step size or ratio — it is the actual dequantized
-     *      value of the boundary weight.
-     *
-     *  Signal s_i = Proj_n̂(Grad(V)) · Quantize(I_boundary)
-     *      Scalar product at boundary i.  Captures the signed energy of
-     *      the weight function at that boundary in the quantized domain.
-     *      Units: weight².  Normalised by d_sub² to become dimensionless.
-     *
-     *  ⊗ Trans(Δτ)
-     *      The tensor product lifts the scalar s_i into a vector over
-     *      the 256-element interior of the adjacent blocks.  Trans(Δτ)
-     *      is parameterised by Δτ = d_{i-1}/d_i (the scale ratio between
-     *      adjacent blocks), which re-projects s_i from block i-1's
-     *      quantization space into block i's:
-     *
-     *        right-propagation into block i at position k:
-     *          s_i · (d_{i-1}/d_i) · exp(−k / τ)
-     *
-     *        left-propagation into block i-1 at position k:
-     *          s_i · (d_i/d_{i-1}) · exp(−(QK_K−1−k) / τ)
-     *
-     *  The full reconstruction for block b at position k:
-     *
-     *    W[b][k] = x[b][k]           (original weights, not replaced)
-     *            − dc_bias[b]        (zeroth-moment error correction, unchanged)
-     *            + s_left[b]  · (d_{b-1}/d_b) · exp(−k/τ)
-     *            + s_right[b] · (d_{b+1}/d_b) · exp(−(QK_K−1−k)/τ)
-     *
-     *  where s_left[b]  = signal from boundary b   (left  edge of block b)
-     *        s_right[b] = signal from boundary b+1 (right edge of block b)
-     *
-     *  The DC bias is subtracted (it is an error correction, not a signal).
-     *  The boundary signals are added (they encode the holographic surface).
-     *
-     * ── Implementation ──────────────────────────────────────────────
-     *
-     *  Pre-pass (sequential) computes per block:
-     *    block_dc_bias[b]  — scalar DC offset (existing, unchanged)
-     *    block_s_left[b]   — left  boundary signal (normalised, pre-scaled)
-     *    block_s_right[b]  — right boundary signal (normalised, pre-scaled)
-     *
-     *  Phase 4 (parallel) applies:
-     *    adj_x[k] = x[k] − dc_bias + s_left·fwd_decay[k] + s_right·rev_decay[k]
-     *
-     *  Two precomputed decay tables (initialised once, thread-safe):
-     *    boundary_decay[k]     = exp(−k         / τ)  forward  (left → interior)
-     *    boundary_decay_rev[k] = exp(−(255−k)   / τ)  reversed (right → interior)
      * ══════════════════════════════════════════════════════════════════ */
-    #define DC_DECAY   0.85f  /* DC residual leak factor                       */
-    #define HOLO_TAU   32.0f  /* Boundary signal decay length (elements)       */
-    #define HOLO_ALPHA 0.20f  /* Boundary signal weight (fraction of one step) */
-    /* Precompute forward and reverse decay tables — read-only in Phase 4. */
-    static float boundary_decay    [QK_K];
-    static float boundary_decay_rev[QK_K];
-    {
-        static int _decay_init = 0;
-        if (!_decay_init) {
-            int _dk;
-            for (_dk = 0; _dk < QK_K; _dk++) {
-                boundary_decay    [_dk]         = expf(-(float)_dk         / HOLO_TAU);
-                boundary_decay_rev[_dk]         = expf(-(float)(QK_K-1-_dk)/ HOLO_TAU);
-            }
-            _decay_init = 1;
-        }
-    }
-    float *block_dc_bias  = (float *)calloc(n_blocks, sizeof(float));
-    float *block_s_left   = (float *)calloc(n_blocks, sizeof(float));
-    float *block_s_right  = (float *)calloc(n_blocks, sizeof(float));
-    if (block_dc_bias && block_s_left && block_s_right) {
         float rolling_dc = 0.0f;
         for (int64_t blk = 0; blk < n_blocks; blk++) {
-            const float *bx   = weights + blk * QK_K;
             int          cidx = best_candidate[blk];
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
-            /* ── DC bias (zeroth moment, existing) ── */
-            float dc_bias      = (DC_DECAY * rolling_dc) / (float)QK_K;
-            block_dc_bias[blk] = dc_bias;
-            /* ── Left boundary signal: boundary between block blk-1 and blk ──
-             *
-             * Proj_n̂(Grad(V)): cross-boundary finite difference g_left
-             * Quantize(I_boundary): dequantized value of bx[0] using Phase 1
-             *   seeds for the first sub-block (j=0) of block blk.
-             * s_left = g_left × Q(bx[0]) / d_sub² (dimensionless)
-             * Pre-scaled by HOLO_ALPHA × d_sub × (d_{blk-1}/d_{blk}) */
-            {
-                float g_left = (blk > 0)
-                    ? bx[0] - weights[(blk - 1) * QK_K + QK_K - 1]
-                    : 0.0f;
-                /* Quantize(I_boundary) for left edge: sub-block j=0 */
-                float d_sub_l = dm0 * (float)seeds[blk].Ls[0];
-                float m_sub_l = mm0 * (float)seeds[blk].Lm[0];
-                float q_val_l = 0.0f;
-                if (d_sub_l > 1e-15f) {
-                    int qi = gguf_nearest_int((bx[0] + m_sub_l) / d_sub_l);
-                    if (qi < 0) qi = 0; if (qi > 3) qi = 3;
-                    q_val_l = d_sub_l * (float)qi - m_sub_l;
-                }
-                /* Scale ratio Trans(Δτ): d_{blk-1} / d_{blk} */
-                float d_prev = (blk > 0 && seeds[blk-1].dm > 1e-15f)
-                               ? seeds[blk-1].dm : dm0;
-                float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
-                float scale_ratio_l = d_prev / d_curr;
-                if (scale_ratio_l < 0.1f) scale_ratio_l = 0.1f;
-                if (scale_ratio_l > 10.f) scale_ratio_l = 10.f;
-                /* Normalise s = (g × Q) / d² then re-scale to weight units */
-                float d2 = d_sub_l * d_sub_l;
-                float s  = (d2 > 1e-30f) ? (g_left * q_val_l / d2) : 0.0f;
-                block_s_left[blk] = HOLO_ALPHA * s * d_sub_l * scale_ratio_l;
-            }
-            /* ── Right boundary signal: boundary between block blk and blk+1 ──
-             *
-             * Same derivation but at the right edge (position QK_K-1,
-             * sub-block j = N_SUB-1) looking into block blk+1. */
-            {
-                float g_right = (blk + 1 < n_blocks)
-                    ? weights[(blk + 1) * QK_K] - bx[QK_K - 1]
-                    : 0.0f;
-                /* Quantize(I_boundary) for right edge: sub-block j=N_SUB-1 */
-                float d_sub_r = dm0 * (float)seeds[blk].Ls[N_SUB - 1];
-                float m_sub_r = mm0 * (float)seeds[blk].Lm[N_SUB - 1];
-                float q_val_r = 0.0f;
-                if (d_sub_r > 1e-15f) {
-                    int qi = gguf_nearest_int((bx[QK_K-1] + m_sub_r) / d_sub_r);
-                    if (qi < 0) qi = 0; if (qi > 3) qi = 3;
-                    q_val_r = d_sub_r * (float)qi - m_sub_r;
-                }
-                /* Scale ratio Trans(Δτ): d_{blk+1} / d_{blk} */
-                float d_next = (blk + 1 < n_blocks && seeds[blk+1].dm > 1e-15f)
-                               ? seeds[blk+1].dm : dm0;
-                float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
-                float scale_ratio_r = d_next / d_curr;
-                if (scale_ratio_r < 0.1f) scale_ratio_r = 0.1f;
-                if (scale_ratio_r > 10.f) scale_ratio_r = 10.f;
-                float d2 = d_sub_r * d_sub_r;
-                float s  = (d2 > 1e-30f) ? (g_right * q_val_r / d2) : 0.0f;
-                block_s_right[blk] = HOLO_ALPHA * s * d_sub_r * scale_ratio_r;
-            }
-            /* ── DC residual for the next block's rolling_dc ── */
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
                 float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
                 float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
-                int   base  = 16 * j;
                 for (k = 0; k < 16; k++) {
-                    int   elem  = base + k;
-                    float x_adj = bx[elem] - dc_bias
-                                + block_s_left [blk] * boundary_decay    [elem]
-                                + block_s_right[blk] * boundary_decay_rev[elem];
                     int q = 0;
                     if (d_sub >= 1e-15f) {
                         q = gguf_nearest_int((x_adj + m_sub) / d_sub);
-                        if (q < 0) q = 0; if (q > 3) q = 3;
                     }
-                    float deq  = d_sub * (float)q - m_sub;
-                    dc_res    += bx[elem] - deq;   /* residual vs ORIGINAL weight */
                 }
             }
             rolling_dc = dc_res;
@@ -2959,28 +2820,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         int cidx = best_candidate[blk];
         uint8_t Ls_blk[16], Lm_blk[16];
-        /* ── Holographic boundary reconstruction (Phase 3.9 → Phase 4) ──
-         *
-         * W[b][k] = x[b][k]
-         *         − dc_bias[b]                             (DC error correction)
-         *         + s_left[b]  · exp(−k/τ)                (left  boundary signal)
-         *         + s_right[b] · exp(−(QK_K−1−k)/τ)       (right boundary signal)
-         *
-         * The two boundary signals decay inward from opposite edges and meet
-         * in the middle.  Together they enforce C¹ continuity across every
-         * block boundary in the quantized domain. */
-        float dc_adj   = (block_dc_bias)  ? block_dc_bias [blk] : 0.0f;
-        float s_left   = (block_s_left)   ? block_s_left  [blk] : 0.0f;
-        float s_right  = (block_s_right)  ? block_s_right [blk] : 0.0f;
         float adj_block_x[QK_K];
         {
             int _i;
             for (_i = 0; _i < QK_K; _i++)
-                adj_block_x[_i] = block_x[_i]
-                                 - dc_adj
-                                 + s_left  * boundary_decay    [_i]
-                                 + s_right * boundary_decay_rev[_i];
         }
         memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
@@ -3229,159 +3081,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             Lm_blk[j] = best_lm;
         }
-        /* ══════════════════════════════════════════════════════════════
-         * PHASE 4.5 — LLOYD-MAX CENTROID REFINEMENT (per sub-block)
-         *
-         * Standard WLS and grid search both assume uniform spacing between
-         * the 4 representable values is MSE-optimal.  For non-uniform weight
-         * distributions (the typical case — transformer weights are heavy-
-         * tailed near zero, sparse in the tails), the WLS-optimal (d, m)
-         * does not align with the empirical centroids of the code partitions.
-         *
-         * Lloyd-Max iterates the assignment-then-centroid loop:
-         *
-         *   1. Assign: each weight → nearest representable value, code v∈{0..3}
-         *   2. Centroid: c_v = empirical mean of weights assigned to v
-         *   3. Project: c_v are 4 real numbers; find the arithmetic progression
-         *      {d·v − m : v∈{0,1,2,3}} that best fits c_v in MSE.
-         *      Closed-form solution from normal equations:
-         *
-         *        d_new = (3·c_3 + c_2 − c_1 − 3·c_0) / 10
-         *        m_new = (−7·c_0 − 4·c_1 − c_2 + 2·c_3) / 10
-         *
-         *      (Constants derived from Σq=6, Σq²=14, 4 codes total.)
-         *   4. Re-quantise; repeat until (d, m) stop changing.
-         *
-         * The arithmetic-progression projection is the key constraint that
-         * keeps the output in valid Q2_K format.  In unconstrained Lloyd-Max,
-         * the 4 centroids could be placed freely; here they must sit on an
-         * AP determined by (d, m), which is exactly what Q2_K stores.
-         *
-         * Operating per sub-block: we refine (d_sub_j, m_sub_j) = (d·Ls_j,
-         * m·Lm_j), then re-project onto integer (Ls, Lm) ∈ [0,15].  The
-         * integer rounding can hurt, so we only accept the refined values
-         * if they reduce the sub-block's weighted MSE.
-         *
-         * This is a genuine refinement on top of the grid search: the grid
-         * search minimises element-wise MSE assuming uniform spacing is
-         * locked in; Lloyd-Max iterates toward distribution-optimal spacing
-         * given the actual empirical centroids.
-         * ══════════════════════════════════════════════════════════════ */
-        for (int j = 0; j < N_SUB; j++) {
-            const float *sx = adj_block_x + 16 * j;
-            uint8_t Ls_cur = Ls_blk[j];
-            uint8_t Lm_cur = Lm_blk[j];
-            /* Baseline MSE for current (Ls, Lm) — only accept if we beat this */
-            float baseline_err = 0.0f;
-            {
-                float d_sub = dm * (float)Ls_cur;
-                float m_sub = mm * (float)Lm_cur;
-                for (int k = 0; k < 16; k++) {
-                    float w_imp = (imat_importance)
-                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
-                    int q;
-                    if (d_sub < 1e-15f) { q = 0; }
-                    else {
-                        q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
-                        if (q < 0) q = 0; if (q > 3) q = 3;
-                    }
-                    float deq  = d_sub * (float)q - m_sub;
-                    float diff = sx[k] - deq;
-                    baseline_err += diff * diff * w_imp;
-                }
-            }
-            /* Lloyd-Max iteration on (d_sub, m_sub) */
-            float d_sub = dm * (float)Ls_cur;
-            float m_sub = mm * (float)Lm_cur;
-            float d_sub_best = d_sub, m_sub_best = m_sub;
-            float lloyd_err  = baseline_err;
-            const int MAX_LLOYD_ITERS = 6;
-            for (int it = 0; it < MAX_LLOYD_ITERS; it++) {
-                if (d_sub < 1e-15f) break;
-                /* Step 1+2: assign and accumulate weighted centroids */
-                double sum_v[4]  = {0.0, 0.0, 0.0, 0.0};
-                double cnt_v[4]  = {0.0, 0.0, 0.0, 0.0};
-                for (int k = 0; k < 16; k++) {
-                    float w_imp = (imat_importance)
-                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
-                    int q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
-                    if (q < 0) q = 0; if (q > 3) q = 3;
-                    sum_v[q] += (double)sx[k] * (double)w_imp;
-                    cnt_v[q] += (double)w_imp;
-                }
-                /* Fill empty bins with extrapolation from neighbours to avoid
-                 * degenerate centroids when a code is unused */
-                double c[4];
-                int n_empty = 0;
-                for (int v = 0; v < 4; v++) {
-                    if (cnt_v[v] > 1e-15) {
-                        c[v] = sum_v[v] / cnt_v[v];
-                    } else {
-                        c[v] = (double)(d_sub * (float)v - m_sub); /* fallback to current AP */
-                        n_empty++;
-                    }
-                }
-                if (n_empty >= 3) break;  /* distribution too sparse — give up */
-                /* Step 3: AP projection — closed form for arithmetic progression
-                 * minimising Σ_v (c_v − (d·v − m))² */
-                float d_new = (float)((3.0*c[3] + c[2] - c[1] - 3.0*c[0]) / 10.0);
-                float m_new = (float)((-7.0*c[0] - 4.0*c[1] - c[2] + 2.0*c[3]) / 10.0);
-                if (d_new <= 1e-15f) break;
-                if (m_new <  0.0f) m_new = 0.0f;  /* keep m non-negative */
-                /* Step 4: project onto integer (Ls, Lm) and evaluate */
-                int Ls_try = (dm > 1e-15f) ? gguf_nearest_int(d_new / dm) : Ls_cur;
-                int Lm_try = (mm > 1e-15f) ? gguf_nearest_int(m_new / mm) : Lm_cur;
-                if (Ls_try < 1)  Ls_try = 1;
-                if (Ls_try > 15) Ls_try = 15;
-                if (Lm_try < 0)  Lm_try = 0;
-                if (Lm_try > 15) Lm_try = 15;
-                float d_sub_try = dm * (float)Ls_try;
-                float m_sub_try = mm * (float)Lm_try;
-                float try_err = 0.0f;
-                for (int k = 0; k < 16; k++) {
-                    float w_imp = (imat_importance)
-                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
-                    int q;
-                    if (d_sub_try < 1e-15f) { q = 0; }
-                    else {
-                        q = gguf_nearest_int((sx[k] + m_sub_try) / d_sub_try);
-                        if (q < 0) q = 0; if (q > 3) q = 3;
-                    }
-                    float deq  = d_sub_try * (float)q - m_sub_try;
-                    float diff = sx[k] - deq;
-                    try_err += diff * diff * w_imp;
-                }
-                /* Only accept if strictly improves; this is our safety net */
-                if (try_err < lloyd_err) {
-                    lloyd_err  = try_err;
-                    d_sub_best = d_sub_try;
-                    m_sub_best = m_sub_try;
-                    Ls_cur     = (uint8_t)Ls_try;
-                    Lm_cur     = (uint8_t)Lm_try;
-                    d_sub      = d_sub_try;
-                    m_sub      = m_sub_try;
-                } else {
-                    /* Converged or projection rounding hurt — stop */
-                    break;
-                }
-            }
-            if (lloyd_err < baseline_err) {
-                Ls_blk[j] = Ls_cur;
-                Lm_blk[j] = Lm_cur;
-            }
-        }
         output[blk].d    = gguf_fp32_to_fp16(dm);
         output[blk].dmin = gguf_fp32_to_fp16(mm);
@@ -3602,8 +3301,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     free(_tl_graphs);
     free(block_dc_bias);
-    free(block_s_left);
-    free(block_s_right);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);

     }
     /* ══════════════════════════════════════════════════════════════════
+     * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
      *
+     * Transforms the tensor from a collection of isolated 256-element
+     * Q2_K superblocks into a single, continuous error-cancelling waveform.
      *
+     * After Phase 3 has selected the optimal (d, dmin) candidate for every
+     * block, this sequential pass computes the net DC residual left by each
+     * block using a cheap round-nearest forward quantization, then feeds the
+     * negated, exponentially-decayed residual as a correction bias into the
+     * WLS solver of the immediately following block.
      *
+     * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
      *
+     *   dc_bias[N+1] = −DC_DECAY × R_N / QK_K      (per-element offset)
      *
+     * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
+     * quantizer toward codes whose reconstruction deq ≈ x′, so that
      *
+     *   Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
      *
+     * The accumulated cross-block DC collapses geometrically:
      *
+     *   R₀, DC_DECAY·R₀, DC_DECAY²·R₀, …  → 0
      *
+     * The result is written into block_dc_bias[n_blocks].  Phase 4 reads
+     * this array (safe: written sequentially before the parallel loop).
      * ══════════════════════════════════════════════════════════════════ */
+    #define DC_DECAY 0.85f   /* Boundary-condition leak factor (0 = isolated, 1 = full) */
+    float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
+    if (block_dc_bias) {
         float rolling_dc = 0.0f;
         for (int64_t blk = 0; blk < n_blocks; blk++) {
+            const float *bx  = weights + blk * QK_K;
             int          cidx = best_candidate[blk];
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+            /* Bias applied to THIS block's WLS targets */
+            float dc_bias       = (DC_DECAY * rolling_dc) / (float)QK_K;
+            block_dc_bias[blk]  = dc_bias;
+            /* Quick round-nearest quant to estimate DC residual for NEXT block.
+             * We quantize the adjusted target x′ = x − dc_bias, then measure
+             * the residual of the ORIGINAL weight against the chosen code. */
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
                 float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
                 float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
                 for (k = 0; k < 16; k++) {
+                    float x_adj = bx[16*j + k] - dc_bias;
                     int q = 0;
                     if (d_sub >= 1e-15f) {
                         q = gguf_nearest_int((x_adj + m_sub) / d_sub);
+                        if (q < 0) q = 0;
+                        if (q > 3) q = 3;
                     }
+                    float deq = d_sub * (float)q - m_sub;
+                    /* Residual against ORIGINAL weight (not adjusted) */
+                    dc_res += bx[16*j + k] - deq;
                 }
             }
             rolling_dc = dc_res;
         int cidx = best_candidate[blk];
         uint8_t Ls_blk[16], Lm_blk[16];
+        /* ── Rolling DC boundary condition ──────────────────────────────
+         * dc_adj shifts every WLS target in this block so that the net
+         * quantisation error steers toward cancelling the previous block's
+         * DC residual (written by the sequential Phase 3.9 pre-pass). */
+        float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
+        /* Adjusted weight view — WLS and Shor work on this array;
+         * the final error is always reported against the original block_x. */
         float adj_block_x[QK_K];
         {
             int _i;
             for (_i = 0; _i < QK_K; _i++)
+                adj_block_x[_i] = block_x[_i] - dc_adj;
         }
         memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
             Lm_blk[j] = best_lm;
         }
         output[blk].d    = gguf_fp32_to_fp16(dm);
         output[blk].dmin = gguf_fp32_to_fp16(mm);
     free(_tl_graphs);
     free(block_dc_bias);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);