CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 19 days ago

Commit

73e9225

verified ·

1 Parent(s): 2432d03

ALPHA

Browse files

Files changed (1) hide show

hexstate_quantize.c +154 -1

hexstate_quantize.c CHANGED Viewed

@@ -1,5 +1,5 @@
 /* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_quantize.c — HExState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
@@ -3229,6 +3229,159 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             Lm_blk[j] = best_lm;
         }
         output[blk].d    = gguf_fp32_to_fp16(dm);
         output[blk].dmin = gguf_fp32_to_fp16(mm);

 /* ═══════════════════════════════════════════════════════════════════════════
+ * hexstate_quantize.c — HexState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
             Lm_blk[j] = best_lm;
         }
+        /* ══════════════════════════════════════════════════════════════
+         * PHASE 4.5 — LLOYD-MAX CENTROID REFINEMENT (per sub-block)
+         *
+         * Standard WLS and grid search both assume uniform spacing between
+         * the 4 representable values is MSE-optimal.  For non-uniform weight
+         * distributions (the typical case — transformer weights are
+         * concentrated near zero and sparse in the tails), the WLS-optimal (d, m)
+         * does not align with the empirical centroids of the code partitions.
+         *
+         * Lloyd-Max iterates the assignment-then-centroid loop:
+         *
+         *   1. Assign: each weight → nearest representable value, code v∈{0..3}
+         *   2. Centroid: c_v = importance-weighted mean of weights assigned to v
+         *   3. Project: c_v are 4 real numbers; find the arithmetic progression
+         *      {d·v − m : v∈{0,1,2,3}} that best fits c_v in MSE.
+         *      Closed-form solution from normal equations:
+         *
+         *        d_new = (3·c_3 + c_2 − c_1 − 3·c_0) / 10
+         *        m_new = (−7·c_0 − 4·c_1 − c_2 + 2·c_3) / 10
+         *
+         *      (Constants derived from Σq=6, Σq²=14, 4 codes total.)
+         *   4. Re-quantise; repeat until the error stops improving (max 6 iterations).
+         *
+         * The arithmetic-progression projection is the key constraint that
+         * keeps the output in valid Q2_K format.  In unconstrained Lloyd-Max,
+         * the 4 centroids could be placed freely; here they must sit on an
+         * AP determined by (d, m), which is exactly what Q2_K stores.
+         *
+         * Operating per sub-block: we refine (d_sub_j, m_sub_j) = (d·Ls_j,
+         * m·Lm_j), then re-project onto integers Ls ∈ [1,15], Lm ∈ [0,15].  The
+         * integer rounding can hurt, so we only accept the refined values
+         * if they reduce the sub-block's weighted MSE.
+         *
+         * This is a genuine refinement on top of the grid search: the grid
+         * search minimises element-wise MSE assuming uniform spacing is
+         * locked in; Lloyd-Max iterates toward distribution-optimal spacing
+         * given the actual empirical centroids.
+         * ══════════════════════════════════════════════════════════════ */
+        for (int j = 0; j < N_SUB; j++) {
+            const float *sx = adj_block_x + 16 * j;
+            uint8_t Ls_cur = Ls_blk[j];
+            uint8_t Lm_cur = Lm_blk[j];
+            /* Baseline MSE for current (Ls, Lm) — only accept if we beat this */
+            float baseline_err = 0.0f;
+            {
+                float d_sub = dm * (float)Ls_cur;
+                float m_sub = mm * (float)Lm_cur;
+                for (int k = 0; k < 16; k++) {
+                    float w_imp = (imat_importance)
+                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
+                    int q;
+                    if (d_sub < 1e-15f) { q = 0; }
+                    else {
+                        q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
+                        if (q < 0) q = 0; if (q > 3) q = 3;
+                    }
+                    float deq  = d_sub * (float)q - m_sub;
+                    float diff = sx[k] - deq;
+                    baseline_err += diff * diff * w_imp;
+                }
+            }
+            /* Lloyd-Max iteration on (d_sub, m_sub) */
+            float d_sub = dm * (float)Ls_cur;
+            float m_sub = mm * (float)Lm_cur;
+            float d_sub_best = d_sub, m_sub_best = m_sub;
+            float lloyd_err  = baseline_err;
+            const int MAX_LLOYD_ITERS = 6;
+            for (int it = 0; it < MAX_LLOYD_ITERS; it++) {
+                if (d_sub < 1e-15f) break;
+                /* Step 1+2: assign and accumulate weighted centroids */
+                double sum_v[4]  = {0.0, 0.0, 0.0, 0.0};
+                double cnt_v[4]  = {0.0, 0.0, 0.0, 0.0};
+                for (int k = 0; k < 16; k++) {
+                    float w_imp = (imat_importance)
+                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
+                    int q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
+                    if (q < 0) q = 0; if (q > 3) q = 3;
+                    sum_v[q] += (double)sx[k] * (double)w_imp;
+                    cnt_v[q] += (double)w_imp;
+                }
+                /* Empty bins fall back to their value on the current AP
+                 * (d_sub·v − m_sub) to avoid degenerate centroids when a code is unused */
+                double c[4];
+                int n_empty = 0;
+                for (int v = 0; v < 4; v++) {
+                    if (cnt_v[v] > 1e-15) {
+                        c[v] = sum_v[v] / cnt_v[v];
+                    } else {
+                        c[v] = (double)(d_sub * (float)v - m_sub); /* fallback to current AP */
+                        n_empty++;
+                    }
+                }
+                if (n_empty >= 3) break;  /* distribution too sparse — give up */
+                /* Step 3: AP projection — closed form for arithmetic progression
+                 * minimising Σ_v (c_v − (d·v − m))² */
+                float d_new = (float)((3.0*c[3] + c[2] - c[1] - 3.0*c[0]) / 10.0);
+                float m_new = (float)((-7.0*c[0] - 4.0*c[1] - c[2] + 2.0*c[3]) / 10.0);
+                if (d_new <= 1e-15f) break;
+                if (m_new <  0.0f) m_new = 0.0f;  /* keep m non-negative */
+                /* Step 4: project onto integer (Ls, Lm) and evaluate */
+                int Ls_try = (dm > 1e-15f) ? gguf_nearest_int(d_new / dm) : Ls_cur;
+                int Lm_try = (mm > 1e-15f) ? gguf_nearest_int(m_new / mm) : Lm_cur;
+                if (Ls_try < 1)  Ls_try = 1;
+                if (Ls_try > 15) Ls_try = 15;
+                if (Lm_try < 0)  Lm_try = 0;
+                if (Lm_try > 15) Lm_try = 15;
+                float d_sub_try = dm * (float)Ls_try;
+                float m_sub_try = mm * (float)Lm_try;
+                float try_err = 0.0f;
+                for (int k = 0; k < 16; k++) {
+                    float w_imp = (imat_importance)
+                                  ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
+                    int q;
+                    if (d_sub_try < 1e-15f) { q = 0; }
+                    else {
+                        q = gguf_nearest_int((sx[k] + m_sub_try) / d_sub_try);
+                        if (q < 0) q = 0; if (q > 3) q = 3;
+                    }
+                    float deq  = d_sub_try * (float)q - m_sub_try;
+                    float diff = sx[k] - deq;
+                    try_err += diff * diff * w_imp;
+                }
+                /* Only accept if strictly improves; this is our safety net */
+                if (try_err < lloyd_err) {
+                    lloyd_err  = try_err;
+                    d_sub_best = d_sub_try;
+                    m_sub_best = m_sub_try;
+                    Ls_cur     = (uint8_t)Ls_try;
+                    Lm_cur     = (uint8_t)Lm_try;
+                    d_sub      = d_sub_try;
+                    m_sub      = m_sub_try;
+                } else {
+                    /* Converged or projection rounding hurt — stop */
+                    break;
+                }
+            }
+            if (lloyd_err < baseline_err) {
+                Ls_blk[j] = Ls_cur;
+                Lm_blk[j] = Lm_cur;
+            }
+        }
         output[blk].d    = gguf_fp32_to_fp16(dm);
         output[blk].dmin = gguf_fp32_to_fp16(mm);