CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 15 days ago

Commit

a034f4d

verified ·

1 Parent(s): 4384a45

Update hexstate_quantize.c

Browse files

Files changed (1) hide show

hexstate_quantize.c +349 -287

hexstate_quantize.c CHANGED Viewed

@@ -633,6 +633,7 @@ static int is_attention_tensor(const char *gguf_name)
  * conservative too" — creating coherent precision allocation.
  * ═══════════════════════════════════════════════════════════════════════════ */
 /* ── Multi-quhit expanded scale table ──
  * Search grid: 24×24 = 576 (d, dmin) candidates
  * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
@@ -1510,16 +1511,6 @@ static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
 #define HEX_POLISH_ULP 4
 #endif
-/* Number of Digital-Twin basin jumps per block (Phase 4.65). Each round
- * builds the block's ideal twin from the exact per-sub-block qkx2 optima,
- * collapses it over a 6×6 (d, dmin) hex lattice, and — only on strict
- * improvement of the true extended objective — re-seats the block in the
- * better basin before the Phase-4.6 polish re-runs. 0 disables the phase
- * and recovers the previous pipeline bit-exactly. */
-#ifndef HEX_TWIN_ROUNDS
-#define HEX_TWIN_ROUNDS 3
-#endif
 /* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
  *
  * The quantity that matters downstream is the layer-output error
@@ -2238,108 +2229,363 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
         err_shaped += hex_spectral_penalty(e_gs, QK4_0);
         int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
-        /* ══ Q4_0 PHASE 4.6: Closed-form d refit + ULP polish ══
-         * After the Hadamard descent may have flipped several codes, the
-         * assembly-phase d is no longer optimal for the committed code set.
-         * A one-shot WLS refit followed by a ±4 ULP micro-search recovers
-         * the FP16 rounding gap.  Accept only on strict improvement of the
-         * extended objective — monotone-safe, cannot raise RMSE. */
-        {
-            /* ── (1) Closed-form WLS d refit against committed codes ── */
-            float num_r = 0.0f, den_r = 0.0f;
-            float dcS_r = 0.0f, dcQ_r = 0.0f;
-            for (int j = 0; j < QK4_0; j++) {
-                float qc = (float)q_final[j] - 8.0f;
-                float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
-                num_r += w * bw[j] * qc;
-                den_r += w * qc * qc;
-                dcS_r += bw[j];
-                dcQ_r += qc;
             }
-            num_r += (HEX_DC_LAMBDA / (float)QK4_0) * dcS_r * dcQ_r;
-            den_r += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ_r * dcQ_r;
-            if (den_r > 1e-15f) {
-                float d_refit = num_r / den_r;
-                float d_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_refit));
-                float err_cur_r = 0.0f, err_try_r = 0.0f;
-                float e_cr[QK4_0], e_tr[QK4_0];
-                for (int j = 0; j < QK4_0; j++) {
-                    float qc = (float)q_final[j] - 8.0f;
-                    float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
-                    e_cr[j] = bw[j] - qc * actual_d;
-                    e_tr[j] = bw[j] - qc * d_try;
-                    err_cur_r += e_cr[j] * e_cr[j] * w;
-                    err_try_r += e_tr[j] * e_tr[j] * w;
                 }
-                err_cur_r += hex_spectral_penalty(e_cr, QK4_0);
-                err_try_r += hex_spectral_penalty(e_tr, QK4_0);
-                if (err_try_r < err_cur_r) {
-                    actual_d = d_try;
-                    output[blk].d = gguf_fp32_to_fp16(d_try);
                 }
             }
-            /* ── (2) ±4 ULP micro-search on the true objective ── */
-            {
-                uint16_t base_d16_p = gguf_fp32_to_fp16(actual_d);
-                uint16_t best_d16_p = base_d16_p;
-                float best_pol_err = 1e30f;
-                for (int delta_p = -4; delta_p <= 4; delta_p++) {
-                    int cd16_p = (int)base_d16_p + delta_p;
-                    if (cd16_p < 0 || cd16_p > 0x7BFF) continue;
-                    float trial_d_p = gguf_fp16_to_fp32((uint16_t)cd16_p);
-                    float trial_id_p = (fabsf(trial_d_p) > 1e-15f) ? 1.0f / trial_d_p : 0.0f;
-                    float err_p = 0.0f;
-                    float e_p[QK4_0];
-                    for (int j = 0; j < QK4_0; j++) {
-                        /* Re-quantize with trial_d_p to get optimal codes */
-                        int q_p = (int)(bw[j] * trial_id_p + 8.5f);
-                        if (q_p < 0) q_p = 0; if (q_p > 15) q_p = 15;
-                        float deq_p = ((float)q_p - 8.0f) * trial_d_p;
-                        float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
-                        e_p[j] = bw[j] - deq_p;
-                        err_p += e_p[j] * e_p[j] * w;
                     }
-                    err_p += hex_spectral_penalty(e_p, QK4_0);
-                    if (err_p < best_pol_err) {
-                        best_pol_err = err_p;
-                        best_d16_p = (uint16_t)cd16_p;
                     }
                 }
-                if (best_d16_p != base_d16_p) {
-                    actual_d = gguf_fp16_to_fp32(best_d16_p);
-                    output[blk].d = best_d16_p;
-                    /* Recompute final codes with the polished d */
-                    float id_pol = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;
-                    for (int j = 0; j < QK4_0; j++) {
-                        int q_pol = (int)(bw[j] * id_pol + 8.5f);
-                        if (q_pol < 0) q_pol = 0; if (q_pol > 15) q_pol = 15;
-                        q_final[j] = q_pol;
                     }
                 }
             }
         }
-        for (int j = 0; j < QK4_0 / 2; j++) {
-            int q0 = q_final[j];
-            int q1 = q_final[j + QK4_0/2];
-            output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4));
-            float deq0 = ((float)q0 - 8.0f) * actual_d;
-            float deq1 = ((float)q1 - 8.0f) * actual_d;
-            total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
         }
     }
-    *out_total_error = total_err;
-    free(greedy_d);
     free(cand_errors);
     free(cand_d16);
     free(best_candidate);
 }
 /* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
  * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
  * candidate generation, so stored codes are unnecessary. */
@@ -3688,7 +3934,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
          * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
          * into L survives wherever it is SSE-neutral, and is overridden
          * only where it was costing true reconstruction error.            */
-        for (int twin_round = 0; ; twin_round++) {
         {
             uint8_t pl_Ls[16], pl_Lm[16];
             for (int j = 0; j < N_SUB; j++) {
@@ -3970,201 +4215,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             output[blk].dmin = gguf_fp32_to_fp16(mm);
         }
-        /* ══ PHASE 4.65: DIGITAL TWIN COLLAPSE (ideal-block basin jump) ══
-         *
-         * The polish above is exact coordinate descent, but coordinate
-         * descent only reaches the bottom of the basin it starts in. This
-         * phase builds the block's DIGITAL TWIN: the residual-free encoding
-         * implied by seeds[blk] — the exact per-sub-block qkx2 optima, i.e.
-         * the block as it "wants" to be quantised before any super-scale
-         * compromise. A 6×6 hexagonal lattice of (d, dmin) scalings around
-         * that ideal is scored EXACTLY (derived sub-scales, per-weight
-         * optimal q, true extended objective vs the ORIGINAL weights). The
-         * 36 scores are loaded into a 2-quhit twin as Boltzmann amplitudes
-         * sharply peaked at the minimum, CZ-entangled, and collapsed via
-         * shor_measure_graph — the same Griffiths–Niu machinery as the
-         * assembly phase. The collapsed cell (guarded by the exact argmin,
-         * so the collapse can never land above the ideal) is committed only
-         * on STRICT improvement of E, after which the Phase-4.6 polish
-         * re-runs inside the new basin. Strict-improvement commits ⇒ the
-         * extended objective is monotone: final RMSE cannot go up, and the
-         * Phase-4.7 candidate floor below still applies unchanged.        */
-        int twin_jumped = 0;
-        if (twin_round < HEX_TWIN_ROUNDS) {
-            /* Committed extended error — true objective, original weights */
-            float E_cur = 0.0f;
-            {
-                float e_c[QK_K];
-                for (int j = 0; j < N_SUB; j++) {
-                    float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
-                    float m_sub = mm * (float)(output[blk].scales[j] >> 4);
-                    for (int k = 0; k < 16; k++) {
-                        int   idx = 16 * j + k;
-                        float w   = (imat_importance) ?
-                                    imat_importance[blk * QK_K + idx] : 1.0f;
-                        e_c[idx] = block_x[idx] -
-                                   (d_sub * (float)L[idx] - m_sub);
-                        E_cur += e_c[idx] * e_c[idx] * w;
-                    }
-                }
-                E_cur += hex_spectral_penalty(e_c, QK_K);
-            }
-            /* The ideal twin's super-scalars: residual-free encoding of the
-             * exact per-sub optima (4-bit sub-scales span 0..15). */
-            float id_s = 0.0f, id_m = 0.0f;
-            for (int j = 0; j < N_SUB; j++) {
-                if (seeds[blk].scales[j] > id_s) id_s = seeds[blk].scales[j];
-                if (seeds[blk].mins[j]   > id_m) id_m = seeds[blk].mins[j];
-            }
-            float d_id = id_s / 15.0f;
-            float m_id = id_m / 15.0f;
-            if (d_id > 1e-15f) {
-                /* 6×6 hex lattice of scalings around the ideal twin */
-                static const float hexfac[6] =
-                    {0.79f, 0.88f, 0.96f, 1.04f, 1.13f, 1.23f};
-                float cell_E[6][6];
-                float bestE = 1e30f;
-                int   ba = 0, bb = 0;
-                for (int a = 0; a < 6; a++) {
-                    float c_dm = gguf_fp16_to_fp32(
-                                     gguf_fp32_to_fp16(d_id * hexfac[a]));
-                    for (int b = 0; b < 6; b++) {
-                        float c_mm = gguf_fp16_to_fp32(
-                                         gguf_fp32_to_fp16(m_id * hexfac[b]));
-                        uint8_t c_Ls[16], c_Lm[16];
-                        hex_derive_subscales(seeds[blk].scales,
-                                             seeds[blk].mins,
-                                             c_dm, c_mm, c_Ls, c_Lm);
-                        float e_t[QK_K];
-                        float E = 0.0f;
-                        for (int j = 0; j < N_SUB; j++) {
-                            float d_sub = c_dm * (float)c_Ls[j];
-                            float m_sub = c_mm * (float)c_Lm[j];
-                            for (int k = 0; k < 16; k++) {
-                                int   idx = 16 * j + k;
-                                float w   = (imat_importance) ?
-                                            imat_importance[blk * QK_K + idx]
-                                            : 1.0f;
-                                int q = 0;
-                                if (d_sub >= 1e-15f) {
-                                    q = gguf_nearest_int(
-                                            (block_x[idx] + m_sub) / d_sub);
-                                    if (q < 0) q = 0; if (q > 3) q = 3;
-                                }
-                                e_t[idx] = block_x[idx] -
-                                           (d_sub * (float)q - m_sub);
-                                E += e_t[idx] * e_t[idx] * w;
-                            }
-                        }
-                        E += hex_spectral_penalty(e_t, QK_K);
-                        cell_E[a][b] = E;
-                        if (E < bestE) { bestE = E; ba = a; bb = b; }
-                    }
-                }
-                /* Load the twin: two quhits over the lattice axes, Boltzmann
-                 * amplitudes sharply peaked at the ideal, CZ-entangled,
-                 * then collapsed — house pattern from the assembly phase. */
-                int _tw_tid = 0;
-                #ifdef _OPENMP
-                _tw_tid = omp_get_thread_num();
-                #endif
-                HPCGraph *tg = _tl_graphs[_tw_tid];
-                hpc_reset_for_subblock(tg, 2);
-                {
-                    double axmin[2][6];
-                    for (int a = 0; a < 6; a++) {
-                        axmin[0][a] = 1e30;
-                        axmin[1][a] = 1e30;
-                        for (int b = 0; b < 6; b++) {
-                            if (cell_E[a][b] < axmin[0][a])
-                                axmin[0][a] = cell_E[a][b];
-                            if (cell_E[b][a] < axmin[1][a])
-                                axmin[1][a] = cell_E[b][a];
-                        }
-                    }
-                    for (int s = 0; s < 2; s++) {
-                        double lo = 1e30, hi = -1e30;
-                        for (int v = 0; v < 6; v++) {
-                            if (axmin[s][v] < lo) lo = axmin[s][v];
-                            if (axmin[s][v] > hi) hi = axmin[s][v];
-                        }
-                        /* Sharp peak: low temperature ⇒ the collapse lands
-                         * on the ideal with overwhelming amplitude. */
-                        double T = (hi - lo) * 0.05 + 1e-12;
-                        double amp_re[6], amp_norm = 0.0;
-                        for (int v = 0; v < 6; v++) {
-                            amp_re[v] = exp(-(axmin[s][v] - lo) / T);
-                            amp_norm += amp_re[v] * amp_re[v];
-                        }
-                        if (amp_norm > 1e-30) {
-                            double inv = 1.0 / sqrt(amp_norm);
-                            for (int v = 0; v < 6; v++) amp_re[v] *= inv;
-                        }
-                        for (int v = 0; v < 6; v++) {
-                            tg->locals[s].edge_re[v] = amp_re[v];
-                            tg->locals[s].edge_im[v] = 0.0;
-                        }
-                        tg->locals[s].primary = VIEW_EDGE;
-                        tg->locals[s].dirty   = DIRTY_VERTEX | DIRTY_DIAGONAL
-                                              | DIRTY_FOLDED;
-                        tg->locals[s].delta_valid = 0;
-                        triality_update_mask(&tg->locals[s]);
-                    }
-                    hpc_cz(tg, 0, 1);
-                    double tw_marg[2][6];
-                    int    tw_meas[2];
-                    memset(tw_marg, 0, sizeof(tw_marg));
-                    memset(tw_meas, 0, sizeof(tw_meas));
-                    shor_measure_graph(tg, 2, tw_marg, tw_meas, 1);
-                    int ca = tw_meas[0], cb = tw_meas[1];
-                    /* Collapse guard: the twin may never land above its own
-                     * ideal — the exact argmin backstops the Born draw. */
-                    if (cell_E[ca][cb] > bestE) { ca = ba; cb = bb; }
-                    if (cell_E[ca][cb] < E_cur) {
-                        /* Commit the collapsed cell — rebuilt exactly as it
-                         * was scored (same pattern as the Phase-4.7 floor). */
-                        float c_dm = gguf_fp16_to_fp32(
-                                         gguf_fp32_to_fp16(d_id * hexfac[ca]));
-                        float c_mm = gguf_fp16_to_fp32(
-                                         gguf_fp32_to_fp16(m_id * hexfac[cb]));
-                        uint8_t c_Ls[16], c_Lm[16];
-                        hex_derive_subscales(seeds[blk].scales,
-                                             seeds[blk].mins,
-                                             c_dm, c_mm, c_Ls, c_Lm);
-                        for (int j = 0; j < N_SUB; j++) {
-                            float d_sub = c_dm * (float)c_Ls[j];
-                            float m_sub = c_mm * (float)c_Lm[j];
-                            for (int k = 0; k < 16; k++) {
-                                int idx = 16 * j + k;
-                                int q = 0;
-                                if (d_sub >= 1e-15f) {
-                                    q = gguf_nearest_int(
-                                            (block_x[idx] + m_sub) / d_sub);
-                                    if (q < 0) q = 0; if (q > 3) q = 3;
-                                }
-                                L[idx] = (uint8_t)q;
-                            }
-                            output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
-                        }
-                        dm = c_dm;
-                        mm = c_mm;
-                        output[blk].d    = gguf_fp32_to_fp16(dm);
-                        output[blk].dmin = gguf_fp32_to_fp16(mm);
-                        twin_jumped = 1;
-                    }
-                }
-            }
-        }
-        if (!twin_jumped) break;   /* no better basin — twin and block agree */
-        }   /* twin_round */
         /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
          *
          * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
@@ -4909,6 +4959,18 @@ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     if (out_error) *out_error = err;
 }
 #ifndef HEXSTATE_LIBRARY
 /* ═══════════════════════════════════════════════════════════════════════════
  * MAIN

  * conservative too" — creating coherent precision allocation.
  * ═══════════════════════════════════════════════════════════════════════════ */
 /* ── Multi-quhit expanded scale table ──
  * Search grid: 24×24 = 576 (d, dmin) candidates
  * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
 #define HEX_POLISH_ULP 4
 #endif
 /* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
  *
  * The quantity that matters downstream is the layer-output error
         err_shaped += hex_spectral_penalty(e_gs, QK4_0);
         int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
+        for (int j = 0; j < QK4_0 / 2; j++) {
+            int q0 = q_final[j];
+            int q1 = q_final[j + QK4_0/2];
+            output[blk].qs[j] = (uint8_t)(q0 | (q1 << 4));
+            float deq0 = ((float)q0 - 8.0f) * actual_d;
+            float deq1 = ((float)q1 - 8.0f) * actual_d;
+            total_err += (bw[j] - deq0) * (bw[j] - deq0) + (bw[j + QK4_0/2] - deq1) * (bw[j + QK4_0/2] - deq1);
+        }
+    }
+    *out_total_error = total_err;
+    free(greedy_d);
+    free(cand_errors);
+    free(cand_d16);
+    free(best_candidate);
+}
+/* ════════════════════════════════════════════════════════════════════════
+ * Q8_0 HPC QUANTIZER — Shor pipeline at 8 bits
+ *
+ * Same pipeline as Q4_0: WLS scale + tight candidate grid scored on the
+ * extended objective (weighted SSE + DC + vesica/wave), triality-quhit
+ * graph with Boltzmann-encoded candidate errors, CZ chain entanglement,
+ * Shor Griffiths-Niu sequential measurement for bin consensus, greedy
+ * override (HEX_GREEDY_OVERRIDE_RATIO), then per-block ULP polish, the
+ * vesica/DC error-shaping descent with an extended-objective guard, and
+ * the candidate floor. Intended for embedding / LM-head tensors (tied
+ * embeddings especially), where 2-4 bit codes destroy logit precision.
+ * At 8 bits the candidate grid is tight (±1.5%) — the win over naive
+ * amax/127 rounding comes from WLS + ULP + spectral selection, not from
+ * coarse scale exploration.
+ * ════════════════════════════════════════════════════════════════════════ */
+#ifndef QK8_0
+#define QK8_0 32
+#endif
+typedef struct { uint16_t d; int8_t qs[QK8_0]; } hex_block_q8_0; /* One Q8_0 output block:
+     * a 16-bit scale field d followed by QK8_0 signed 8-bit codes.
+     * NOTE(review): the quantizer below carries trial scales as fp16 bit
+     * patterns in uint16_t, so d presumably holds one too — confirm at the
+     * site that stores it. */
+#define Q8_N_CAND 24 /* number of trial scales scored per block */
+static const float Q8_NEIGHBOR_MULTS[Q8_N_CAND] = { /* Multipliers applied to the
+     * WLS-refined block scale to form the trial scales. They span 0.985 to
+     * 1.015 and are spaced more finely close to 1.0. */
+    0.9850f, 0.9865f, 0.9880f, 0.9895f, 0.9910f, 0.9925f,
+    0.9940f, 0.9952f, 0.9964f, 0.9976f, 0.9988f, 1.0000f, /* index 11 is x1.0, the neutral seed */
+    1.0010f, 1.0020f, 1.0030f, 1.0040f, 1.0052f, 1.0064f,
+    1.0076f, 1.0088f, 1.0100f, 1.0115f, 1.0130f, 1.0150f,
+};
+/* Folds the 24 candidate indices into 6 quhit states: each run of four
+ * consecutive candidates shares one state (bin), same folding as Q4_0.
+ * The quantizer uses this map twice: to accumulate the candidates'
+ * Boltzmann weights per state, and to pick the lowest-error candidate
+ * inside the bin returned by the graph measurement. */
+static const int Q8_CAND_TO_QUHIT[Q8_N_CAND] = {
+    0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4, 5,5,5,5
+};
+static inline float q8_block_ext_err(const float *bw, const float *iw,
+                                     float d, int8_t *qs_out)
+{   /* Scores one block of QK8_0 values quantized with scale d.
+     *
+     * bw     : the block's QK8_0 source values.
+     * iw     : optional per-value weights for the squared error; NULL
+     *          means every weight is 1.
+     * d      : trial scale. When |d| is not above 1e-20 the inverse scale
+     *          is set to 0 instead of dividing.
+     * qs_out : optional; when non-NULL it receives the QK8_0 codes.
+     *
+     * Returns the weighted sum of squared reconstruction errors plus
+     * hex_spectral_penalty() evaluated on the raw (unweighted) errors. */
+    float e_arr[QK8_0];
+    float id = (fabsf(d) > 1e-20f) ? 1.0f / d : 0.0f;
+    float err = 0.0f;
+    for (int j = 0; j < QK8_0; j++) {
+        int q = gguf_nearest_int(bw[j] * id);
+        if (q < -127) q = -127; if (q > 127) q = 127; /* symmetric clamp: -128 is never produced */
+        if (qs_out) qs_out[j] = (int8_t)q;
+        float e = bw[j] - (float)q * d; /* reconstruction error of this value */
+        e_arr[j] = e; /* kept unweighted for the spectral penalty */
+        float w = iw ? iw[j] : 1.0f;
+        err += e * e * w;
+    }
+    return err + hex_spectral_penalty(e_arr, QK8_0);
+}
+static void quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
+                                     hex_block_q8_0 *output,
+                                     float *out_total_error,
+                                     const float *imat_importance, int verbose)
+{
+    int64_t n_blocks = n_elements / QK8_0;
+    float total_err = 0.0f;
+    (void)verbose;
+    float (*cand_errors)[Q8_N_CAND] = (float (*)[Q8_N_CAND])
+        calloc(n_blocks, sizeof(float[Q8_N_CAND]));
+    uint16_t (*cand_d16)[Q8_N_CAND] = (uint16_t (*)[Q8_N_CAND])
+        calloc(n_blocks, sizeof(uint16_t[Q8_N_CAND]));
+    int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
+    if (!cand_errors || !cand_d16 || !best_candidate) {
+        free(cand_errors); free(cand_d16); free(best_candidate);
+        if (out_total_error) *out_total_error = -1.0f;
+        return;
+    }
+    /* ── Phase 1+2: WLS-refined scale + tight candidate grid ── */
+    #pragma omp parallel for schedule(dynamic, 256)
+    for (int64_t blk = 0; blk < n_blocks; blk++) {
+        const float *bw = weights + blk * QK8_0;
+        const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;
+        float amax = 0.0f;
+        for (int j = 0; j < QK8_0; j++) {
+            float av = fabsf(bw[j]);
+            if (av > amax) amax = av;
+        }
+        float wls_d = amax / 127.0f;
+        /* ggml-style fixed-point WLS with DC rank-1 augmentation */
+        for (int it = 0; it < 3 && wls_d > 1e-20f; it++) {
+            float inv_d = 1.0f / wls_d;
+            float num = 0.0f, den = 0.0f, dcS = 0.0f, dcQ = 0.0f;
+            for (int j = 0; j < QK8_0; j++) {
+                int q = gguf_nearest_int(bw[j] * inv_d);
+                if (q < -127) q = -127; if (q > 127) q = 127;
+                float qf = (float)q;
+                float w  = iw ? iw[j] : 1.0f;
+                num += w * bw[j] * qf;
+                den += w * qf * qf;
+                dcS += bw[j];
+                dcQ += qf;
             }
+            num += (HEX_DC_LAMBDA / (float)QK8_0) * dcS * dcQ;
+            den += (HEX_DC_LAMBDA / (float)QK8_0) * dcQ * dcQ;
+            if (den > 1e-15f) {
+                float d_new = num / den;
+                if (d_new > 1e-20f) wls_d = d_new;
+            }
+        }
+        for (int ci = 0; ci < Q8_N_CAND; ci++) {
+            float    trial_d  = wls_d * Q8_NEIGHBOR_MULTS[ci];
+            uint16_t d16      = gguf_fp32_to_fp16(trial_d);
+            float    actual_d = gguf_fp16_to_fp32(d16);
+            cand_d16  [blk][ci] = d16;
+            cand_errors[blk][ci] = q8_block_ext_err(bw, iw, actual_d, NULL);
+        }
+        best_candidate[blk] = 11;   /* ×1.0000 neutral seed */
+    }
+    /* ── Phase 3: Shor graph — triality quhits, CZ chain, GN measurement ── */
+    int shor_ran = 0;
+    if (n_blocks >= 2) {
+        int64_t graph_blocks = (n_blocks > 200) ? 200 : n_blocks;
+        int64_t stride = n_blocks / graph_blocks;
+        HPCGraph *graph = hpc_create(graph_blocks);
+        if (graph) {
+            shor_ran = 1;
+            /* Adaptive temperature from the candidate-error landscape */
+            float temperature = 1e-10f;
+            {
+                double err_accum = 0.0;
+                int err_count = 0;
+                for (int64_t gi = 0; gi < graph_blocks && gi < 100; gi++) {
+                    int64_t blk = gi * stride;
+                    float max_e = 0.0f;
+                    for (int c = 0; c < Q8_N_CAND; c++)
+                        if (cand_errors[blk][c] > max_e)
+                            max_e = cand_errors[blk][c];
+                    err_accum += (double)max_e;
+                    err_count++;
                 }
+                if (err_count > 0) {
+                    temperature = (float)(err_accum / err_count) * 0.1f;
+                    if (temperature < 1e-10f) temperature = 1e-10f;
                 }
             }
+            /* Boltzmann-encode stride-aggregated candidate errors as
+             * quhit amplitudes (24 candidates folded into 6 states) */
+            for (int64_t i = 0; i < graph_blocks; i++) {
+                float agg_errors[Q8_N_CAND];
+                for (int c = 0; c < Q8_N_CAND; c++) agg_errors[c] = 0.0f;
+                int64_t blk_start = i * stride;
+                int64_t blk_end   = blk_start + stride;
+                if (blk_end > n_blocks) blk_end = n_blocks;
+                for (int64_t b = blk_start; b < blk_end; b++)
+                    for (int c = 0; c < Q8_N_CAND; c++)
+                        agg_errors[c] += cand_errors[b][c];
+                float min_err = 1e30f;
+                for (int c = 0; c < Q8_N_CAND; c++)
+                    if (agg_errors[c] < min_err) min_err = agg_errors[c];
+                double amp_re[6] = {0,0,0,0,0,0};
+                double amp_norm = 0.0;
+                for (int ci = 0; ci < Q8_N_CAND; ci++)
+                    amp_re[Q8_CAND_TO_QUHIT[ci]] +=
+                        exp(-(double)(agg_errors[ci] - min_err) /
+                             (2.0 * (double)temperature));
+                for (int v = 0; v < 6; v++) amp_norm += amp_re[v] * amp_re[v];
+                if (amp_norm > 1e-30) {
+                    double inv = 1.0 / sqrt(amp_norm);
+                    for (int v = 0; v < 6; v++) amp_re[v] *= inv;
+                }
+                for (int v = 0; v < 6; v++) {
+                    graph->locals[i].edge_re[v] = amp_re[v];
+                    graph->locals[i].edge_im[v] = 0.0;
+                }
+                graph->locals[i].primary = VIEW_EDGE;
+                graph->locals[i].dirty = DIRTY_VERTEX | DIRTY_DIAGONAL | DIRTY_FOLDED;
+                graph->locals[i].delta_valid = 0;
+                triality_update_mask(&graph->locals[i]);
+            }
+            for (int64_t i = 0; i < graph_blocks - 1; i++)
+                hpc_cz(graph, i, i + 1);
+            double (*marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
+            int *measured = (int *)calloc(graph_blocks, sizeof(int));
+            if (marg && measured) {
+                shor_measure_graph(graph, graph_blocks, marg, measured, 1);
+                /* Per-block selection: best candidate inside the Shor-
+                 * measured bin, then greedy override against the global
+                 * argmin — identical Step-F semantics to Q2_K/Q4_0. */
+                for (int64_t i = 0; i < graph_blocks; i++) {
+                    int bin = measured[i];
+                    if (bin < 0 || bin > 5) {
+                        double bm = -1.0; bin = 0;
+                        for (int v = 0; v < 6; v++)
+                            if (marg[i][v] > bm) { bm = marg[i][v]; bin = v; }
                     }
+                    int64_t blk_start = i * stride;
+                    int64_t blk_end   = blk_start + stride;
+                    if (blk_end > n_blocks) blk_end = n_blocks;
+                    for (int64_t b = blk_start; b < blk_end; b++) {
+                        float bin_best = 1e30f; int bin_cand = -1;
+                        float g_best   = 1e30f; int g_cand   = 0;
+                        for (int c = 0; c < Q8_N_CAND; c++) {
+                            float e = cand_errors[b][c];
+                            if (e < g_best) { g_best = e; g_cand = c; }
+                            if (Q8_CAND_TO_QUHIT[c] == bin && e < bin_best) {
+                                bin_best = e; bin_cand = c;
+                            }
+                        }
+                        int sel = (bin_cand >= 0) ? bin_cand : g_cand;
+                        if (g_best < cand_errors[b][sel] * HEX_GREEDY_OVERRIDE_RATIO)
+                            sel = g_cand;
+                        best_candidate[b] = sel;
                     }
                 }
+            }
+            free(marg); free(measured);
+            hpc_destroy(graph);
+        }
+    }
+    if (!shor_ran) {
+        for (int64_t blk = 0; blk < n_blocks; blk++) {
+            float g_best = cand_errors[blk][0]; int g_cand = 0;
+            for (int c = 1; c < Q8_N_CAND; c++)
+                if (cand_errors[blk][c] < g_best) {
+                    g_best = cand_errors[blk][c]; g_cand = c;
+                }
+            best_candidate[blk] = g_cand;
+        }
+    }
+    /* ── Phase 4: ULP polish + vesica/DC shaping guard + floor ── */
+    #pragma omp parallel for schedule(dynamic, 256) reduction(+:total_err)
+    for (int64_t blk = 0; blk < n_blocks; blk++) {
+        const float *bw = weights + blk * QK8_0;
+        const float *iw = imat_importance ? imat_importance + blk * QK8_0 : NULL;
+        int cidx = best_candidate[blk];
+        uint16_t best_d16 = cand_d16[blk][cidx];
+        float    best_err = cand_errors[blk][cidx];
+        /* ±8 fp16 ULP joint search on the extended objective */
+        for (int du = -8; du <= 8; du++) {
+            if (du == 0) continue;
+            int c16 = (int)cand_d16[blk][cidx] + du;
+            if (c16 <= 0 || c16 > 0x7BFF) continue;
+            float td  = gguf_fp16_to_fp32((uint16_t)c16);
+            float err = q8_block_ext_err(bw, iw, td, NULL);
+            if (err < best_err) { best_err = err; best_d16 = (uint16_t)c16; }
+        }
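+        /* Candidate floor: the ULP search starts from the selected candidate
+         * best_candidate[blk] and only accepts strict improvements, so the
+         * polished scale is never worse than that candidate on the extended
+         * objective (this is implicit, no separate check is made).
+         * NOTE(review): that candidate is the best raw grid candidate only
+         * when selection picked the per-block argmin; blocks that selection
+         * never visits keep the neutral seed index — confirm the floor still
+         * holds for them. */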
+        float  d = gguf_fp16_to_fp32(best_d16);
+        int8_t qs[QK8_0];
+        (void)q8_block_ext_err(bw, iw, d, qs);
+        /* Vesica/DC greedy shaping with extended-objective guard */
+        {
+            int8_t qs_shaped[QK8_0];
+            memcpy(qs_shaped, qs, QK8_0);
+            float e_live[QK8_0], v_live[QK8_0 / 2];
+            float vesica_cur = 0.0f, dc_cur = 0.0f;
+            for (int k = 0; k < QK8_0; k++)
+                e_live[k] = bw[k] - (float)qs_shaped[k] * d;
+            for (int p = 0; p < QK8_0 / 2; p++) {
+                v_live[p] = e_live[p] + e_live[p + QK8_0 / 2];
+                vesica_cur += v_live[p] * v_live[p];
+                dc_cur     += v_live[p];
+            }
+            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+            for (int pass = 0; pass < QK8_0; pass++) {
+                int best_k = -1, best_q_alt = 0;
+                float best_delta = 0.0f;
+                for (int k = 0; k < QK8_0; k++) {
+                    int q_try = (e_live[k] >= 0.0f) ? qs_shaped[k] + 1
+                                                    : qs_shaped[k] - 1;
+                    if (q_try < -127 || q_try > 127) continue;
+                    float e_new = bw[k] - (float)q_try * d;
+                    float de    = e_new - e_live[k];
+                    int   pi    = (k < QK8_0 / 2) ? k : k - QK8_0 / 2;
+                    float v_new = v_live[pi] + de;
+                    float ves_a = vesica_cur - v_live[pi] * v_live[pi]
+                                             + v_new * v_new;
+                    float dc_a  = dc_cur + de;
+                    float delta = metric_cur - (4.0f * ves_a + dc_a * dc_a);
+                    if (delta > best_delta) {
+                        best_delta = delta; best_k = k; best_q_alt = q_try;
                     }
                 }
+                if (best_k < 0) break;
+                {
+                    float e_new = bw[best_k] - (float)best_q_alt * d;
+                    float de    = e_new - e_live[best_k];
+                    int   pi    = (best_k < QK8_0 / 2) ? best_k
+                                                       : best_k - QK8_0 / 2;
+                    float v_new = v_live[pi] + de;
+                    vesica_cur += v_new * v_new - v_live[pi] * v_live[pi];
+                    dc_cur     += de;
+                    metric_cur  = 4.0f * vesica_cur + dc_cur * dc_cur;
+                    v_live[pi]      = v_new;
+                    e_live[best_k]  = e_new;
+                    qs_shaped[best_k] = (int8_t)best_q_alt;
+                }
+            }
+            /* Guard on the extended objective vs originals */
+            float e_b[QK8_0], e_s[QK8_0];
+            float err_b = 0.0f, err_s = 0.0f;
+            for (int k = 0; k < QK8_0; k++) {
+                float w = iw ? iw[k] : 1.0f;
+                e_b[k] = bw[k] - (float)qs[k]        * d;
+                e_s[k] = bw[k] - (float)qs_shaped[k] * d;
+                err_b += e_b[k] * e_b[k] * w;
+                err_s += e_s[k] * e_s[k] * w;
             }
+            err_b += hex_spectral_penalty(e_b, QK8_0);
+            err_s += hex_spectral_penalty(e_s, QK8_0);
+            if (err_s < err_b) memcpy(qs, qs_shaped, QK8_0);
         }
+        output[blk].d = best_d16;
+        for (int k = 0; k < QK8_0; k++) {
+            output[blk].qs[k] = qs[k];
+            float e = bw[k] - (float)qs[k] * d;
+            total_err += e * e;          /* pure reconstruction SSE report */
         }
     }
     free(cand_errors);
     free(cand_d16);
     free(best_candidate);
+    if (out_total_error) *out_total_error = total_err;
 }
 /* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
  * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
  * candidate generation, so stored codes are unnecessary. */
          * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
          * into L survives wherever it is SSE-neutral, and is overridden
          * only where it was costing true reconstruction error.            */
         {
             uint8_t pl_Ls[16], pl_Lm[16];
             for (int j = 0; j < N_SUB; j++) {
             output[blk].dmin = gguf_fp32_to_fp16(mm);
         }
         /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
          *
          * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
     if (out_error) *out_error = err;
 }
+int hexstate_q8_0_block_bytes(void)    { return (int)sizeof(hex_block_q8_0); }  /*
+ * Non-static accessor: reports the size in bytes of one hex_block_q8_0
+ * record, i.e. sizeof(hex_block_q8_0) narrowed to int.
+ * NOTE(review): presumably lets a caller that cannot see the struct size the
+ * untyped `output` buffer it hands to hexstate_quantize_tensor_q8_0_hpc —
+ * confirm against callers. */
+int hexstate_q8_0_block_elements(void) { return QK8_0; }  /*
+ * Non-static accessor: returns the compile-time constant QK8_0 as an int.
+ * NOTE(review): QK8_0 looks like the number of weights stored in one Q8_0
+ * block (elsewhere the file strides the weight array by blk * QK8_0) —
+ * confirm at the macro's definition. */
+void hexstate_quantize_tensor_q8_0_hpc(const float *weights, int64_t n_elements,
+                                       void *output, float *out_error,
+                                       const float *imat_importance, int verbose)
+{   /* Non-static wrapper around quantize_tensor_q8_0_hpc.
+     *
+     * All six arguments are forwarded in the same order and unmodified; the
+     * only adaptation is the cast of the untyped `output` pointer to
+     * hex_block_q8_0 *, so callers do not need the struct declaration.
+     * Nothing is returned and no argument validation happens here.
+     *
+     * NOTE(review): `output` presumably has to hold one hex_block_q8_0 per
+     * QK8_0 input weights, and `out_error` / `imat_importance` presumably may
+     * be NULL — both depend on quantize_tensor_q8_0_hpc; confirm there. */
+    quantize_tensor_q8_0_hpc(weights, n_elements,
+                             (hex_block_q8_0 *)output, out_error,
+                             imat_importance, verbose);
+}
 #ifndef HEXSTATE_LIBRARY
 /* ═══════════════════════════════════════════════════════════════════════════
  * MAIN