CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 21 days ago

Commit

57f4b1d

verified ·

1 Parent(s): e0ba36a

Update hexstate_quantize.c

Browse files

Files changed (1) hide show

hexstate_quantize.c +793 -119

hexstate_quantize.c CHANGED Viewed

@@ -155,12 +155,14 @@ static ConfigJson parse_config_json(const char *path)
     fseek(f, 0, SEEK_END);
     long size = ftell(f);
     fseek(f, 0, SEEK_SET);
-    char *json = (char *)malloc(size + 1);
     if (!json) { fclose(f); return cfg; }
-    fread(json, 1, size, f);
-    json[size] = '\0';
     fclose(f);
     cfg.valid = 1;
@@ -631,11 +633,6 @@ static int is_attention_tensor(const char *gguf_name)
  * conservative too" — creating coherent precision allocation.
  * ═══════════════════════════════════════════════════════════════════════════ */
-#define SCALE_FACTOR_COUNT 6
-static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
-    0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
-};
 /* ── Multi-quhit expanded scale table ──
  * Search grid: 24×24 = 576 (d, dmin) candidates
  * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
@@ -645,6 +642,22 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
 #define N_CAND_M   24    /* dmin multiplier candidates (expanded) */
 #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
 static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
 static int scale_table_initialized = 0;
@@ -656,6 +669,7 @@ static void init_scale_table(void) {
     }
     scale_table_initialized = 1;
 }
 /* ═══════════════════════════════════════════════════════════════════════════
  * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
@@ -692,6 +706,7 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
         triality_init(&g->locals[i]);
 }
 /* ═══════════════════════════════════════════════════════════════════════════
  * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
  *
@@ -997,6 +1012,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
     *out_min = -cur_min;
     return cur_scale;
 }
 /* ═══════════════════════════════════════════════════════════════════════════
  * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
@@ -1188,10 +1204,8 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x,
  * Quantize:  error Boltzmann amplitudes → optimal RMSE block
  * ═══════════════════════════════════════════════════════════════════════════ */
-/* ω₆ roots of unity for CZ phase lookup */
-static const double W6_RE[6] = { 1.0, 0.5, -0.5, -1.0, -0.5,  0.5 };
-static const double W6_IM[6] = { 0.0, 0.866025403784438647, 0.866025403784438647,
-                                  0.0, -0.866025403784438647, -0.866025403784438647 };
 static const double INV_SQRT6 = 0.40824829046386301637;  /* 1/√6 */
 /* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
@@ -1465,18 +1479,110 @@ static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
     3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
 };
 static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                        BlockQ4_0 *output, float *out_total_error,
                                        const float *imat_importance, int verbose)
 {
     int64_t n_blocks = n_elements / QK4_0;
     float total_err = 0.0f;
-    /* ── Compute Tensor Sigma for SA Temperature ── */
-    double t_sum_sq = 0.0;
-    for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
-    float w_sigma = sqrtf(t_sum_sq / n_elements);
     /* ── Phase 1: Greedy seed — compute scale per block ── */
     float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
@@ -1499,6 +1605,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
         calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
     for (int64_t blk = 0; blk < n_blocks; blk++) {
         const float *bw = weights + blk * QK4_0;
@@ -1509,6 +1616,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             if (wls_d < 1e-15f) break;
             float inv_d = 1.0f / wls_d;
             float num = 0.0f, den = 0.0f;
             for (int j = 0; j < QK4_0; j++) {
                 int q = (int)(bw[j] * inv_d + 8.5f);
                 if (q < 0) q = 0; if (q > 15) q = 15;
@@ -1517,7 +1625,15 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                           imat_importance[blk * QK4_0 + j] : 1.0f;
                 num += w * bw[j] * qc;
                 den += w * qc * qc;
             }
             if (den > 1e-15f) {
                 float d_new = num / den;
                 if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
@@ -1537,35 +1653,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
-            /* ── Single-unit D₆ error over all QK4_0 (32) elements ──
-             * Antipodal pairing: (j, j + QK4_0/2) for j in [0, QK4_0/2).
-             * Treating the whole block as one unit eliminates boundary
-             * artefacts from the old 6-element chunks and correctly captures
-             * long-range error correlations within the block. */
-            float e_all[QK4_0], w_all[QK4_0];
             for (int j = 0; j < QK4_0; j++) {
                 float x = bw[j];
                 int q = (int)(x * id + 8.5f);
                 if (q < 0) q = 0; if (q > 15) q = 15;
                 float deq = ((float)q - 8.0f) * actual_d;
-                e_all[j] = x - deq;
-                w_all[j] = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
             }
-            float vesica_err = 0.0f, wave_err = 0.0f;
-            for (int j = 0; j < QK4_0 / 2; j++) {
-                float v      = e_all[j] + e_all[j + QK4_0 / 2];
-                float w_wave = e_all[j] - e_all[j + QK4_0 / 2];
-                float w_avg  = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f;
-                vesica_err += v * v * w_avg;
-                wave_err   += w_wave * w_wave * w_avg;
-            }
-            float err = 0.5f * (4.0f * vesica_err + wave_err);
-            cand_errors[blk][ci] = err;
         }
     }
     /* ── Phase 3: HPC graph — single quhit per block ── */
     int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
     for (int64_t i = 0; i < n_blocks; i++)
         best_candidate[i] = 11;  /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
@@ -1577,6 +1686,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
         HPCGraph *graph = hpc_create(n_sites);
         if (graph) {
             for (int64_t i = 0; i < n_sites; i++)
                 triality_dft(&graph->locals[i]);
@@ -1783,7 +1893,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                 global_best_c = c;
                             }
                         }
-                        if (global_best < best_err * 0.95f)
                             best_candidate[b] = global_best_c;
                         else
                             best_candidate[b] = best_c;
@@ -1802,11 +1912,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             {
                 #define Q4_BORN_SHOTS 128
-                /* Compute beam-search baseline RMSE for comparison */
-                float beam_total_err = 0.0f;
-                for (int64_t bi = 0; bi < n_blocks; bi++)
-                    beam_total_err += cand_errors[bi][best_candidate[bi]];
                 /* Build per-block CDFs from triality marginals */
                 unsigned int born_rng = 314159;
@@ -1815,6 +1920,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
                     tail_err_q4 += cand_errors[bi][best_candidate[bi]];
                 /* Sparse shot buffer: only track stride-sampled blocks */
                 int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
@@ -1892,6 +2010,24 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
         }
     }
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 4: Assemble blocks via least-squares scale extraction
      * ══════════════════════════════════════════════════════════════════ */
@@ -1917,13 +2053,18 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             }
             float num = 0.0f, den = 0.0f;
             for (int j = 0; j < QK4_0; j++) {
                 float q_centered = (float)qs_tmp[j] - 8.0f;
                 float w = (imat_importance) ?
                           imat_importance[blk * QK4_0 + j] : 1.0f;
                 num += w * bw[j] * q_centered;
                 den += w * q_centered * q_centered;
             }
             if (den > 1e-15f) {
                 float d_new = num / den;
@@ -1963,13 +2104,16 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
                 float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
                 float err = 0.0f;
                 for (int j = 0; j < QK4_0; j++) {
                     int q = (int)(bw[j] * trial_id + 8.5f);
                     if (q < 0) q = 0; if (q > 15) q = 15;
                     float deq = ((float)q - 8.0f) * trial_d;
                     float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
-                    err += (bw[j] - deq) * (bw[j] - deq) * w;
                 }
                 if (err < best_ulp_err) {
                     best_ulp_err = err;
                     best_d16 = ulp_candidates[ui];
@@ -2009,14 +2153,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
             float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
-            /* Simulated Annealing parameters */
-            float sa_temp = metric_cur * 0.05f;
-            float sa_decay = 0.90f;
             for (int pass = 0; pass < QK4_0; pass++) {
                 int   best_k     = -1;
                 int   best_q_alt = 0;
-                float best_delta = -1e30f;
                 for (int k = 0; k < QK4_0; k++) {
                     int q_cur = q_shaped[k];
@@ -2044,11 +2191,10 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                     }
                 }
-                if (best_k < 0) break;
-                /* SA Acceptance Rule */
-                if (best_delta > 0.0f || (sa_temp > 1e-7f && expf(best_delta / sa_temp) > ((float)rand()/RAND_MAX))) {
-                    q_shaped[best_k] = best_q_alt;
                     float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
                     float e_new_commit = bw[best_k] - deq_commit;
                     float de_commit    = e_new_commit - e_live[best_k];
@@ -2063,21 +2209,23 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                     v_live[pi_commit] = v_new_commit;
                     e_live[best_k]    = e_new_commit;
-                } else {
-                    if (sa_temp < 1e-7f) break;
                 }
-                sa_temp *= sa_decay;
             }
         }
         float err_base = 0.0f, err_shaped = 0.0f;
         for (int j = 0; j < QK4_0; j++) {
             float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
             float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
             float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
-            err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w;
-            err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w;
         }
         int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
         for (int j = 0; j < QK4_0 / 2; j++) {
@@ -2098,6 +2246,27 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     free(best_candidate);
 }
 static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                                       BlockQ2K *output, float *out_total_error,
                                       OptimizerMode opt_mode,
@@ -2108,15 +2277,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     float total_err = 0.0f;
     const int N_SUB = QK_K / 16;
-    init_scale_table();
     /* ── Outlier Clamping for WLS Seeds ──
      * Protects the Phase 1 greedy seed from being violently warped by extreme
      * >4.0 sigma outliers, which creates better centering for the grid search. */
-    double t_sum_sq = 0.0;
-    for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
-    float w_sigma = sqrtf(t_sum_sq / n_elements);
-    float clamp_val = w_sigma * 3.5f;
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 1: Greedy quantization — produce seed (d, dmin) per block
@@ -2152,7 +2338,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 if (v > clamp_val) v = clamp_val;
                 if (v < -clamp_val) v = -clamp_val;
                 sx_clipped[l] = v;
-                wt[l] = imp * sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
                 seeds[blk].sw[j] += wt[l];
             }
             seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
@@ -2172,11 +2366,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
      * ══════════════════════════════════════════════════════════════════ */
     /* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */
     static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
-        0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
-        0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
-        1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
-        1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
     };
     static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
         0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
@@ -2193,8 +2390,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
-    uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
-    uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;
     candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                             sizeof(float[TOTAL_SCALE_CANDIDATES]));
@@ -2202,10 +2397,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                             sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
     candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                             sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
-    candidate_Ls = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
-                            sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));
-    candidate_Lm = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
-                            sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));
     #pragma omp parallel for schedule(dynamic, 16)
     for (int64_t blk = 0; blk < n_blocks; blk++) {
@@ -2313,34 +2509,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                         trial_Lm[j] = (uint8_t)lm;
                     } else { trial_Lm[j] = 0; }
                 }
-                memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
-                memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);
-                /* Error evaluation MUST use the non-clipped original weights */
-                float e_all[QK_K], w_all[QK_K];
                 for (int i = 0; i < QK_K; i++) {
                     int jj   = i >> 4;
                     float d  = actual_dm * (float)trial_Ls[jj];
                     float m  = actual_mm * (float)trial_Lm[jj];
-                    float x  = block_x[i];
-                    w_all[i] = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
                     if (d < 1e-15f) {
-                        e_all[i] = x;
                     } else {
                         int q = gguf_nearest_int((x + m) / d);
                         if (q < 0) q = 0; if (q > 3) q = 3;
-                        e_all[i] = x - (d * (float)q - m);
                     }
                 }
-                float vesica_err = 0.0f, wave_err = 0.0f;
-                for (int i = 0; i < QK_K / 2; i++) {
-                    float v      = e_all[i] + e_all[i + QK_K / 2];
-                    float w_wave = e_all[i] - e_all[i + QK_K / 2];
-                    float w_avg  = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f;
-                    vesica_err  += v * v * w_avg;
-                    wave_err    += w_wave * w_wave * w_avg;
-                }
-                candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err);
             }
         }
     }
@@ -2701,7 +2895,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                             g_cand = vit_c;
                         }
                     }
-                    if (g_best < cur_err * 0.95f)
                         best_candidate[vit_b] = g_cand;
                 }
@@ -2773,6 +2967,10 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
             /* Bias applied to THIS block's WLS targets */
             float dc_bias       = (DC_DECAY * rolling_dc) / (float)QK_K;
             block_dc_bias[blk]  = dc_bias;
@@ -2783,8 +2981,8 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
-                float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
-                float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
                 for (k = 0; k < 16; k++) {
                     float x_adj = bx[16*j + k] - dc_bias;
                     int q = 0;
@@ -2835,12 +3033,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 adj_block_x[_i] = block_x[_i] - dc_adj;
         }
-        memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
-        memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);
         float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
         float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
         uint16_t prev_dm16 = 0, prev_mm16 = 0;
         for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
@@ -3130,7 +3328,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 int   jj  = i >> 4;
                 float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                 float m_s = mm * (float)(output[blk].scales[jj] >> 4);
-                float deq = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
                 /* Residual against the adjusted target (DC-corrected view) */
                 e_live[i] = adj_block_x[i] - deq;
             }
@@ -3197,19 +3397,24 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
             }
-            /* Choose base vs shaped by comparing MSE against original weights */
             float err_base = 0.0f, err_shaped = 0.0f;
             for (int i = 0; i < QK_K; i++) {
                 int   jj  = i >> 4;
                 float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                 float m_s = mm * (float)(output[blk].scales[jj] >> 4);
                 float w   = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
-                float deq_b = (d_s > 1e-15f) ? (d_s * (float)q_base_all[i]   - m_s) : 0.0f;
-                float deq_s = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
                 float xv    = block_x[i];   /* original weight for error report */
-                err_base   += (xv - deq_b) * (xv - deq_b) * w;
-                err_shaped += (xv - deq_s) * (xv - deq_s) * w;
             }
             {
                 int use_shaped = (err_shaped <= err_base);
                 for (int i = 0; i < QK_K; i++)
@@ -3278,6 +3483,462 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             }
         }
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; l++) {
                 output[blk].qs[j / 4 + l] = L[j + l]
@@ -3305,8 +3966,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     free(candidate_errors);
     free(candidate_d);
     free(candidate_dmin);
-    free(candidate_Ls);
-    free(candidate_Lm);
     free(best_candidate);
     if (out_total_error) *out_total_error = total_err;
@@ -3356,14 +4015,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
  * ═══════════════════════════════════════════════════════════════════════════ */
 static void print_progress_bar(int current, int total, const char *label,
-                                 clock_t start_time)
 {
     if (total <= 0) return;
     float pct = (float)current / (float)total;
     int bar_width = 40;
     int filled = (int)(pct * bar_width);
-    double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC;
     double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
     printf("\r  [");
@@ -3586,7 +4247,7 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
     int64_t total_elements_quantized = 0;
     int64_t total_bytes_quantized = 0;
     int64_t total_bytes_unquantized = 0;
-    clock_t quant_start = clock();
     for (int i = 0; i < total_tensors; i++) {
         int src = tensor_src_idx[i];
@@ -3607,7 +4268,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
             if (padded > n_elements) {
-                f32_data = realloc(f32_data, padded * sizeof(float));
                 for (int64_t j = n_elements; j < padded; j++)
                     f32_data[j] = 0.0f;
                 n_elements = padded;
@@ -3674,7 +4342,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
             if (padded > n_elements) {
-                f32_data = realloc(f32_data, padded * sizeof(float));
                 for (int64_t j = n_elements; j < padded; j++)
                     f32_data[j] = 0.0f;
                 n_elements = padded;
@@ -4030,7 +4705,7 @@ int main(int argc, char **argv)
     /* ── Phase 1: Load model ── */
     printf("  Phase 1: Loading model...\n");
-    clock_t t_start = clock();
     /* Determine if input is a file or directory */
     struct stat st;
@@ -4046,6 +4721,7 @@ int main(int argc, char **argv)
         /* Input is a directory — open all shards */
         mf = st_open_dir(input_path);
         strncpy(input_dir, input_path, sizeof(input_dir) - 2);
         int dlen = strlen(input_dir);
         if (dlen > 0 && input_dir[dlen - 1] != '/') {
             input_dir[dlen] = '/';
@@ -4071,6 +4747,7 @@ int main(int argc, char **argv)
         /* Extract directory from file path */
         strncpy(input_dir, input_path, sizeof(input_dir) - 1);
         char *last_slash = strrchr(input_dir, '/');
         if (last_slash) {
             *(last_slash + 1) = '\0';
@@ -4086,9 +4763,8 @@ int main(int argc, char **argv)
     st_multi_print_summary(mf);
-    clock_t t_load = clock();
-    printf("  Loaded in %.3f seconds\n\n",
-           (double)(t_load - t_start) / CLOCKS_PER_SEC);
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
@@ -4163,14 +4839,12 @@ int main(int argc, char **argv)
     /* ── Phase 3-5: Quantize and write GGUF ── */
     printf("  Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
-    clock_t t_quant_start = clock();
     int result = write_gguf(output_path, mf, &arch, tokenizer,
                               opt_mode, imatrix, verbose);
-    clock_t t_end = clock();
-    printf("  Total time: %.3f seconds\n\n",
-           (double)(t_end - t_start) / CLOCKS_PER_SEC);
     if (imatrix) imatrix_free(imatrix);
     if (tokenizer) tok_free(tokenizer);

     fseek(f, 0, SEEK_END);
     long size = ftell(f);
     fseek(f, 0, SEEK_SET);
+    if (size <= 0) { fclose(f); return cfg; }
+    char *json = (char *)malloc((size_t)size + 1);
     if (!json) { fclose(f); return cfg; }
+    size_t nread = fread(json, 1, (size_t)size, f);
+    json[nread] = '\0';
     fclose(f);
+    if (nread == 0) { free(json); return cfg; }
     cfg.valid = 1;
  * conservative too" — creating coherent precision allocation.
  * ═══════════════════════════════════════════════════════════════════════════ */
 /* ── Multi-quhit expanded scale table ──
  * Search grid: 24×24 = 576 (d, dmin) candidates
  * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
 #define N_CAND_M   24    /* dmin multiplier candidates (expanded) */
 #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
+/* ════════════════════════════════════════════════════════════════════════
+ * EXPERIMENTAL / CURRENTLY-UNUSED CODE PATHS
+ *
+ * Nothing in the live pipeline calls the legacy BP sensitivity graph
+ * (build_sensitivity_graph + compute_block_error_q2k + SCALE_TABLE) or the
+ * llm-compressor MSE grid search (mse_grid_search_q2k_subblock); the Shor /
+ * Viterbi path superseded them. They are preserved behind this flag instead
+ * of silently shipping as dead code that still costs an init pass.
+ * ════════════════════════════════════════════════════════════════════════ */
+#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
+#define SCALE_FACTOR_COUNT 6
+static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
+    0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
+};
 static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
 static int scale_table_initialized = 0;
     }
     scale_table_initialized = 1;
 }
+#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
 /* ═══════════════════════════════════════════════════════════════════════════
  * THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
         triality_init(&g->locals[i]);
 }
+#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
 /* ═══════════════════════════════════════════════════════════════════════════
  * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
  *
     *out_min = -cur_min;
     return cur_scale;
 }
+#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
 /* ═══════════════════════════════════════════════════════════════════════════
  * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
  * Quantize:  error Boltzmann amplitudes → optimal RMSE block
  * ═══════════════════════════════════════════════════════════════════════════ */
+/* ω₆ roots of unity for CZ phase lookup come from hpc_graph.h
+ * (HPC_W6_RE / HPC_W6_IM) — the file-local duplicates were unused. */
 static const double INV_SQRT6 = 0.40824829046386301637;  /* 1/√6 */
 /* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
     3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
 };
+/* ── Candidate-selection error metric (shared by Q4_0 and Q2_K) ──
+ * Candidates are now scored with the EXACT importance-weighted SSE
+ *     err = Σ_i w_i · (x_i − deq_i)²
+ * which is the same objective the final assembly/polish phases minimise and
+ * the same quantity reported as RMSE. The previous 2-point Hadamard form
+ * (0.5·vesica + 0.5·wave with pair-AVERAGED weights) is algebraically equal
+ * to Σ w̄·(e_i² + e_j²), i.e. it silently replaced per-element importance
+ * weights with the pair mean — a systematic mis-weighting whenever an
+ * imatrix is supplied. Scoring candidates on a different objective than the
+ * one being optimised mis-ranks them; aligning the two strictly lowers the
+ * final weighted RMSE (and is bit-identical when no imatrix is used). */
+/* ── Cross-block prior override ratio ──
+ * Q2_K and Q4_0 blocks are decoded INDEPENDENTLY by every GGUF runtime:
+ * there is no cross-block coupling in the dequantizer, so a smoothness
+ * prior that keeps a block on a worse candidate can only raise the true
+ * reconstruction RMSE. With 1.00f the per-block argmin over the candidate
+ * grid always wins (provably optimal seed for the assembly phase); the HPC
+ * graph/Viterbi/Born machinery still shapes ties and seeds the search.
+ * Set to e.g. 0.95f to restore the old 5%-hysteresis smoothness prior. */
+#ifndef HEX_GREEDY_OVERRIDE_RATIO
+#define HEX_GREEDY_OVERRIDE_RATIO 1.00f
+#endif
+/* fp16-ULP radius of the monotone (d, dmin) micro-search in the Phase-4.6
+ * polish (move 3). Larger radii let coordinate descent escape shallower
+ * local minima at O(radius²) extra cost per polish iteration. */
+#ifndef HEX_POLISH_ULP
+#define HEX_POLISH_ULP 4
+#endif
+/* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
+ *
+ * The quantity that matters downstream is the layer-output error
+ *     ε = Σᵢ eᵢ·aᵢ,   E[ε²] = eᵀRe,   R = activation second-moment matrix.
+ * Modelling R with three components — per-channel power (diagonal, ≈
+ * imatrix), a common mean μ (rank-1), and correlation c across the
+ * half-block fold (i ↔ i+n/2) — gives EXACTLY:
+ *
+ *   E[ε²] ≈ Σᵢ wᵢeᵢ²  +  μ²·(Σᵢeᵢ)²  +  c·Σ_pairs[(eᵢ+eⱼ)² − (eᵢ−eⱼ)²]
+ *                                          └── = vesica² − wave² = 4·eᵢeⱼ ──┘
+ *
+ * The vesica/wave decomposition is therefore the natural basis of the
+ * fold-correlation term: in-phase (vesica) error energy COSTS output
+ * accuracy, anti-phase (wave) error energy is CREDITED — it cancels in
+ * the dot product. (The old 0.5/0.5 scorer ADDED the two, which collapses
+ * to plain SSE; the spectrally meaningful combination SUBTRACTS them.)
+ * Every selection/acceptance stage scores blocks with
+ *
+ *   E(block) = Σᵢ wᵢeᵢ²
+ *            + (HEX_DC_LAMBDA / n) · (Σᵢeᵢ)²
+ *            + (HEX_VW_LAMBDA / n) · Σ_{i<n/2} [(eᵢ+eⱼ)² − (eᵢ−eⱼ)²],  j = i+n/2
+ *
+ * applied CONSISTENTLY to: Q2_K/Q4_0 candidate scoring, the closed-form
+ * (d, dmin) refit acceptance, the shaping accept guards, every polish
+ * move, and the Phase-4.7 floor — so no stage optimises a different
+ * objective than its acceptance test measures. The closed-form solvers
+ * incorporate the DC term as a rank-1 augmented observation and act as
+ * proposal generators; acceptance always uses the full extended E.
+ * λ = 0 on both knobs reduces exactly to the pure weighted-SSE objective.
+ * Positive-definiteness: the fold coupling adds ±2λ_vw/n off-diagonal —
+ * negligible against any sane wᵢ, so E stays a valid quadratic objective.
+ * NOTE: reported RMSE stays pure reconstruction RMSE; with λ > 0 a small
+ * RMSE increase is the *intended* price for lower output error. Per-block
+ * terms are a proxy for row-level structure (the API sees a flat stream);
+ * the Phase-3.9 rolling-DC pass handles cross-block linkage. */
+#ifndef HEX_DC_LAMBDA
+#define HEX_DC_LAMBDA 1.0f
+#endif
+#ifndef HEX_VW_LAMBDA
+#define HEX_VW_LAMBDA 1.0f
+#endif
+/* Default (1, 1): unit-strength spectral prior. Empirically (synthetic
+ * benchmark, identical inputs): lowers dot-product output error ~0.8-1.4%
+ * on both mean-only and fold-correlated activation models for ~+0.05%
+ * weight RMSE. The theoretically optimal λ grows with the deployment
+ * model's activation mean energy and row length (the per-block term
+ * under-counts cross-block row coupling); the synthetic sweep kept
+ * improving monotonically through λ = 4 at ~+0.1% RMSE. Set both to
+ * 0.0f to recover the exact pure weighted-SSE / minimum-RMSE pipeline. */
+/* Spectral (DC + vesica/wave) penalty of the extended objective for one
+ * block of residuals.
+ *
+ *   penalty = (HEX_DC_LAMBDA / n) · (Σᵢ eᵢ)²
+ *           + (HEX_VW_LAMBDA / n) · 4 · Σ_{i<n/2} eᵢ·eⱼ,   j = i + n/2
+ *
+ * e : residuals (original minus dequantised value), n entries.
+ * n : block length; element i is folded onto element i + n/2.
+ *
+ * Returns the value callers add to the block's weighted SSE. It can be
+ * negative: anti-phase fold pairs (eᵢ·eⱼ < 0) are credited, not charged.
+ *
+ * Edge cases: n <= 0 yields 0 rather than the NaN that 0·(λ/0) would
+ * produce; for odd n the unpaired last residual still counts toward the DC
+ * sum but has no fold partner. Even n is computed exactly as before. */
+static inline float hex_spectral_penalty(const float *e, int n)
+{
+    /* Both knobs off: the objective is pure weighted SSE. */
+    if (HEX_DC_LAMBDA == 0.0f && HEX_VW_LAMBDA == 0.0f) return 0.0f;
+    /* Nothing to score, and λ/n would be a division by zero. */
+    if (n <= 0) return 0.0f;
+    float dc = 0.0f, cross = 0.0f;
+    int half = n / 2;
+    for (int i = 0; i < half; i++) {
+        dc    += e[i] + e[i + half];
+        cross += e[i] * e[i + half];
+    }
+    /* Odd n: the last residual has no fold partner but is still DC error. */
+    if (n & 1) dc += e[n - 1];
+    return (HEX_DC_LAMBDA / (float)n) * dc * dc
+         + (HEX_VW_LAMBDA / (float)n) * 4.0f * cross;
+}
 static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                        BlockQ4_0 *output, float *out_total_error,
                                        const float *imat_importance, int verbose)
 {
     int64_t n_blocks = n_elements / QK4_0;
     float total_err = 0.0f;
+    (void)verbose;  /* kept for API symmetry with the Q2_K path */
     /* ── Phase 1: Greedy seed — compute scale per block ── */
     float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
     uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
         calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
+    #pragma omp parallel for schedule(dynamic, 64)
     for (int64_t blk = 0; blk < n_blocks; blk++) {
         const float *bw = weights + blk * QK4_0;
             if (wls_d < 1e-15f) break;
             float inv_d = 1.0f / wls_d;
             float num = 0.0f, den = 0.0f;
+            float dcS = 0.0f, dcQ = 0.0f;   /* DC rank-1 augmentation sums */
             for (int j = 0; j < QK4_0; j++) {
                 int q = (int)(bw[j] * inv_d + 8.5f);
                 if (q < 0) q = 0; if (q > 15) q = 15;
                           imat_importance[blk * QK4_0 + j] : 1.0f;
                 num += w * bw[j] * qc;
                 den += w * qc * qc;
+                dcS += bw[j];
+                dcQ += qc;
             }
+            /* DC term of the extended objective enters the normal equation
+             * as one extra observation (S ~ d·Q) of weight λ_dc/n. The
+             * vesica/wave term is handled by extended-E acceptance in the
+             * ULP search; the solver is a proposal generator. */
+            num += (HEX_DC_LAMBDA / (float)QK4_0) * dcS * dcQ;
+            den += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ * dcQ;
             if (den > 1e-15f) {
                 float d_new = num / den;
                 if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
             float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
+            /* ── Extended objective over all QK4_0 elements ──
+             * Exact importance-weighted SSE + DC + vesica/wave spectral
+             * penalty — the same objective every acceptance stage uses. */
+            float err = 0.0f;
+            float e_arr[QK4_0];
             for (int j = 0; j < QK4_0; j++) {
                 float x = bw[j];
                 int q = (int)(x * id + 8.5f);
                 if (q < 0) q = 0; if (q > 15) q = 15;
                 float deq = ((float)q - 8.0f) * actual_d;
+                float e = x - deq;
+                e_arr[j] = e;
+                float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
+                err += e * e * w;
             }
+            cand_errors[blk][ci] = err + hex_spectral_penalty(e_arr, QK4_0);
         }
     }
     /* ── Phase 3: HPC graph — single quhit per block ── */
     int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
+    int hpc_ran_q4 = 0;
     for (int64_t i = 0; i < n_blocks; i++)
         best_candidate[i] = 11;  /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
         HPCGraph *graph = hpc_create(n_sites);
         if (graph) {
+            hpc_ran_q4 = 1;
             for (int64_t i = 0; i < n_sites; i++)
                 triality_dft(&graph->locals[i]);
                                 global_best_c = c;
                             }
                         }
+                        if (global_best < best_err * HEX_GREEDY_OVERRIDE_RATIO)
                             best_candidate[b] = global_best_c;
                         else
                             best_candidate[b] = best_c;
             {
                 #define Q4_BORN_SHOTS 128
                 /* Build per-block CDFs from triality marginals */
                 unsigned int born_rng = 314159;
                 for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
                     tail_err_q4 += cand_errors[bi][best_candidate[bi]];
+                /* Beam-search baseline over the SAME set of blocks a Born
+                 * shot covers: stride representatives + tail. The previous
+                 * code summed the baseline over ALL blocks (including
+                 * mid-stride blocks the shots never touch), making shot_err
+                 * systematically smaller than the baseline and letting
+                 * strictly worse configurations be adopted whenever
+                 * stride > 1. */
+                float beam_total_err = tail_err_q4;
+                for (int64_t gi = 0; gi < graph_blocks; gi++) {
+                    int64_t rep = gi * stride;
+                    beam_total_err += cand_errors[rep][best_candidate[rep]];
+                }
                 /* Sparse shot buffer: only track stride-sampled blocks */
                 int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
         }
     }
+    /* Fallback when the HPC graph never ran (single block, or hpc_create
+     * failure): pick the per-block argmin over the candidate grid instead
+     * of silently leaving every block on the neutral ×1.00 candidate. */
+    if (!hpc_ran_q4) {
+        #pragma omp parallel for schedule(static)
+        for (int64_t blk = 0; blk < n_blocks; blk++) {
+            float best_e = cand_errors[blk][0];
+            int   best_c = 0;
+            for (int c = 1; c < Q4_N_CAND; c++) {
+                if (cand_errors[blk][c] < best_e) {
+                    best_e = cand_errors[blk][c];
+                    best_c = c;
+                }
+            }
+            best_candidate[blk] = best_c;
+        }
+    }
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 4: Assemble blocks via least-squares scale extraction
      * ══════════════════════════════════════════════════════════════════ */
             }
             float num = 0.0f, den = 0.0f;
+            float dc4S = 0.0f, dc4Q = 0.0f;
             for (int j = 0; j < QK4_0; j++) {
                 float q_centered = (float)qs_tmp[j] - 8.0f;
                 float w = (imat_importance) ?
                           imat_importance[blk * QK4_0 + j] : 1.0f;
                 num += w * bw[j] * q_centered;
                 den += w * q_centered * q_centered;
+                dc4S += bw[j];
+                dc4Q += q_centered;
             }
+            num += (HEX_DC_LAMBDA / (float)QK4_0) * dc4S * dc4Q;
+            den += (HEX_DC_LAMBDA / (float)QK4_0) * dc4Q * dc4Q;
             if (den > 1e-15f) {
                 float d_new = num / den;
                 float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
                 float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
                 float err = 0.0f;
+                float e_ulp[QK4_0];
                 for (int j = 0; j < QK4_0; j++) {
                     int q = (int)(bw[j] * trial_id + 8.5f);
                     if (q < 0) q = 0; if (q > 15) q = 15;
                     float deq = ((float)q - 8.0f) * trial_d;
                     float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
+                    e_ulp[j] = bw[j] - deq;
+                    err += e_ulp[j] * e_ulp[j] * w;
                 }
+                err += hex_spectral_penalty(e_ulp, QK4_0);
                 if (err < best_ulp_err) {
                     best_ulp_err = err;
                     best_d16 = ulp_candidates[ui];
             for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
             float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+            /* Deterministic greedy descent: only strict improvements.
+             * The previous SA acceptance called rand() inside an OpenMP
+             * parallel region (data race in the shared PRNG state, and
+             * non-reproducible output). Uphill moves were pointless anyway:
+             * the base-vs-shaped MSE guard below discards any shaped result
+             * that ends up worse, so accepted uphill excursions could only
+             * waste the pass budget or strand the descent. */
             for (int pass = 0; pass < QK4_0; pass++) {
                 int   best_k     = -1;
                 int   best_q_alt = 0;
+                float best_delta = 0.0f;   /* strictly positive threshold */
                 for (int k = 0; k < QK4_0; k++) {
                     int q_cur = q_shaped[k];
                     }
                 }
+                if (best_k < 0) break;   /* converged — no improving flip */
+                q_shaped[best_k] = best_q_alt;
+                {
                     float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
                     float e_new_commit = bw[best_k] - deq_commit;
                     float de_commit    = e_new_commit - e_live[best_k];
                     v_live[pi_commit] = v_new_commit;
                     e_live[best_k]    = e_new_commit;
                 }
             }
         }
         float err_base = 0.0f, err_shaped = 0.0f;
+        float e_gb[QK4_0], e_gs[QK4_0];
         for (int j = 0; j < QK4_0; j++) {
             float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
             float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
             float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
+            e_gb[j] = bw[j] - deq_b;
+            e_gs[j] = bw[j] - deq_s;
+            err_base += e_gb[j] * e_gb[j] * w;
+            err_shaped += e_gs[j] * e_gs[j] * w;
         }
+        err_base   += hex_spectral_penalty(e_gb, QK4_0);
+        err_shaped += hex_spectral_penalty(e_gs, QK4_0);
         int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
         for (int j = 0; j < QK4_0 / 2; j++) {
     free(best_candidate);
 }
+/* Recompute the sixteen 4-bit sub-block codes for one (d, dmin) candidate.
+ *
+ * scales, mins : per-sub-block float scales / mins from Phase 1 (16 each).
+ * actual_dm    : candidate super-block scale d.
+ * actual_mm    : candidate super-block min scale dmin.
+ * Ls, Lm       : outputs, 16 codes each, every code clamped to [0, 15].
+ *
+ * A code is nearest_int(value / scalar); when the scalar is not above 1e-15
+ * all codes on that axis are 0. Per the original note this is bit-identical
+ * to the Phase-2b candidate generation, so the codes never need storing. */
+static inline void hex_derive_subscales(const float *scales, const float *mins,
+                                        float actual_dm, float actual_mm,
+                                        uint8_t *Ls, uint8_t *Lm)
+{
+    const int d_usable = (actual_dm > 1e-15f);
+    const int m_usable = (actual_mm > 1e-15f);
+
+    for (int sub = 0; sub < 16; sub++) {
+        int scale_code = 0;
+        int min_code   = 0;
+
+        if (d_usable) {
+            scale_code = gguf_nearest_int(scales[sub] / actual_dm);
+            if (scale_code < 0)       scale_code = 0;
+            else if (scale_code > 15) scale_code = 15;
+        }
+        Ls[sub] = (uint8_t)scale_code;
+
+        if (m_usable) {
+            min_code = gguf_nearest_int(mins[sub] / actual_mm);
+            if (min_code < 0)       min_code = 0;
+            else if (min_code > 15) min_code = 15;
+        }
+        Lm[sub] = (uint8_t)min_code;
+    }
+}
 static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                                       BlockQ2K *output, float *out_total_error,
                                       OptimizerMode opt_mode,
     float total_err = 0.0f;
     const int N_SUB = QK_K / 16;
     /* ── Outlier Clamping for WLS Seeds ──
      * Protects the Phase 1 greedy seed from being violently warped by extreme
      * >4.0 sigma outliers, which creates better centering for the grid search. */
+    double t_sum_sq = 0.0, t_sum_4 = 0.0;
+    for (int64_t i = 0; i < n_elements; i++) {
+        double w2 = (double)weights[i] * (double)weights[i];
+        t_sum_sq += w2;
+        t_sum_4  += w2 * w2;
+    }
+    float w_sigma = sqrtf((float)(t_sum_sq / (double)n_elements));
+    /* ── Adaptive outlier clamp (kurtosis-driven) ──
+     * The fixed 3.5σ clamp suppressed the heavy-tail mass that dominates
+     * reconstruction error, inflating RMSE on near-Gaussian tensors that did
+     * not need clamping at all. Instead, gate the clamp on the tensor's raw
+     * kurtosis (Gaussian = 3): leave near-Gaussian tensors untouched and only
+     * apply a stabilising clamp to genuinely heavy-tailed tensors, where the
+     * final (d, dmin) refit later recovers fidelity against the UNCLIPPED
+     * weights anyway. */
+    double t_var  = t_sum_sq / (double)n_elements;
+    double t_kurt = (t_var > 1e-30) ? (t_sum_4 / (double)n_elements) / (t_var * t_var) : 3.0;
+    float clamp_sigma;
+    if      (t_kurt <= 6.0)  clamp_sigma = 1.0e9f;   /* ~Gaussian: effectively no clamp */
+    else if (t_kurt <= 20.0) clamp_sigma = 6.0f;     /* moderately heavy tails          */
+    else                     clamp_sigma = 4.0f;     /* very heavy tails: stabilise seed */
+    float clamp_val = w_sigma * clamp_sigma;
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 1: Greedy quantization — produce seed (d, dmin) per block
                 if (v > clamp_val) v = clamp_val;
                 if (v < -clamp_val) v = -clamp_val;
                 sx_clipped[l] = v;
+                /* Activation-aware weighting: an imatrix entry already encodes
+                 * E[a^2] for that column, which is the correct weight for
+                 * minimising output (dot-product) error. Use it directly rather
+                 * than re-multiplying by the |w| magnitude heuristic, which
+                 * double-counts magnitude. Without an imatrix, fall back to the
+                 * magnitude-relative heuristic. */
+                wt[l] = (imat_importance)
+                        ? imp
+                        : sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
                 seeds[blk].sw[j] += wt[l];
             }
             seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
      * ══════════════════════════════════════════════════════════════════ */
     /* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */
+    /* d is the sensitive axis, so concentrate resolution near 1.0 while
+     * keeping wide tails for blocks whose WLS seed is off. 1.000 stays at
+     * index 11 so the neutral-candidate fallback/init remains valid. */
     static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
+        0.780f, 0.835f, 0.880f, 0.915f, 0.943f, 0.963f,
+        0.978f, 0.988f, 0.994f, 0.997f, 0.999f, 1.000f,
+        1.002f, 1.005f, 1.011f, 1.021f, 1.035f, 1.054f,
+        1.080f, 1.115f, 1.160f, 1.215f, 1.275f, 1.340f
     };
     static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
         0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
     float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
     candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                             sizeof(float[TOTAL_SCALE_CANDIDATES]));
                             sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
     candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
                             sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
+    /* NOTE: the per-candidate sub-scale codes (Ls/Lm) are NOT stored.
+     * They are a pure function of (seeds[blk].scales/mins, candidate fp16
+     * d/dmin) and are re-derived where needed. Storing them cost
+     * n_blocks × 576 × 16 × 2 bytes ≈ 18 KB/superblock — multiple GB of
+     * peak RSS on large FFN tensors — for data used at exactly one index. */
     #pragma omp parallel for schedule(dynamic, 16)
     for (int64_t blk = 0; blk < n_blocks; blk++) {
                         trial_Lm[j] = (uint8_t)lm;
                     } else { trial_Lm[j] = 0; }
                 }
+                /* Error evaluation MUST use the non-clipped original weights.
+                 * Exact importance-weighted SSE plus the DC + vesica/wave
+                 * spectral penalty — the same extended objective the later
+                 * acceptance stages use (reported RMSE stays pure SSE). */
+                float err = 0.0f;
+                float e_arr[QK_K];
                 for (int i = 0; i < QK_K; i++) {
                     int jj   = i >> 4;
                     float d  = actual_dm * (float)trial_Ls[jj];
                     float m  = actual_mm * (float)trial_Lm[jj];
+                    float x  = block_x[i];
+                    float w  = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
+                    float e;
                     if (d < 1e-15f) {
+                        /* Decoder semantics: deq = d·ls·q − dmin·lm = −m here */
+                        e = x + m;
                     } else {
                         int q = gguf_nearest_int((x + m) / d);
                         if (q < 0) q = 0; if (q > 3) q = 3;
+                        e = x - (d * (float)q - m);
                     }
+                    e_arr[i] = e;
+                    err += e * e * w;
                 }
+                candidate_errors[blk][cidx] =
+                    err + hex_spectral_penalty(e_arr, QK_K);
             }
         }
     }
                             g_cand = vit_c;
                         }
                     }
+                    if (g_best < cur_err * HEX_GREEDY_OVERRIDE_RATIO)
                         best_candidate[vit_b] = g_cand;
                 }
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+            uint8_t dc_Ls[16], dc_Lm[16];
+            hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
+                                 dm0, mm0, dc_Ls, dc_Lm);
             /* Bias applied to THIS block's WLS targets */
             float dc_bias       = (DC_DECAY * rolling_dc) / (float)QK_K;
             block_dc_bias[blk]  = dc_bias;
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
+                float d_sub = dm0 * (float)dc_Ls[j];
+                float m_sub = mm0 * (float)dc_Lm[j];
                 for (k = 0; k < 16; k++) {
                     float x_adj = bx[16*j + k] - dc_bias;
                     int q = 0;
                 adj_block_x[_i] = block_x[_i] - dc_adj;
         }
         float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
         float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+        hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
+                             dm, mm, Ls_blk, Lm_blk);
         uint16_t prev_dm16 = 0, prev_mm16 = 0;
         for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
                 int   jj  = i >> 4;
                 float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                 float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+                /* Decoder semantics: deq = d_s·q − m_s, which is −m_s when
+                 * d_s == 0 (NOT 0 — the −dmin·lm term always applies). */
+                float deq = d_s * (float)q_shaped_all[i] - m_s;
                 /* Residual against the adjusted target (DC-corrected view) */
                 e_live[i] = adj_block_x[i] - deq;
             }
                 }
             }
+            /* Choose base vs shaped on the EXTENDED objective vs originals */
             float err_base = 0.0f, err_shaped = 0.0f;
+            float e_qb[QK_K], e_qs[QK_K];
             for (int i = 0; i < QK_K; i++) {
                 int   jj  = i >> 4;
                 float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
                 float m_s = mm * (float)(output[blk].scales[jj] >> 4);
                 float w   = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
+                float deq_b = d_s * (float)q_base_all[i]   - m_s;  /* −m_s when d_s==0 */
+                float deq_s = d_s * (float)q_shaped_all[i] - m_s;
                 float xv    = block_x[i];   /* original weight for error report */
+                e_qb[i] = xv - deq_b;
+                e_qs[i] = xv - deq_s;
+                err_base   += e_qb[i] * e_qb[i] * w;
+                err_shaped += e_qs[i] * e_qs[i] * w;
             }
+            err_base   += hex_spectral_penalty(e_qb, QK_K);
+            err_shaped += hex_spectral_penalty(e_qs, QK_K);
             {
                 int use_shaped = (err_shaped <= err_base);
                 for (int i = 0; i < QK_K; i++)
             }
         }
+        /* ── Final closed-form (d, dmin) refit against the UNCLIPPED weights ──
+         * (issues #2 / #5)
+         *
+         * Every earlier (d, dmin) solve fits the DC-adjusted, soft-clipped
+         * target and runs BEFORE the greedy descent and Floyd-Steinberg passes
+         * mutate the committed 2-bit codes. Once L[], and the 4-bit sub-block
+         * scale codes (Ls = scales & 0xF, Lm = scales >> 4), are final, the two
+         * fp16 scalars (d, dmin) that minimise the importance-weighted SSE
+         * (plus the DC term, entered as a rank-1 augmented observation) against
+         * the ORIGINAL weights have a closed form. Solve it and adopt it only
+         * when it lowers the extended block objective E — so E can never rise
+         * (and at λ = 0 neither can RMSE), and because the integer codes are
+         * held fixed, the vesica/wave/DC error shaping baked into them is
+         * preserved intact. */
+        {
+            double rSaa = 0, rSab = 0, rSbb = 0, rSxa = 0, rSxb = 0;
+            double rA = 0, rB = 0, rS = 0;     /* DC rank-1 augmentation */
+            for (int j = 0; j < N_SUB; j++) {
+                float ls_f = (float)(output[blk].scales[j] & 0xF);
+                float lm_f = (float)(output[blk].scales[j] >> 4);
+                for (int k = 0; k < 16; k++) {
+                    int   idx = 16 * j + k;
+                    float x   = block_x[idx];                 /* unclipped original */
+                    float w   = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
+                    float a   = ls_f * (float)L[idx];
+                    float b   = lm_f;
+                    rSaa += (double)w * a * a;
+                    rSab += (double)w * a * b;
+                    rSbb += (double)w * b * b;
+                    rSxa += (double)w * x * a;
+                    rSxb += (double)w * x * b;
+                    rA += a; rB += b; rS += x;
+                }
+            }
+            /* DC term as one augmented observation (S ~ A·d − B·m), weight
+             * λ_dc/n; vesica/wave handled by the extended-E acceptance. */
+            {
+                double rw = (double)HEX_DC_LAMBDA / (double)QK_K;
+                rSaa += rw * rA * rA;  rSab += rw * rA * rB;
+                rSbb += rw * rB * rB;  rSxa += rw * rS * rA;
+                rSxb += rw * rS * rB;
+            }
+            double rdet = rSaa * rSbb - rSab * rSab;
+            if (fabs(rdet) > 1e-30) {
+                double d_ref = (rSbb * rSxa - rSab * rSxb) / rdet;
+                double m_ref = (rSab * rSxa - rSaa * rSxb) / rdet;
+                if (d_ref > 0.0) {
+                    float dm_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_ref));
+                    float mm_try = (m_ref > 0.0)
+                                   ? gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)m_ref))
+                                   : mm;
+                    /* Extended-objective acceptance test vs original weights. */
+                    float err_cur = 0.0f, err_try = 0.0f;
+                    float e_rc[QK_K], e_rt[QK_K];
+                    for (int j = 0; j < N_SUB; j++) {
+                        float ls_f = (float)(output[blk].scales[j] & 0xF);
+                        float lm_f = (float)(output[blk].scales[j] >> 4);
+                        for (int k = 0; k < 16; k++) {
+                            int   idx = 16 * j + k;
+                            float x   = block_x[idx];
+                            float w   = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
+                            float qf  = (float)L[idx];
+                            float dc  = dm     * ls_f * qf - mm     * lm_f;
+                            float dt  = dm_try * ls_f * qf - mm_try * lm_f;
+                            e_rc[idx] = x - dc;
+                            e_rt[idx] = x - dt;
+                            err_cur += e_rc[idx] * e_rc[idx] * w;
+                            err_try += e_rt[idx] * e_rt[idx] * w;
+                        }
+                    }
+                    err_cur += hex_spectral_penalty(e_rc, QK_K);
+                    err_try += hex_spectral_penalty(e_rt, QK_K);
+                    if (err_try < err_cur) { dm = dm_try; mm = mm_try; }
+                }
+            }
+            output[blk].d    = gguf_fp32_to_fp16(dm);
+            output[blk].dmin = gguf_fp32_to_fp16(mm);
+        }
+        /* ══ PHASE 4.6: MONOTONE COORDINATE-DESCENT POLISH (RMSE-guaranteed) ══
+         *
+         * Objective-function mismatch fix: the final passes that commit the
+         * 2-bit codes — the 16×16 (ls, lm) sub-block search, the ±8 ULP
+         * (d, dmin) neighborhood search, and the greedy-descent error shaping
+         * — all minimise error against the DC-ADJUSTED target adj_block_x.
+         * The reported RMSE, however, is measured against the ORIGINAL
+         * weights. The codes are therefore stranded at the optimum of a
+         * SHIFTED objective, while only the scalar (d, dmin) refit above
+         * targets the true one (and it holds all codes frozen).
+         *
+         * This polish runs alternating coordinate descent on the TRUE
+         * objective — the extended E measured against the original weights
+         * (plain importance-weighted SSE when both λ are 0):
+         *
+         *   (1) For each 16-weight sub-block, an exact joint re-search of
+         *       (ls, lm) over the full 16×16 grid with per-weight optimal
+         *       q ∈ {0..3}, committed only on strict improvement of the
+         *       extended objective E. With λ_dc = λ_vw = 0 sub-blocks are
+         *       independent given (d, dmin); with spectral terms active the
+         *       coupling (DC: all subs; fold: sub j ↔ sub j⊕8) is handled
+         *       exactly via live residual bookkeeping.
+         *   (2) Closed-form weighted LS refit of the two fp16 scalars
+         *       (d, dmin) with all codes held fixed, committed only on
+         *       strict improvement (same guard as the refit above).
+         *
+         * All moves are accept-only-if-better on E ⇒ the extended block
+         * objective is monotonically non-increasing; at λ = 0 this reduces
+         * to RMSE-monotone (final RMSE can only go DOWN relative to the
+         * unpatched pipeline), at λ > 0 small RMSE giveback is permitted
+         * exactly where it buys dot-product error cancellation. The state space is finite
+         * (4-bit codes, fp16 scalars), so the loop terminates; in practice
+         * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
+         * into L survives wherever it is SSE-neutral, and is overridden
+         * only where it was costing true reconstruction error.            */
+        {
+            uint8_t pl_Ls[16], pl_Lm[16];
+            for (int j = 0; j < N_SUB; j++) {
+                pl_Ls[j] = output[blk].scales[j] & 0xF;
+                pl_Lm[j] = output[blk].scales[j] >> 4;
+            }
+            for (int pol_iter = 0; pol_iter < 6; pol_iter++) {
+                int pol_improved = 0;
+                /* ── (1) Exact per-sub-block (ls, lm, q) re-search on the
+                 * EXTENDED objective. Under the spectral terms sub-blocks
+                 * are no longer independent: every sub couples to all others
+                 * through the DC term and to its fold partner (sub j ⊕ 8,
+                 * i.e. weights i ↔ i+128) through vesica² − wave². The
+                 * search therefore keeps live residuals pe[] and scores each
+                 * candidate against the whole-block penalty with the partner
+                 * residuals held fixed — exact coordinate descent on E.    */
+                float pe[QK_K];
+                float sub_sse[16], sub_dc[16], pair_cross[8];
+                float dc_tot = 0.0f, cross_tot = 0.0f;
+                for (int j = 0; j < N_SUB; j++) {
+                    float d_sub = dm * (float)pl_Ls[j];
+                    float m_sub = mm * (float)pl_Lm[j];
+                    sub_sse[j] = 0.0f;
+                    sub_dc[j]  = 0.0f;
+                    for (int k = 0; k < 16; k++) {
+                        int   idx = 16 * j + k;
+                        float w = (imat_importance) ?
+                                  imat_importance[blk * QK_K + idx] : 1.0f;
+                        /* deq = d·ls·q − dmin·lm; equals −m_sub at ls==0 */
+                        float e = block_x[idx] - (d_sub * (float)L[idx] - m_sub);
+                        pe[idx]     = e;
+                        sub_sse[j] += e * e * w;
+                        sub_dc[j]  += e;
+                    }
+                    dc_tot += sub_dc[j];
+                }
+                for (int p = 0; p < 8; p++) {
+                    pair_cross[p] = 0.0f;
+                    for (int k = 0; k < 16; k++)
+                        pair_cross[p] += pe[16*p + k] * pe[16*(p+8) + k];
+                    cross_tot += pair_cross[p];
+                }
+                for (int j = 0; j < N_SUB; j++) {
+                    const float *sx  = block_x + 16 * j;
+                    int   pi         = j & 7;          /* fold-pair index    */
+                    int   pj         = j ^ 8;          /* partner sub-block  */
+                    const float *ppe = pe + 16 * pj;   /* partner residuals  */
+                    float dc_rest    = dc_tot    - sub_dc[j];
+                    float cross_rest = cross_tot - pair_cross[pi];
+                    /* Extended score of the CURRENT committed state */
+                    float best_sub = sub_sse[j]
+                        + (HEX_DC_LAMBDA / (float)QK_K) * dc_tot * dc_tot
+                        + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f * cross_tot;
+                    int     best_ls = -1, best_lm = 0;
+                    uint8_t best_q[16];
+                    float   best_e[16];
+                    float   best_sse = 0.0f, best_dcc = 0.0f, best_cxc = 0.0f;
+                    for (int try_ls = 0; try_ls <= 15; try_ls++) {
+                        float d_sub = dm * (float)try_ls;
+                        for (int try_lm = 0; try_lm <= 15; try_lm++) {
+                            float m_sub = mm * (float)try_lm;
+                            float sub_err = 0.0f, dcc = 0.0f, cxc = 0.0f;
+                            uint8_t q_loc[16];
+                            float   e_loc[16];
+                            int     aborted = 0;
+                            for (int k = 0; k < 16; k++) {
+                                float x = sx[k];
+                                float w = (imat_importance) ?
+                                          imat_importance[blk * QK_K + 16*j + k] : 1.0f;
+                                int q = 0;
+                                if (d_sub >= 1e-15f) {
+                                    q = gguf_nearest_int((x + m_sub) / d_sub);
+                                    if (q < 0) q = 0; if (q > 3) q = 3;
+                                }
+                                q_loc[k] = (uint8_t)q;
+                                /* deq = d·ls·q − dmin·lm; −m_sub at ls==0 */
+                                float e = x - (d_sub * (float)q - m_sub);
+                                e_loc[k] = e;
+                                sub_err += e * e * w;
+                                dcc     += e;
+                                cxc     += e * ppe[k];
+                                /* SSE-partial prune is a valid lower bound
+                                 * only while the spectral terms are ≥ 0,
+                                 * i.e. when the (signable) vw credit is off */
+                                if (HEX_VW_LAMBDA == 0.0f &&
+                                    sub_err >= best_sub) { aborted = 1; break; }
+                            }
+                            if (aborted) continue;
+                            float score = sub_err
+                                + (HEX_DC_LAMBDA / (float)QK_K)
+                                  * (dc_rest + dcc) * (dc_rest + dcc)
+                                + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f
+                                  * (cross_rest + cxc);
+                            if (score < best_sub) {
+                                best_sub = score;
+                                best_ls  = try_ls;
+                                best_lm  = try_lm;
+                                memcpy(best_q, q_loc, 16);
+                                memcpy(best_e, e_loc, sizeof(e_loc));
+                                best_sse = sub_err;
+                                best_dcc = dcc;
+                                best_cxc = cxc;
+                            }
+                        }
+                    }
+                    if (best_ls >= 0) {   /* strict improvement in E found */
+                        pl_Ls[j] = (uint8_t)best_ls;
+                        pl_Lm[j] = (uint8_t)best_lm;
+                        memcpy(L  + 16 * j, best_q, 16);
+                        memcpy(pe + 16 * j, best_e, sizeof(best_e));
+                        sub_sse[j]     = best_sse;
+                        sub_dc[j]      = best_dcc;
+                        pair_cross[pi] = best_cxc;
+                        dc_tot         = dc_rest    + best_dcc;
+                        cross_tot      = cross_rest + best_cxc;
+                        pol_improved = 1;
+                    }
+                }
+                /* ── (2) Closed-form (d, dmin) refit vs ORIGINAL, codes fixed ── */
+                {
+                    double pSaa = 0, pSab = 0, pSbb = 0, pSxa = 0, pSxb = 0;
+                    double pA = 0, pB = 0, pS = 0;  /* DC rank-1 augmentation */
+                    for (int j = 0; j < N_SUB; j++) {
+                        float ls_f = (float)pl_Ls[j];
+                        float lm_f = (float)pl_Lm[j];
+                        for (int k = 0; k < 16; k++) {
+                            int   idx = 16 * j + k;
+                            float x   = block_x[idx];
+                            float w   = (imat_importance) ?
+                                        imat_importance[blk * QK_K + idx] : 1.0f;
+                            float a   = ls_f * (float)L[idx];
+                            float b   = lm_f;
+                            pSaa += (double)w * a * a;
+                            pSab += (double)w * a * b;
+                            pSbb += (double)w * b * b;
+                            pSxa += (double)w * x * a;
+                            pSxb += (double)w * x * b;
+                            pA += a; pB += b; pS += x;
+                        }
+                    }
+                    {
+                        double pw = (double)HEX_DC_LAMBDA / (double)QK_K;
+                        pSaa += pw * pA * pA;  pSab += pw * pA * pB;
+                        pSbb += pw * pB * pB;  pSxa += pw * pS * pA;
+                        pSxb += pw * pS * pB;
+                    }
+                    double pdet = pSaa * pSbb - pSab * pSab;
+                    if (fabs(pdet) > 1e-30) {
+                        double d_ref = (pSbb * pSxa - pSab * pSxb) / pdet;
+                        double m_ref = (pSab * pSxa - pSaa * pSxb) / pdet;
+                        if (d_ref > 0.0) {
+                            float dm_try = gguf_fp16_to_fp32(
+                                               gguf_fp32_to_fp16((float)d_ref));
+                            float mm_try = (m_ref > 0.0)
+                                           ? gguf_fp16_to_fp32(
+                                                 gguf_fp32_to_fp16((float)m_ref))
+                                           : mm;
+                            float err_cur = 0.0f, err_try = 0.0f;
+                            float e_pc[QK_K], e_pt[QK_K];
+                            for (int j = 0; j < N_SUB; j++) {
+                                float ls_f = (float)pl_Ls[j];
+                                float lm_f = (float)pl_Lm[j];
+                                for (int k = 0; k < 16; k++) {
+                                    int   idx = 16 * j + k;
+                                    float x   = block_x[idx];
+                                    float w   = (imat_importance) ?
+                                                imat_importance[blk * QK_K + idx] : 1.0f;
+                                    float qf  = (float)L[idx];
+                                    float dc  = dm     * ls_f * qf - mm     * lm_f;
+                                    float dt  = dm_try * ls_f * qf - mm_try * lm_f;
+                                    e_pc[idx] = x - dc;
+                                    e_pt[idx] = x - dt;
+                                    err_cur += e_pc[idx] * e_pc[idx] * w;
+                                    err_try += e_pt[idx] * e_pt[idx] * w;
+                                }
+                            }
+                            err_cur += hex_spectral_penalty(e_pc, QK_K);
+                            err_try += hex_spectral_penalty(e_pt, QK_K);
+                            if (err_try < err_cur) {
+                                dm = dm_try;
+                                mm = mm_try;
+                                pol_improved = 1;
+                            }
+                        }
+                    }
+                }
+                if (!pol_improved) {
+                    /* ── (3) ±HEX_POLISH_ULP joint (d, dmin) micro-search vs ORIGINAL ──
+                     * The closed-form refit rounds its real-valued optimum to
+                     * fp16, which can land 1–2 ULP away from the best
+                     * representable pair (and the earlier ±8 ULP search ran
+                     * against the DC-shifted objective). With codes fixed,
+                     * scan the (2·HEX_POLISH_ULP+1)² fp16 neighborhood on the
+                     * true objective (weighted SSE plus the
+                     * hex_spectral_penalty term), accept only a strict
+                     * improvement, then loop once more so move (1) can
+                     * re-optimise codes for the new scalars.
+                     * Monotone ⇒ this step can never raise that objective
+                     * (plain RMSE is only guaranteed not to rise when the
+                     * spectral penalty is zero). */
+                    uint16_t base_d16 = gguf_fp32_to_fp16(dm);
+                    uint16_t base_m16 = gguf_fp32_to_fp16(mm);
+                    float cur_err = 0.0f;
+                    float e_u[QK_K];
+                    for (int j = 0; j < N_SUB; j++) {
+                        float d_sub = dm * (float)pl_Ls[j];
+                        float m_sub = mm * (float)pl_Lm[j];
+                        for (int k = 0; k < 16; k++) {
+                            int   idx = 16 * j + k;
+                            float w   = (imat_importance) ?
+                                        imat_importance[blk * QK_K + idx] : 1.0f;
+                            e_u[idx] = block_x[idx] -
+                                       (d_sub * (float)L[idx] - m_sub);
+                            cur_err += e_u[idx] * e_u[idx] * w;
+                        }
+                    }
+                    cur_err += hex_spectral_penalty(e_u, QK_K);
+                    float    best_err = cur_err;
+                    uint16_t best_d16 = base_d16, best_m16 = base_m16;
+                    for (int dd = -HEX_POLISH_ULP; dd <= HEX_POLISH_ULP; dd++) {
+                        int cd16 = (int)base_d16 + dd;
+                        if (cd16 < 0 || cd16 > 0x7BFF) continue;
+                        float t_dm = gguf_fp16_to_fp32((uint16_t)cd16);
+                        for (int dmm = -HEX_POLISH_ULP; dmm <= HEX_POLISH_ULP; dmm++) {
+                            if (dd == 0 && dmm == 0) continue;
+                            int cm16 = (int)base_m16 + dmm;
+                            if (cm16 < 0 || cm16 > 0x7BFF) continue;
+                            float t_mm = gguf_fp16_to_fp32((uint16_t)cm16);
+                            float err = 0.0f;
+                            /* SSE-partial prune valid only without the
+                             * signable vesica/wave credit */
+                            for (int j = 0;
+                                 j < N_SUB && (HEX_VW_LAMBDA != 0.0f ||
+                                               err < best_err); j++) {
+                                float d_sub = t_dm * (float)pl_Ls[j];
+                                float m_sub = t_mm * (float)pl_Lm[j];
+                                for (int k = 0; k < 16; k++) {
+                                    int   idx = 16 * j + k;
+                                    float w   = (imat_importance) ?
+                                                imat_importance[blk * QK_K + idx] : 1.0f;
+                                    e_u[idx] = block_x[idx] -
+                                               (d_sub * (float)L[idx] - m_sub);
+                                    err += e_u[idx] * e_u[idx] * w;
+                                }
+                            }
+                            if (HEX_DC_LAMBDA != 0.0f || HEX_VW_LAMBDA != 0.0f)
+                                err = (err < best_err || HEX_VW_LAMBDA != 0.0f)
+                                      ? err + hex_spectral_penalty(e_u, QK_K)
+                                      : err;
+                            if (err < best_err) {
+                                best_err = err;
+                                best_d16 = (uint16_t)cd16;
+                                best_m16 = (uint16_t)cm16;
+                            }
+                        }
+                    }
+                    if (best_d16 != base_d16 || best_m16 != base_m16) {
+                        dm = gguf_fp16_to_fp32(best_d16);
+                        mm = gguf_fp16_to_fp32(best_m16);
+                        pol_improved = 1;
+                    }
+                }
+                if (!pol_improved) break;   /* converged on true objective */
+            }
+            /* Write back polished codes and scalars */
+            for (int j = 0; j < N_SUB; j++)
+                output[blk].scales[j] = pl_Ls[j] | (pl_Lm[j] << 4);
+            output[blk].d    = gguf_fp32_to_fp16(dm);
+            output[blk].dmin = gguf_fp32_to_fp16(mm);
+        }
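+        /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
+         *
+         * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
+         * encodable configuration (fp16 d/dmin + derived Ls/Lm + nearest
+         * rounding vs the original weights). The multi-stage assembly
+         * (DC-shifted WLS, shaping, diffusion, polish) usually improves on
+         * its seed, but each stage optimises a slightly different objective
+         * and coordinate descent can land in a worse basin. Compare the
+         * finished block against the best raw candidate and fall back when
+         * the pipeline ended up worse — guaranteeing
+         *     final weighted SSE ≤ min_c candidate_errors[blk][c].
+         *
+         * NOTE(review): the finished block is scored below as weighted SSE
+         * plus hex_spectral_penalty(), while the candidate scores are
+         * described above as plain weighted SSE. The bound on SSE alone
+         * therefore holds only while that penalty is ≥ 0 — confirm this is
+         * intended when the signed vesica/wave credit is enabled.           */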
+        {
+            float fin_err = 0.0f;
+            float e_f[QK_K];
+            for (int j = 0; j < N_SUB; j++) {
+                float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
+                float m_sub = mm * (float)(output[blk].scales[j] >> 4);
+                for (int k = 0; k < 16; k++) {
+                    int   idx = 16 * j + k;
+                    float w   = (imat_importance) ?
+                                imat_importance[blk * QK_K + idx] : 1.0f;
+                    e_f[idx] = block_x[idx] -
+                               (d_sub * (float)L[idx] - m_sub);
+                    fin_err += e_f[idx] * e_f[idx] * w;
+                }
+            }
+            fin_err += hex_spectral_penalty(e_f, QK_K);
+            float g_best = candidate_errors[blk][0];
+            int   g_cand = 0;
+            for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
+                if (candidate_errors[blk][c] < g_best) {
+                    g_best = candidate_errors[blk][c];
+                    g_cand = c;
+                }
+            }
+            if (g_best < fin_err) {
+                /* Rebuild the block exactly as the candidate was scored */
+                float c_dm = gguf_fp16_to_fp32(candidate_d   [blk][g_cand]);
+                float c_mm = gguf_fp16_to_fp32(candidate_dmin[blk][g_cand]);
+                uint8_t c_Ls[16], c_Lm[16];
+                hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
+                                     c_dm, c_mm, c_Ls, c_Lm);
+                for (int j = 0; j < N_SUB; j++) {
+                    float d_sub = c_dm * (float)c_Ls[j];
+                    float m_sub = c_mm * (float)c_Lm[j];
+                    for (int k = 0; k < 16; k++) {
+                        int idx = 16 * j + k;
+                        int q = 0;
+                        if (d_sub >= 1e-15f) {
+                            q = gguf_nearest_int((block_x[idx] + m_sub) / d_sub);
+                            if (q < 0) q = 0; if (q > 3) q = 3;
+                        }
+                        L[idx] = (uint8_t)q;
+                    }
+                    output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
+                }
+                dm = c_dm;  mm = c_mm;
+                output[blk].d    = candidate_d   [blk][g_cand];
+                output[blk].dmin = candidate_dmin[blk][g_cand];
+            }
+        }
         for (int j = 0; j < QK_K; j += 128) {
             for (int l = 0; l < 32; l++) {
                 output[blk].qs[j / 4 + l] = L[j + l]
     free(candidate_errors);
     free(candidate_d);
     free(candidate_dmin);
     free(best_candidate);
     if (out_total_error) *out_total_error = total_err;
  * ═══════════════════════════════════════════════════════════════════════════ */
 static void print_progress_bar(int current, int total, const char *label,
+                                 time_t start_time)
 {
     if (total <= 0) return;
     float pct = (float)current / (float)total;
     int bar_width = 40;
     int filled = (int)(pct * bar_width);
+    /* Wall-clock elapsed: clock() sums CPU time over all OpenMP threads,
+     * which inflated elapsed/ETA by ~the thread count on multicore. */
+    double elapsed = difftime(time(NULL), start_time);
     double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
     printf("\r  [");
     int64_t total_elements_quantized = 0;
     int64_t total_bytes_quantized = 0;
     int64_t total_bytes_unquantized = 0;
+    time_t quant_start = time(NULL);
     for (int i = 0; i < total_tensors; i++) {
         int src = tensor_src_idx[i];
             int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
             if (padded > n_elements) {
+                float *grown = realloc(f32_data, padded * sizeof(float));
+                if (!grown) {
+                    fprintf(stderr, "\n  ERROR: Out of memory padding '%s'\n",
+                            ti->name);
+                    free(f32_data);
+                    continue;
+                }
+                f32_data = grown;
                 for (int64_t j = n_elements; j < padded; j++)
                     f32_data[j] = 0.0f;
                 n_elements = padded;
             int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
             if (padded > n_elements) {
+                float *grown = realloc(f32_data, padded * sizeof(float));
+                if (!grown) {
+                    fprintf(stderr, "\n  ERROR: Out of memory padding '%s'\n",
+                            ti->name);
+                    free(f32_data);
+                    continue;
+                }
+                f32_data = grown;
                 for (int64_t j = n_elements; j < padded; j++)
                     f32_data[j] = 0.0f;
                 n_elements = padded;
     /* ── Phase 1: Load model ── */
     printf("  Phase 1: Loading model...\n");
+    time_t t_start = time(NULL);
     /* Determine if input is a file or directory */
     struct stat st;
         /* Input is a directory — open all shards */
         mf = st_open_dir(input_path);
         strncpy(input_dir, input_path, sizeof(input_dir) - 2);
+        input_dir[sizeof(input_dir) - 2] = '\0';
         int dlen = strlen(input_dir);
         if (dlen > 0 && input_dir[dlen - 1] != '/') {
             input_dir[dlen] = '/';
         /* Extract directory from file path */
         strncpy(input_dir, input_path, sizeof(input_dir) - 1);
+        input_dir[sizeof(input_dir) - 1] = '\0';
         char *last_slash = strrchr(input_dir, '/');
         if (last_slash) {
             *(last_slash + 1) = '\0';
     st_multi_print_summary(mf);
+    time_t t_load = time(NULL);
+    printf("  Loaded in %.0f seconds\n\n", difftime(t_load, t_start));
     /* ── Phase 2: Detect architecture ── */
     printf("  Phase 2: Detecting model architecture...\n");
     /* ── Phase 3-5: Quantize and write GGUF ── */
     printf("  Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
     int result = write_gguf(output_path, mf, &arch, tokenizer,
                               opt_mode, imatrix, verbose);
+    /* Wall-clock total: clock() sums CPU time over all OpenMP threads */
+    time_t t_end = time(NULL);
+    printf("  Total time: %.0f seconds\n\n", difftime(t_end, t_start));
     if (imatrix) imatrix_free(imatrix);
     if (tokenizer) tok_free(tokenizer);