diff --git "a/hexstate_quantize.c" "b/hexstate_quantize.c"
--- "a/hexstate_quantize.c"
+++ "b/hexstate_quantize.c"
@@ -1,5 +1,5 @@
 /* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_quantize.c — HExState GGUF Quantizer
+ * hexstate_quantize.c — HexState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
@@ -15,32 +15,32 @@
  * This tool adapts the HExState HPC Ouroboros factoring engine for
  * LLM weight quantization. The core mathematical machinery is reused:
  *
- *   Factoring Domain          →  Quantization Domain
- *   ─────────────────────────────────────────────────
- *   HPCGraph + CZ edges       →  Block sensitivity graph
- *   Complex Amplitude BP      →  Importance propagation
- *   MCMC period sampler       →  Optimal scale search
- *   try_period() validation   →  Error bound checking
- *   LLL lattice reduction     →  (future) Adaptive bit allocation
+ * Factoring Domain          →  Quantization Domain
+ * ─────────────────────────────────────────────────
+ * HPCGraph + CZ edges       →  Block sensitivity graph
+ * Complex Amplitude BP      →  Importance propagation
+ * MCMC period sampler       →  Optimal scale search
+ * try_period() validation   →  Error bound checking
+ * LLL lattice reduction     →  (future) Adaptive bit allocation
  *
  * Additional techniques ported from llm-compressor:
- *   MSE grid search           →  Optimal min/max range shrinking
- *   Importance matrix (imatrix) →  Per-channel error weighting
+ * MSE grid search           →  Optimal min/max range shrinking
+ * Importance matrix (imatrix) →  Per-channel error weighting
  *
  * Build:
- *   make -f Makefile.quantize
+ * make -f Makefile.quantize
  *
  * Usage:
- *   ./hexstate_quantize <input> <output.gguf> [options]
+ * ./hexstate_quantize <input> <output.gguf> [options]
  *
  * Input can be:
- *   - A single .safetensors file
- *   - A model directory containing sharded .safetensors files
+ * - A single .safetensors file
+ * - A model directory containing sharded .safetensors files
  *
  * Options:
- *   --optimizer hpc|mse|hybrid   Scale optimization strategy (default: hybrid)
- *   --imatrix <file>             Importance matrix for weighted quantization
- *   --verbose                    Per-block diagnostics
+ * --optimizer hpc|mse|hybrid   Scale optimization strategy (default: hybrid)
+ * --imatrix <file>             Importance matrix for weighted quantization
+ * --verbose                    Per-block diagnostics
  * ═══════════════════════════════════════════════════════════════════════════ */
 
 #include <stdio.h>
@@ -555,8 +555,8 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
  * SHOULD THIS TENSOR BE QUANTIZED?
  *
  * Decision rules:
- *   - Quantize: weight matrices (2D, large)
- *   - Keep F32: norms, biases, embeddings, 1D tensors
+ * - Quantize: weight matrices (2D, large)
+ * - Keep F32: norms, biases, embeddings, 1D tensors
  * ═══════════════════════════════════════════════════════════════════════════ */
 
 static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
@@ -620,12 +620,12 @@ static int is_attention_tensor(const char *gguf_name)
  * For Q2_K: 256-weight superblocks.
  *
  * The 6 values per site correspond to 6 candidate scale factors:
- *   v=0: scale * 0.85  (aggressive, high compression)
- *   v=1: scale * 0.90
- *   v=2: scale * 0.95
- *   v=3: scale * 1.00  (standard)
- *   v=4: scale * 1.05
- *   v=5: scale * 1.10  (conservative, less compression error)
+ * v=0: scale * 0.85  (aggressive, high compression)
+ * v=1: scale * 0.90
+ * v=2: scale * 0.95
+ * v=3: scale * 1.00  (standard)
+ * v=4: scale * 1.05
+ * v=5: scale * 1.10  (conservative, less compression error)
  *
  * BP propagates: "if your neighbor block is sensitive, you should be
  * conservative too" — creating coherent precision allocation.
@@ -637,12 +637,12 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
 };
 
 /* ── Multi-quhit expanded scale table ──
- * Search grid: 10×10 = 100 (d, dmin) candidates
- * Quhit encoding: bin 10 → 6 for D=6 quhits (BP operates on 6-state marginals)
- * Beam search: operates on all 100 candidates directly */
+ * Search grid: 24×24 = 576 (d, dmin) candidates
+ * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
+ * Beam search: operates on all 576 candidates directly */
 #define QUHITS_PER_BLOCK  2
-#define N_CAND_D   16    /* d multiplier candidates (was 10) */
-#define N_CAND_M   16    /* dmin multiplier candidates (was 10) */
+#define N_CAND_D   24    /* number of d multiplier candidates */
+#define N_CAND_M   24    /* number of dmin multiplier candidates */
 #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
 
 static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
@@ -650,7 +650,7 @@ static int scale_table_initialized = 0;
 
 static void init_scale_table(void) {
     if (scale_table_initialized) return;
-    /* 100 candidates: uniform spacing centered on 1.0 */
+    /* TOTAL_SCALE_CANDIDATES multipliers, uniformly spaced over [0.50, 1.50] */
     for (int i = 0; i < TOTAL_SCALE_CANDIDATES; i++) {
         SCALE_TABLE[i] = 0.50f + (float)i * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1));
     }
@@ -695,20 +695,15 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
 /* ═══════════════════════════════════════════════════════════════════════════
  * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
  *
- * powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4
- * where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation:
- * x^0.4 ≈ sqrtf(cbrtf(x^2 × x^2)) but simpler: x^2 × sqrtf(sqrtf(x))
- * is close enough for error norm purposes (~1% relative error).
+ * powf() costs ~50-100 cycles. Use log2f+exp2f (~25 cycles) for the
+ * exact x^2.4 = x^2 × 2^(0.4·log2(x)) computation instead.
  * ═══════════════════════════════════════════════════════════════════════════ */
 static inline float fast_pow_2_4(float x)
 {
-    /* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5))
-     * x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain:
-     * x^0.4 ≈ sqrtf(sqrtf(x)) × x^(-0.1) — too complex.
-     * Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12)
-     * Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */
+    /* Returns |x|^2.4 as x^2 × 2^(0.4 × log2|x|); x == 0 yields 0 (exp2f(-inf) = 0).
+     * fabsf keeps negative inputs finite: log2f of a negative value is NaN. */
     float x2 = x * x;
-    return x2 * sqrtf(cbrtf(x2));  /* x^2 × (x^2)^(1/6) ≈ x^(2+1/3) ≈ x^2.333 */
+    return x2 * exp2f(0.4f * log2f(fabsf(x)));  /* x^2 × |x|^0.4 = |x|^2.4 */
 }
 
 /* Compute the Q2_K sub-block reconstruction error for a block at a given
@@ -743,16 +738,16 @@ static float compute_block_error_q2k(const float *weights, int block_size,
 }
 
 /* Build multi-quhit HPC sensitivity graph.
- * 2 quhits per block → 36 scale candidates per block.
+ * 2 quhits per block; the 576 (24×24) per-block scale candidates are projected onto them.
  *
  * Graph layout: sites [0..2*n-1] where:
- *   site 2*i     = coarse quhit for block i
- *   site 2*i + 1 = fine quhit for block i
+ * site 2*i     = coarse quhit for block i
+ * site 2*i + 1 = fine quhit for block i
  *
  * Edges:
- *   Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling
- *   Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor
- *                CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */
+ * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling
+ * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor
+ * Inter-block: CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */
 static HPCGraph *build_sensitivity_graph(const float *weights,
                                            int64_t n_elements,
                                            int block_size,
@@ -774,13 +769,13 @@ static HPCGraph *build_sensitivity_graph(const float *weights,
     for (int64_t i = 0; i < n_sites; i++)
         triality_dft(&graph->locals[i]);
 
-    /* Compute errors for all 36 scale candidates per block,
+    /* Compute errors for all candidates per block,
      * then project onto coarse (quhit 0) and fine (quhit 1) marginals */
     for (int64_t i = 0; i < graph_blocks; i++) {
         int64_t block_idx = i * stride;
         const float *block_weights = weights + block_idx * block_size;
 
-        /* Evaluate all 36 candidates */
+        /* Evaluate all candidates */
         float errors[TOTAL_SCALE_CANDIDATES];
         float min_err = 1e30f;
         for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
@@ -873,12 +868,12 @@ static HPCGraph *build_sensitivity_graph(const float *weights,
  * For a Q2_K sub-block, progressively shrink the min/max range to find
  * the candidate that minimizes weighted reconstruction error.
  *
- *   for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
- *     candidate_min = p * min
- *     candidate_max = p * max
- *     error = ||x - quantize(x, candidate_min, candidate_max)||^norm
- *     if error < best: update best
- *     else: patience--; if patience == 0: break
+ *   for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
+ *       candidate_min = p * min
+ *       candidate_max = p * max
+ *       error = ||x - quantize(x, candidate_min, candidate_max)||^norm
+ *       if error < best: update best
+ *       else: patience--; if patience == 0: break
  *
  * This is a direct C port of llm-compressor's _grid_search_mse.
  * ═══════════════════════════════════════════════════════════════════════════ */
@@ -977,7 +972,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
     float cur_scale = best_scale;
     if (cur_scale > 1e-15f) {
         float iscale = 1.0f / cur_scale;
-        for (int itry = 0; itry < 3; itry++) {
+        for (int itry = 0; itry < 5; itry++) {
             float sumlx = 0;
             int suml2 = 0;
             for (int i = 0; i < n; i++) {
@@ -992,8 +987,9 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
             float sum = 0;
             for (int i = 0; i < n; i++)
                 sum += x[i] - cur_scale * L[i];
-            cur_min = 0.7f * cur_min + 0.3f * sum / n;
-            if (cur_min > 0) cur_min = 0;
+            /* True coordinate-descent optimal: min* = sum/n (no momentum).
+             * Clamp to ≤ 0 since min must be non-positive by convention. */
+            cur_min = fminf(0.0f, sum / n);
             if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale;
         }
     }
@@ -1006,12 +1002,12 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
  * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
  *
  * Two-phase approach:
- *   Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
- *            This produces per-sub-block (scale, min) with 16-step search.
- *   Phase B: HPC BP refines the superblock-level d/dmin rounding.
- *            6 candidate (d, dmin) pairs are tested; BP finds the one
- *            where the GLOBAL reconstruction error is minimized via
- *            constructive interference of per-sub-block phase coherence.
+ * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants).
+ *          This produces per-sub-block (scale, min) with 16-step search.
+ * Phase B: HPC BP refines the superblock-level d/dmin rounding.
+ *          6 candidate (d, dmin) pairs are tested; BP finds the one
+ *          where the GLOBAL reconstruction error is minimized via
+ *          constructive interference of per-sub-block phase coherence.
  * ═══════════════════════════════════════════════════════════════════════════ */
 
 /* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants).
@@ -1174,22 +1170,22 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x,
  * Instead of iterative message-passing (BP), this uses the EXACT sequential
  * measurement protocol from Shor's algorithm:
  *
- *   For each block k (MSB → LSB):
- *     1. Compute feed-forward phase correction from previously measured blocks
- *     2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
- *     3. Bake C_k into locals: α(d) *= C_k(d)
- *     4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
- *     5. Apply IDFT6 in-place: interference creates peaks at optimal scales
- *     6. Born rule measurement → select optimal scale candidate
- *     7. Collapse site + absorb edge weights into neighbors (back-action)
+ * For each block k (MSB → LSB):
+ * 1. Compute feed-forward phase correction from previously measured blocks
+ * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
+ * 3. Bake C_k into locals: α(d) *= C_k(d)
+ * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
+ * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
+ * 6. Born rule measurement → select optimal scale candidate
+ * 7. Collapse site + absorb edge weights into neighbors (back-action)
  *
  * This IS the quantum Fourier transform that creates constructive
  * interference at the optimal RMSE configuration, exactly as Shor's
  * algorithm creates interference at the correct period.
  *
  * Domain mapping:
- *   Factoring: oracle phase 2π×d×c_k/N → period r
- *   Quantize:  error Boltzmann amplitudes → optimal RMSE block
+ * Factoring: oracle phase 2π×d×c_k/N → period r
+ * Quantize:  error Boltzmann amplitudes → optimal RMSE block
  * ═══════════════════════════════════════════════════════════════════════════ */
 
 /* ω₆ roots of unity for CZ phase lookup */
@@ -1280,17 +1276,17 @@ static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome)
  * Ported 1:1 from tesseract_factor.c lines 2343-2500.
  *
  * Measures sites MSB→LSB. For each site k:
- *   1. Compute feed-forward phase correction θ_k from previously measured sites
- *   2. Compute neighbor contribution C_k(d) analytically
- *   3. Bake C_k into locals
- *   4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
- *   5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
- *   6. Compute |β(v)|² as measurement probabilities
- *   7. Sample/argmax → outcome
- *   8. Collapse + back-action via shor_collapse_site()
+ * 1. Compute feed-forward phase correction θ_k from previously measured sites
+ * 2. Compute neighbor contribution C_k(d) analytically
+ * 3. Bake C_k into locals
+ * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
+ * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
+ * 6. Compute |β(v)|² as measurement probabilities
+ * 7. Sample/argmax → outcome
+ * 8. Collapse + back-action via shor_collapse_site()
  *
  * Returns: marginals are written into marg_out[n_sites][6].
- *          measured_out[n_sites] receives the measurement outcomes.
+ * measured_out[n_sites] receives the measurement outcomes.
  * ═══════════════════════════════════════════════════════════════════════════ */
 static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
                                 double (*marg_out)[6], int *measured_out,
@@ -1446,25 +1442,27 @@ static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
  * HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors)
  *
  * Same architecture as Q2_K HPC pipeline, but simpler:
- *   - One parameter per block (scale d only, no dmin)
- *   - Single quhit per block (6 states)
- *   - 10 candidate scales → bin to 6 for BP
- *   - 12-beam Hensel search for globally optimal configuration
- *   - Triality 3-view marginals for robust scoring
+ * - One parameter per block (scale d only, no dmin)
+ * - Single quhit per block (6 states)
+ * - 24 candidate scales → bin to 6 for BP
+ * - 48-beam Hensel search for globally optimal configuration
+ * - Triality 3-view marginals for robust scoring
  *
  * Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d
  * ═══════════════════════════════════════════════════════════════════════════ */
 
-#define Q4_N_CAND 16  /* scale candidates for Q4_0 (was 10) */
-#define Q4_N_BEAMS 24 /* beam width (was 12) */
+#define Q4_N_CAND 24  /* scale candidates per Q4_0 block; sizes both tables below */
+#define Q4_N_BEAMS 48 /* beam width of the Q4_0 beam search */
 
-/* Tight neighborhood around WLS optimum: ±10% */
+/* Scale multipliers around the WLS optimum: 0.85x to 1.30x, densest near 1.0 */
 static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
-    0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f, 0.975f, 0.985f,
-    0.995f, 1.005f, 1.015f, 1.025f, 1.035f, 1.050f, 1.070f, 1.100f
+    0.850f, 0.880f, 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f,
+    0.975f, 0.985f, 0.995f, 1.000f, 1.005f, 1.015f, 1.025f, 1.035f,
+    1.050f, 1.070f, 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
 };
 static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
-    0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,  /* four consecutive candidates per quhit state */
+    3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
 };
 
 static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
@@ -1474,6 +1472,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     int64_t n_blocks = n_elements / QK4_0;
     float total_err = 0.0f;
 
+    /* ── Compute tensor RMS (sigma about zero; mean not subtracted) for SA temperature ── */
+    double t_sum_sq = 0.0;
+    for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
+    float w_sigma = sqrtf(t_sum_sq / n_elements);
+
     /* ── Phase 1: Greedy seed — compute scale per block ── */
     float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
 
@@ -1501,7 +1504,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
 
         /* ── Step 2a: WLS solve to find optimal d* ── */
         float wls_d = greedy_d[blk];
-        for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
+        uint16_t prev_wls_d16 = 0;
+        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
             if (wls_d < 1e-15f) break;
             float inv_d = 1.0f / wls_d;
             float num = 0.0f, den = 0.0f;
@@ -1519,6 +1523,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
                     wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new));
             }
+            uint16_t cur_wls_d16 = gguf_fp32_to_fp16(wls_d);
+            if (cur_wls_d16 == prev_wls_d16) break;  /* converged in FP16 */
+            prev_wls_d16 = cur_wls_d16;
         }
 
         /* ── Step 2b: Generate candidates centered on WLS optimum ── */
@@ -1529,36 +1536,30 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             cand_d16[blk][ci] = d16;
 
             float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
-            float err = 0.0f;
-
-            for (int j = 0; j < QK4_0; j += 6) {
-                int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j);
-                int half_g = g_len / 2;
-                float e_cur[6], w_cur[6];
-                
-                for (int kk = 0; kk < g_len; kk++) {
-                    int idx = j + kk;
-                    float x = bw[idx];
-                    int q = (int)(x * id + 8.5f);
-                    if (q < 0) q = 0; if (q > 15) q = 15;
-                    float deq = ((float)q - 8.0f) * actual_d;
-                    e_cur[kk] = x - deq;
-                    w_cur[kk] = (imat_importance) ? imat_importance[blk * QK4_0 + idx] : 1.0f;
-                }
-                
-                /* Decompose into vesica (DC) and wave (AC) components */
-                float vesica_err = 0.0f, wave_err = 0.0f;
-                for (int p = 0; p < half_g; p++) {
-                    float v = e_cur[p] + e_cur[p + half_g];
-                    float w_wave = e_cur[p] - e_cur[p + half_g];
-                    float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
-                    vesica_err += v * v * w_avg;
-                    wave_err += w_wave * w_wave * w_avg;
-                }
-                /* Triality weighting: penalize vesica 4×, wave 1×.
-                 * Factor of 0.5 keeps scale consistent with standard MSE. */
-                err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
+
+            /* ── Single-unit D₆ error over all QK4_0 (32) elements ──
+             * Antipodal pairing: (j, j + QK4_0/2) for j in [0, QK4_0/2).
+             * Treating the whole block as one unit eliminates boundary
+             * artefacts from the old 6-element chunks and correctly captures
+             * long-range error correlations within the block. */
+            float e_all[QK4_0], w_all[QK4_0];
+            for (int j = 0; j < QK4_0; j++) {
+                float x = bw[j];
+                int q = (int)(x * id + 8.5f);
+                if (q < 0) q = 0; if (q > 15) q = 15;
+                float deq = ((float)q - 8.0f) * actual_d;
+                e_all[j] = x - deq;
+                w_all[j] = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
             }
+            float vesica_err = 0.0f, wave_err = 0.0f;
+            for (int j = 0; j < QK4_0 / 2; j++) {
+                float v      = e_all[j] + e_all[j + QK4_0 / 2];
+                float w_wave = e_all[j] - e_all[j + QK4_0 / 2];
+                float w_avg  = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f;
+                vesica_err += v * v * w_avg;
+                wave_err   += w_wave * w_wave * w_avg;
+            }
+            float err = 0.5f * (4.0f * vesica_err + wave_err);
             cand_errors[blk][ci] = err;
         }
     }
@@ -1566,7 +1567,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
     /* ── Phase 3: HPC graph — single quhit per block ── */
     int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
     for (int64_t i = 0; i < n_blocks; i++)
-        best_candidate[i] = 10;  /* Q4_NEIGHBOR_MULTS[10] = 1.00 */
+        best_candidate[i] = 11;  /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
 
     if (n_blocks >= 2) {
         float temperature = 0.5f;
@@ -1752,7 +1753,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                     group_cidx = history[curr_hist].cand_idx;
                     curr_hist = history[curr_hist].parent_idx;
                 } else {
-                    group_cidx = 10;
+                    group_cidx = 11;
                 }
 
                 if (stride <= 1) {
@@ -1797,17 +1798,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
              * The beam search found the MAP candidate sequence. But the
              * triality marginals encode quantum phase-coherent structure
              * that a greedy beam can miss.
-             *
-             * Like tesseract_factor's MCMC period recovery (lines 1920-1964):
-             *   1. Take N independent Born samples from triality marginals
-             *   2. Each sample → full candidate assignment across all blocks
-             *   3. Evaluate actual RMSE for each assignment
-             *   4. Keep assignment with lowest total RMSE
-             *
-             * Reuses the EXISTING converged Möbius sheet — zero new BP.
              * ══════════════════════════════════════════════════════════════ */
             {
-                #define Q4_BORN_SHOTS 64
+                #define Q4_BORN_SHOTS 128
 
                 /* Compute beam-search baseline RMSE for comparison */
                 float beam_total_err = 0.0f;
@@ -1847,7 +1840,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                         /* Find the best candidate WITHIN this quhit bin */
                         int64_t blk = gi * stride;
                         float best_bin_err = 1e30f;
-                        int best_bin_cand = 10; /* default */
+                        int best_bin_cand = 11; /* default */
                         for (int ci = 0; ci < Q4_N_CAND; ci++) {
                             if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) {
                                 if (cand_errors[blk][ci] < best_bin_err) {
@@ -1872,6 +1865,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 free(shot_sparse_q4);
             }
 
+            /* Born refinement pass: non-stride blocks were set during beam
+             * traceback and never revisited by Born shots.  For each such block
+             * pick the lowest-error candidate within the same quhit bin that
+             * the winning Born shot chose for its stride-representative. */
+            if (stride > 1) {
+                for (int64_t b = 0; b < n_blocks; b++) {
+                    if (b % stride == 0) continue;
+                    int64_t rep = (b / stride) * stride;
+                    int target_bin = Q4_CAND_TO_QUHIT[best_candidate[rep]];
+                    float best_b_err = 1e30f;
+                    int  best_b_cand = best_candidate[rep];
+                    for (int ci = 0; ci < Q4_N_CAND; ci++) {
+                        if (Q4_CAND_TO_QUHIT[ci] != target_bin) continue;
+                        if (cand_errors[b][ci] < best_b_err) {
+                            best_b_err  = cand_errors[b][ci];
+                            best_b_cand = ci;
+                        }
+                    }
+                    best_candidate[b] = best_b_cand;
+                }
+            }
+
             free(marg);
             hpc_destroy(graph);
         }
@@ -1879,22 +1894,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
 
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 4: Assemble blocks via least-squares scale extraction
-     *
-     * The factorer assembles a frequency register from BP marginals,
-     * then EXTRACTS the exact period via continued fractions.
-     *
-     * We do the same: the beam search / Born shots selected a grid
-     * candidate (the "assembled frequency"). Now we EXTRACT the exact
-     * optimal FP16 scale via weighted least-squares (the "CF step").
-     *
-     * For Q4_0:  d_optimal = Σ(w_j × x_j × q̃_j) / Σ(w_j × q̃_j²)
-     * where q̃_j = (q_j - 8) and q_j is quantized at the grid scale.
-     *
-     * This iterates: quantize at d_init → compute d_optimal → re-quantize
-     * → re-compute until convergence. 3 iterations suffice since Q4_0
-     * has only 16 levels — the assignment stabilizes immediately.
-     *
-     * The grid gave us 16 possible scales. This gives us 65,536 (all FP16).
      * ══════════════════════════════════════════════════════════════════ */
 
     #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
@@ -1905,13 +1904,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
         /* Start from the grid-selected scale (the "assembled frequency") */
         float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
 
-        /* Analog assembly: iterate to full convergence.
-         * 5 iterations for stable (d, q-values) coupling. */
+        /* Analog assembly: iterate to full convergence. */
         for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
             if (d_current < 1e-15f) break;
             float id = 1.0f / d_current;
 
-            /* Quantize at current scale */
             int qs_tmp[QK4_0];
             for (int j = 0; j < QK4_0; j++) {
                 int q = (int)(bw[j] * id + 8.5f);
@@ -1919,8 +1916,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                 qs_tmp[j] = q;
             }
 
-            /* Weighted least-squares: d = Σ(w × x × q̃) / Σ(w × q̃²)
-             * where q̃ = q - 8 (centered quantized value) */
             float num = 0.0f, den = 0.0f;
             for (int j = 0; j < QK4_0; j++) {
                 float q_centered = (float)qs_tmp[j] - 8.0f;
@@ -1932,7 +1927,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
 
             if (den > 1e-15f) {
                 float d_new = num / den;
-                /* Clamp magnitude to prevent runaway (Q4_0 d can be negative) */
                 float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
                 if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) {
                     uint16_t d16 = gguf_fp32_to_fp16(d_new);
@@ -1941,28 +1935,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             }
         }
 
-        /* ── FP16 ULP neighborhood search + sign-flip exploration ──
-         * The WLS solve found the continuous-optimal d. But FP16 truncation
-         * may shift the optimum. Try ±4 ULP around d in FP16 space, plus
-         * the negated scale, and pick the one with minimum reconstruction error. */
+        /* ── FP16 ULP neighborhood search + sign-flip exploration ── */
         {
             uint16_t base_d16 = gguf_fp32_to_fp16(d_current);
             uint16_t best_d16 = base_d16;
             float best_ulp_err = 1e30f;
 
-            /* Try ±4 ULP neighborhood + sign flip = up to 17 candidates */
-            uint16_t ulp_candidates[17];
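+            /* ±8 ULP around d, then around -d (17 slots each). NOTE(review): assuming IEEE half bits, the 0..0x7BFF check rejects every sign-bit-set pattern, so only one sign ever contributes — confirm */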
+            uint16_t ulp_candidates[35];
             int n_ulp = 0;
-            for (int delta = -4; delta <= 4; delta++) {
+            for (int delta = -8; delta <= 8; delta++) {
                 int cand16 = (int)base_d16 + delta;
-                if (cand16 >= 0 && cand16 <= 0x7BFF) /* valid positive FP16 */
+                if (cand16 >= 0 && cand16 <= 0x7BFF)
                     ulp_candidates[n_ulp++] = (uint16_t)cand16;
             }
-            /* Sign-flipped d: negate and try ±0 ULP */
             {
                 float neg_d = -d_current;
                 uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d);
-                ulp_candidates[n_ulp++] = neg_d16;
+                for (int delta = -8; delta <= 8; delta++) {
+                    int cand16 = (int)neg_d16 + delta;
+                    if (cand16 >= 0 && cand16 <= 0x7BFF)
+                        ulp_candidates[n_ulp++] = (uint16_t)cand16;
+                }
             }
 
             for (int ui = 0; ui < n_ulp; ui++) {
@@ -1984,18 +1978,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             d_current = gguf_fp16_to_fp32(best_d16);
         }
 
-        /* Store the extracted optimal FP16 scale */
         output[blk].d = gguf_fp32_to_fp16(d_current);
         float actual_d = d_current;
         float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;
 
-        /* ── D₆ Hadamard Error Shaping for Q4_0 ──
-         * 32 elements per block = 5 full D₆ groups of 6 + 2 tail.
-         * Apply the same antipodal fold as Q2_K: minimize vesica energy
-         * to push quantization noise into wave (high-frequency) modes
-         * that cancel in dot products. */
-
-        /* Step 1: Standard nearest-rounding as baseline */
+        /* ── D₆ Hadamard Error Shaping with Simulated Annealing ── */
         int q_base[QK4_0], q_shaped[QK4_0];
         float q_cont[QK4_0];
         for (int j = 0; j < QK4_0; j++) {
@@ -2006,73 +1993,83 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
         }
         memcpy(q_shaped, q_base, QK4_0 * sizeof(int));
 
-        /* Step 2: D₆ greedy flipping on 5 groups of 6 */
-        for (int g = 0; g < 5; g++) {
-            int g_off = g * 6;
-
-            for (int pass = 0; pass < 6; pass++) {
-                int best_k = -1;
-                int best_q_alt = 0;
-                float best_delta = 0.0f;
-
-                /* Current group errors */
-                float e_cur[6];
-                for (int kk = 0; kk < 6; kk++) {
-                    float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d;
-                    e_cur[kk] = bw[g_off+kk] - deq;
-                }
-
-                /* Current D₆ metric: vesica energy + DC² */
-                float vesica_cur = 0.0f, dc_cur = 0.0f;
-                for (int p = 0; p < 3; p++) {
-                    float v = e_cur[p] + e_cur[p+3];
-                    vesica_cur += v * v;
-                }
-                for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
-                float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
-
-                /* Try flipping each element */
-                for (int k = 0; k < 6; k++) {
-                    int idx = g_off + k;
-                    int q_cur = q_shaped[idx];
+        {
+            float e_live[QK4_0];
+            for (int j = 0; j < QK4_0; j++) {
+                float deq = ((float)q_shaped[j] - 8.0f) * actual_d;
+                e_live[j] = bw[j] - deq;
+            }
 
-                    int q_try;
-                    if (q_cont[idx] - (float)q_cur >= 0) {
-                        q_try = q_cur + 1;
-                    } else {
-                        q_try = q_cur - 1;
-                    }
+            float v_live[QK4_0 / 2];
+            float vesica_cur = 0.0f, dc_cur = 0.0f;
+            for (int j = 0; j < QK4_0 / 2; j++) {
+                v_live[j] = e_live[j] + e_live[j + QK4_0 / 2];
+                vesica_cur += v_live[j] * v_live[j];
+            }
+            for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
+            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+
+            /* Simulated Annealing parameters */
+            float sa_temp = metric_cur * 0.05f;
+            float sa_decay = 0.90f;
+
+            for (int pass = 0; pass < QK4_0; pass++) {
+                int   best_k     = -1;
+                int   best_q_alt = 0;
+                float best_delta = -1e30f;
+
+                for (int k = 0; k < QK4_0; k++) {
+                    int q_cur = q_shaped[k];
+                    int q_try = (q_cont[k] - (float)q_cur >= 0.0f)
+                                ? q_cur + 1 : q_cur - 1;
                     if (q_try < 0 || q_try > 15) continue;
 
-                    /* Alt errors */
-                    float e_alt[6];
-                    for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
                     float deq_try = ((float)q_try - 8.0f) * actual_d;
-                    e_alt[k] = bw[idx] - deq_try;
+                    float e_new   = bw[k] - deq_try;
+                    float de      = e_new - e_live[k];
 
-                    /* Alt D₆ metric */
-                    float vesica_alt = 0.0f, dc_alt = 0.0f;
-                    for (int p = 0; p < 3; p++) {
-                        float v = e_alt[p] + e_alt[p+3];
-                        vesica_alt += v * v;
-                    }
-                    for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
+                    int pi = (k < QK4_0 / 2) ? k : k - QK4_0 / 2;
+                    float v_old = v_live[pi];
+                    float v_new = v_old + de;
+
+                    float vesica_alt = vesica_cur - v_old * v_old + v_new * v_new;
+                    float dc_alt     = dc_cur     + de;
                     float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;
 
                     float delta = metric_cur - metric_alt;
                     if (delta > best_delta) {
                         best_delta = delta;
-                        best_k = k;
+                        best_k     = k;
                         best_q_alt = q_try;
                     }
                 }
 
                 if (best_k < 0) break;
-                q_shaped[g_off + best_k] = best_q_alt;
+
+                /* SA Acceptance Rule */
+                if (best_delta > 0.0f || (sa_temp > 1e-7f && expf(best_delta / sa_temp) > ((float)rand()/RAND_MAX))) {
+                    q_shaped[best_k] = best_q_alt;
+                    float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
+                    float e_new_commit = bw[best_k] - deq_commit;
+                    float de_commit    = e_new_commit - e_live[best_k];
+
+                    int pi_commit = (best_k < QK4_0 / 2) ? best_k : best_k - QK4_0 / 2;
+                    float v_old_commit = v_live[pi_commit];
+                    float v_new_commit = v_old_commit + de_commit;
+
+                    vesica_cur += v_new_commit * v_new_commit - v_old_commit * v_old_commit;
+                    dc_cur     += de_commit;
+                    metric_cur  = 4.0f * vesica_cur + dc_cur * dc_cur;
+
+                    v_live[pi_commit] = v_new_commit;
+                    e_live[best_k]    = e_new_commit;
+                } else {
+                    if (sa_temp < 1e-7f) break;
+                }
+                sa_temp *= sa_decay;
             }
         }
 
-        /* Step 3: Error comparison — keep shaped only if MSE doesn't worsen >5% */
         float err_base = 0.0f, err_shaped = 0.0f;
         for (int j = 0; j < QK4_0; j++) {
             float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
@@ -2081,9 +2078,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
             err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w;
             err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w;
         }
-        int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base;
+        int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
 
-        /* Pack nibbles and compute error */
         for (int j = 0; j < QK4_0 / 2; j++) {
             int q0 = q_final[j];
             int q1 = q_final[j + QK4_0/2];
@@ -2114,15 +2110,22 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
 
     init_scale_table();
 
+    /* ── Outlier Clamping for WLS Seeds ──
+     * Clamps seed inputs to ±3.5× the tensor-wide RMS so that extreme outliers
+     * do not warp the Phase 1 greedy seed; this centers the grid search better. */
+    double t_sum_sq = 0.0;
+    for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
+    float w_sigma = sqrtf(t_sum_sq / n_elements);
+    float clamp_val = w_sigma * 3.5f;
+
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 1: Greedy quantization — produce seed (d, dmin) per block
      * ══════════════════════════════════════════════════════════════════ */
 
-    /* Store Phase A/B results for all blocks */
     typedef struct {
-        float dm, mm;                 /* greedy d, dmin (fp32) */
-        uint16_t d_fp16, dmin_fp16;   /* greedy d, dmin (fp16) */
-        uint8_t Ls[16], Lm[16];       /* sub-block scale/min indices */
+        float dm, mm;
+        uint16_t d_fp16, dmin_fp16;
+        uint8_t Ls[16], Lm[16];
         float scales[16], mins[16], sw[16];
     } BlockSeed;
 
@@ -2138,15 +2141,21 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i];
         float sigma2 = sumx2 / (float)QK_K;
 
+        /* Phase 1 WLS uses clamped values to generate stable seeds */
+        float sx_clipped[16];
         for (int j = 0; j < N_SUB; j++) {
             const float *sx = block_x + 16 * j;
             seeds[blk].sw[j] = 0;
             for (int l = 0; l < 16; l++) {
                 float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f;
-                wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]);
+                float v = sx[l];
+                if (v > clamp_val) v = clamp_val;
+                if (v < -clamp_val) v = -clamp_val;
+                sx_clipped[l] = v;
+                wt[l] = imp * sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
                 seeds[blk].sw[j] += wt[l];
             }
-            seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt,
+            seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
                                         L + 16 * j, &seeds[blk].mins[j], Laux);
         }
 
@@ -2160,36 +2169,30 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
 
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 2: WLS-Optimal Candidate Generation
-     *
-     * Instead of a fixed multiplier grid centered on greedy seeds,
-     * we first solve a 3-iteration Weighted Least-Squares to find
-     * the true optimal (d*, dmin*) per block, then generate the
-     * 16×16 candidate grid centered on THOSE optimal values.
-     * This makes the candidate space data-driven, not fabricated.
      * ══════════════════════════════════════════════════════════════════ */
 
-    /* Wide neighborhood around WLS optimum: ±20% with asymmetric spacing
-     * — finer near 1.0 for precision, wider at edges for exploration.
-     * Critical for large-σ weights where the optimal (d,dmin) may be
-     * far from the WLS seed. */
+    /* Expanded neighborhood around WLS optimum: -25% to +30% with 24 candidates */
     static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
-        0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
-        1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
+        0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
+        0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
+        1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
+        1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
     };
     static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
-        0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
-        1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
+        0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
+        0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
+        1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
+        1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
     };
-    /* Map 16 candidates → 6 quhit states for BP encoding */
-    static const int CAND_TO_QUHIT[16] = {
-        0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5
+    /* Map 24 candidates → 6 quhit states for BP encoding */
+    static const int CAND_TO_QUHIT[24] = {
+        0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+        3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
     };
 
-    /* candidate_errors[blk][256] — weighted MSE per candidate */
     float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
     uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
-    /* Per-candidate Ls/Lm — must recompute for each (d, dmin) */
     uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
     uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;
 
@@ -2208,18 +2211,23 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     for (int64_t blk = 0; blk < n_blocks; blk++) {
         const float *block_x = weights + blk * QK_K;
 
-        /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ──
-         * Seed from Phase 1 greedy, iterate 3× to converge.
-         * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
-         * This is a 2-variable WLS: minimize Σ w×(x - d×a + dmin×b)² */
+        /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── */
         float wls_dm = seeds[blk].dm;
         float wls_mm = seeds[blk].mm;
         uint8_t wls_Ls[16], wls_Lm[16];
         memcpy(wls_Ls, seeds[blk].Ls, 16);
         memcpy(wls_Lm, seeds[blk].Lm, 16);
 
+        /* Build a hard-clamped (±clamp_val) copy of the block for WLS internal stability */
+        float clipped_block_x[QK_K];
+        for(int i=0; i<QK_K; i++) {
+            float v = block_x[i];
+            if (v > clamp_val) v = clamp_val;
+            if (v < -clamp_val) v = -clamp_val;
+            clipped_block_x[i] = v;
+        }
+
         for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
-            /* Quantize all elements at current (wls_dm, wls_mm) */
             uint8_t L_wls[QK_K];
             for (int j = 0; j < N_SUB; j++) {
                 float d_sub = wls_dm * (float)wls_Ls[j];
@@ -2229,19 +2237,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     continue;
                 }
                 for (int k = 0; k < 16; k++) {
-                    int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
+                    int q = gguf_nearest_int((clipped_block_x[16*j+k] + m_sub) / d_sub);
                     if (q < 0) q = 0; if (q > 3) q = 3;
                     L_wls[16*j+k] = (uint8_t)q;
                 }
             }
 
-            /* Accumulate 2×2 normal equations */
             double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
             for (int j = 0; j < N_SUB; j++) {
                 float ls_f = (float)wls_Ls[j];
                 float lm_f = (float)wls_Lm[j];
                 for (int k = 0; k < 16; k++) {
-                    float x = block_x[16*j+k];
+                    float x = clipped_block_x[16*j+k];
                     float w = (imat_importance) ?
                               imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                     float a = ls_f * (float)L_wls[16*j+k];
@@ -2254,19 +2261,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
             }
 
-            /* Solve via Cramer's rule */
             double det = Saa * Sbb - Sab * Sab;
             if (fabs(det) > 1e-30) {
                 double d_new  = (Sbb * Sxa - Sab * Sxb) / det;
                 double dm_new = (Sab * Sxa - Saa * Sxb) / det;
-                /* Clamp: positive and within 4× of seed (prevent runaway) */
                 if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10))
                     wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
                 if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10))
                     wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
             }
 
-            /* Re-derive Ls/Lm for updated (d*, dmin*) */
             for (int j = 0; j < N_SUB; j++) {
                 if (wls_dm > 1e-15f) {
                     int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm);
@@ -2281,9 +2285,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             }
         }
 
-        /* ── Step 2b: Generate 16×16 candidates centered on WLS optimum ──
-         * Grid is now centered on (wls_dm, wls_mm) not (greedy_dm, greedy_mm).
-         * Tighter spacing because we're already near the true minimum. */
+        /* ── Step 2b: Generate Candidates ── */
         for (int di = 0; di < N_CAND_D; di++) {
             float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di];
             uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm);
@@ -2298,87 +2300,58 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 candidate_d[blk][cidx] = trial_d16;
                 candidate_dmin[blk][cidx] = trial_dmin16;
 
-                /* Recompute Ls/Lm for THIS candidate dm/mm */
                 uint8_t trial_Ls[16], trial_Lm[16];
                 for (int j = 0; j < N_SUB; j++) {
                     if (actual_dm > 1e-15f) {
                         int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm);
                         if (ls < 0) ls = 0; if (ls > 15) ls = 15;
                         trial_Ls[j] = (uint8_t)ls;
-                    } else {
-                        trial_Ls[j] = 0;
-                    }
+                    } else { trial_Ls[j] = 0; }
                     if (actual_mm > 1e-15f) {
                         int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm);
                         if (lm < 0) lm = 0; if (lm > 15) lm = 15;
                         trial_Lm[j] = (uint8_t)lm;
-                    } else {
-                        trial_Lm[j] = 0;
-                    }
+                    } else { trial_Lm[j] = 0; }
                 }
                 memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
                 memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);
 
-                /* Fully re-quantize and measure weighted MSE */
-                float err = 0.0f;
-                for (int j = 0; j < N_SUB; j++) {
-                    float d = actual_dm * (float)trial_Ls[j];
-                    float m = actual_mm * (float)trial_Lm[j];
+                /* Error evaluation MUST use the non-clipped original weights */
+                float e_all[QK_K], w_all[QK_K];
+                for (int i = 0; i < QK_K; i++) {
+                    int jj   = i >> 4;
+                    float d  = actual_dm * (float)trial_Ls[jj];
+                    float m  = actual_mm * (float)trial_Lm[jj];
+                    float x  = block_x[i]; 
+                    w_all[i] = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
                     if (d < 1e-15f) {
-                        for (int k = 0; k < 16; k++) {
-                            float x = block_x[16 * j + k];
-                            float w = (imat_importance) ?
-                                      imat_importance[blk * QK_K + 16 * j + k] : 1.0f;
-                            err += x * x * w;
-                        }
-                        continue;
-                    }
-                    for (int k = 0; k < 16; k += 6) {
-                        int g_len = (k + 6 <= 16) ? 6 : (16 - k);
-                        int half_g = g_len / 2;
-                        float e_cur[6], w_cur[6];
-                        
-                        for (int kk = 0; kk < g_len; kk++) {
-                            int idx = 16 * j + k + kk;
-                            float x = block_x[idx];
-                            int q = gguf_nearest_int((x + m) / d);
-                            if (q < 0) q = 0; if (q > 3) q = 3;
-                            float deq = d * (float)q - m;
-                            e_cur[kk] = x - deq;
-                            w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
-                        }
-                        
-                        /* Decompose into vesica and wave */
-                        float vesica_err = 0.0f, wave_err = 0.0f;
-                        for (int p = 0; p < half_g; p++) {
-                            float v = e_cur[p] + e_cur[p + half_g];
-                            float w_wave = e_cur[p] - e_cur[p + half_g];
-                            float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
-                            vesica_err += v * v * w_avg;
-                            wave_err += w_wave * w_wave * w_avg;
-                        }
-                        /* Triality weighting: penalize vesica 4×, wave 1× */
-                        err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
+                        e_all[i] = x;
+                    } else {
+                        int q = gguf_nearest_int((x + m) / d);
+                        if (q < 0) q = 0; if (q > 3) q = 3;
+                        e_all[i] = x - (d * (float)q - m);
                     }
                 }
-                candidate_errors[blk][cidx] = err;
+                float vesica_err = 0.0f, wave_err = 0.0f;
+                for (int i = 0; i < QK_K / 2; i++) {
+                    float v      = e_all[i] + e_all[i + QK_K / 2];
+                    float w_wave = e_all[i] - e_all[i + QK_K / 2];
+                    float w_avg  = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f;
+                    vesica_err  += v * v * w_avg;
+                    wave_err    += w_wave * w_wave * w_avg;
+                }
+                candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err);
             }
         }
     }
 
     /* ══════════════════════════════════════════════════════════════════
      * PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement
-     *
-     * Build a multi-quhit graph where each block has 2 quhits
-     * encoding the 36 candidate errors. Shor's sequential measurement
-     * (IDFT6 + feed-forward + collapse/back-action) extracts exact
-     * marginals for optimal (d, dmin) per block — replaces BP.
      * ══════════════════════════════════════════════════════════════════ */
 
-    /* Default: use greedy candidate (index 5*10+5 = 55, mult 1.00×1.00) */
     int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
     for (int64_t i = 0; i < n_blocks; i++)
-        best_candidate[i] = 10 * N_CAND_M + 10;  /* NEIGHBOR_MULTS_D[10]=1.00, _M[10]=1.00 */
+        best_candidate[i] = 11 * N_CAND_M + 11;  /* index 11 = 1.0 multiplier */
 
     if (opt_mode != OPT_MSE && n_blocks >= 2) {
         int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks;
@@ -2391,14 +2364,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             for (int64_t i = 0; i < n_sites; i++)
                 triality_dft(&graph->locals[i]);
 
-            /* Encode each stride group's AGGREGATED candidate errors as dual-quhit
-             * amplitudes. For stride > 1, average errors across ALL blocks in
-             * the group — not just the first block. This is critical for large
-             * tensors where stride=97 means 96/97 blocks were being ignored. */
-
-            /* Compute adaptive temperature from median error spread.
-             * This ensures the Boltzmann encoding produces meaningful distributions
-             * regardless of weight magnitude (σ=0.0003 vs σ=0.024). */
             {
                 double err_accum = 0.0;
                 int err_count = 0;
@@ -2413,18 +2378,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
                 if (err_count > 0) {
                     float median_err = (float)(err_accum / err_count);
-                    /* Temperature = 10% of median max error — sharp enough to
-                     * discriminate, soft enough for Shor interference */
                     temperature = median_err * 0.1f;
                     if (temperature < 1e-10f) temperature = 1e-10f;
                 }
             }
 
             for (int64_t i = 0; i < graph_blocks; i++) {
-                /* Aggregate errors across entire stride group */
                 float agg_errors[TOTAL_SCALE_CANDIDATES];
-                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
-                    agg_errors[c] = 0.0f;
+                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f;
 
                 int64_t blk_start = i * stride;
                 int64_t blk_end = blk_start + stride;
@@ -2435,7 +2396,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
                         agg_errors[c] += candidate_errors[b][c];
                 }
-                /* Average across group */
                 if (group_size > 1) {
                     float inv_gs = 1.0f / (float)group_size;
                     for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
@@ -2447,7 +2407,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     if (agg_errors[c] < min_err)
                         min_err = agg_errors[c];
 
-                /* Quhit 0 (coarse = d dimension): marginalize over dmin */
                 double coarse_re[6];
                 double coarse_norm = 0.0;
                 for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0;
@@ -2459,14 +2418,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                                               (2.0 * (double)temperature));
                     }
                 }
-                for (int qi = 0; qi < 6; qi++)
-                    coarse_norm += coarse_re[qi] * coarse_re[qi];
+                for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi];
                 if (coarse_norm > 1e-30) {
                     double inv = 1.0 / sqrt(coarse_norm);
                     for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
                 }
 
-                /* Quhit 1 (fine = dmin dimension): marginalize over d */
                 double fine_re[6];
                 double fine_norm = 0.0;
                 for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0;
@@ -2478,14 +2435,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                                             (2.0 * (double)temperature));
                     }
                 }
-                for (int qi = 0; qi < 6; qi++)
-                    fine_norm += fine_re[qi] * fine_re[qi];
+                for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi];
                 if (fine_norm > 1e-30) {
                     double inv = 1.0 / sqrt(fine_norm);
                     for (int v = 0; v < 6; v++) fine_re[v] *= inv;
                 }
 
-                /* Write quhits */
                 int64_t s0 = 2 * i, s1 = 2 * i + 1;
                 for (int v = 0; v < 6; v++) {
                     graph->locals[s0].edge_re[v] = coarse_re[v];
@@ -2503,31 +2458,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 triality_update_mask(&graph->locals[s1]);
             }
 
-            /* Build edges */
             for (int64_t i = 0; i < graph_blocks; i++) {
-                hpc_cz(graph, 2 * i, 2 * i + 1);  /* intra-block: d ↔ dmin */
+                hpc_cz(graph, 2 * i, 2 * i + 1);
                 if (i + 1 < graph_blocks) {
-                    hpc_cz(graph, 2 * i, 2 * (i + 1));         /* d ↔ d neighbor */
-                    hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* dmin ↔ dmin    */
+                    hpc_cz(graph, 2 * i, 2 * (i + 1));
+                    hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1);
                 }
             }
 
-            /* ── Shor's Griffiths-Niu Sequential Measurement (dual quhit) ──
-             * Replaces BP with exact marginals via IDFT6 + feed-forward +
-             * collapse/back-action (ported 1:1 from tesseract_factor.c).
-             *
-             * The dual-quhit graph has 2×graph_blocks sites:
-             *   Even sites (s0 = 2*i):   coarse (d dimension)
-             *   Odd sites  (s1 = 2*i+1): fine (dmin dimension)
-             *
-             * Single-pass sequential measurement produces exact marginals
-             * for both dimensions simultaneously through the CZ correlations. */
             double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6]));
             int *shor_measured = (int *)calloc(n_sites, sizeof(int));
 
             shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1);
 
-            /* Extract coarse (d) and fine (dmin) marginals from Shor output */
             double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
             double (*fine_marg)[6]   = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
 
@@ -2541,277 +2484,233 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             free(shor_marg);
             free(shor_measured);
 
-            /* ══ Hensel-Inspired Beam Search Constraint Propagation ══
-             * Like tesseract_factor's Hensel lift: process blocks sequentially,
-             * maintain K best configurations, prune by accumulated error.
+            /* ══════════════════════════════════════════════════════════════
+             * PHASE 3 — DETERMINISTIC VITERBI DP
              *
-             * The constraint: blocks are selected JOINTLY. */
-
-            #define N_BEAMS 24  /* K beams — widened for 31B (was 12) */
-
-            typedef struct {
-                double acc_error;
-                int history_idx;  /* index into the backpointer array */
-            } QuantBeam;
-
-            typedef struct {
-                int cand_idx;
-                int parent_idx;
-            } BeamHistory;
-
-            QuantBeam beams[N_BEAMS];
-            int active_beams = 1;
-
-            /* Pre-allocate history to avoid O(N^2) memory copies */
-            BeamHistory *history = (BeamHistory *)malloc(n_blocks * N_BEAMS * sizeof(BeamHistory));
-
-            for (int b = 0; b < N_BEAMS; b++) {
-                beams[b].acc_error = 0.0;
-                beams[b].history_idx = -1;
-            }
+             * Replaces the probabilistic beam-search + Born-rule Monte-Carlo
+             * shots with an exact, fully-deterministic DP over the 36-state
+             * Shor quhit space (6 coarse bins × 6 fine bins).
+             *
+             * For each graph block i and combined state s = qi_d*6 + qi_m:
+             *
+             *   bin_best_err[i][s]  = min candidate error in that (d,m)-bin
+             *                         aggregated over the stride group
+             *   bin_log_prior[i][s] = log P_coarse(qi_d) + log P_fine(qi_m)
+             *                         from Shor marginals → HPC prior bonus
+             *
+             * Local Viterbi cost (lower = better):
+             *   vcost[i][s] = bin_best_err[i][s]
+             *               − VITERBI_BETA × scale_err × bin_log_prior[i][s]
+             *
+             * Transition cost (cross-block smoothness prior):
+             *   trans(s′→s) = VITERBI_ALPHA × scale_err
+             *                 × (|qi_d − qi_d′| + |qi_m − qi_m′|)
+             *
+             * DP recurrence:
+             *   dp[0][s] = vcost[0][s]
+             *   dp[i][s] = vcost[i][s] + min_{s′}(dp[i-1][s′] + trans(s′→s))
+             *
+             * Traceback yields the globally optimal sequence of bin choices,
+             * which is then mapped to per-block best_candidate[] indices.
+             * A 5%-threshold greedy override rescues blocks where the local
+             * MSE-optimal candidate is meaningfully better than the bin winner.
+             * ══════════════════════════════════════════════════════════════ */
 
-            /* Process blocks sequentially with beam search */
-            for (int64_t i = 0; i < graph_blocks; i++) {
-                double c_total = 0.0, f_total = 0.0;
-                for (int v = 0; v < 6; v++) {
-                    c_total += coarse_marg[i][v];
-                    f_total += fine_marg[i][v];
-                }
+            #define VIT_N_STATES  36      /* 6 coarse × 6 fine quhit bins        */
+            #define VITERBI_BETA  0.25f   /* log-prior bonus weight               */
+            #define VITERBI_ALPHA 0.08f   /* cross-block smoothness penalty weight */
 
-                /* Candidate scores for this block: triality prob × (1/normalized_error) */
-                double cand_score[TOTAL_SCALE_CANDIDATES];
-                int64_t blk = i * stride;
-                int d_bin_count[6] = {0}, m_bin_count[6] = {0};
-                for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++;
-                for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++;
-                /* Per-block error normalization: divide by block mean error
-                 * so small-weight blocks don't dominate beam selection */
-                float blk_mean_err = 0.0f;
-                for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
-                    blk_mean_err += candidate_errors[blk][c];
-                blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES;
-                if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
-                for (int di = 0; di < N_CAND_D; di++) {
-                    int qi_d = CAND_TO_QUHIT[di];
-                    double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0;
-                    p_d /= (double)d_bin_count[qi_d];
-                    for (int mi = 0; mi < N_CAND_M; mi++) {
-                        int qi_m = CAND_TO_QUHIT[mi];
-                        double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0;
-                        p_m /= (double)m_bin_count[qi_m];
-                        int cidx = di * N_CAND_M + mi;
-                        cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15);
+            {
+                int64_t vit_gi, vit_b;
+                int     vit_s, vit_sp;
+
+                /* Per-graph-block per-state workspace */
+                float (*vit_bin_err )[VIT_N_STATES] =
+                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+                int   (*vit_bin_cand)[VIT_N_STATES] =
+                    (int   (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int  [VIT_N_STATES]));
+                float (*vit_log_pri )[VIT_N_STATES] =
+                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+                float (*vit_dp      )[VIT_N_STATES] =
+                    (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+                int   (*vit_back    )[VIT_N_STATES] =
+                    (int   (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int  [VIT_N_STATES]));
+
+                /* ── Step A: build per-block per-bin statistics ── */
+                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+                    double c_tot = 0.0, f_tot = 0.0;
+
+                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+                        vit_bin_err [vit_gi][vit_s] = 1e30f;
+                        vit_bin_cand[vit_gi][vit_s] = -1;
                     }
-                }
 
-                /* Extend beams × 36 candidates, keep top K */
-                typedef struct { double score; int beam_idx; int cand_idx; } BeamExt;
-                BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES];
-                int n_ext = 0;
+                    /* Best candidate per (qi_d, qi_m) bin over stride group */
+                    for (vit_b = vit_gi * stride;
+                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
+                         vit_b++) {
+                        int vit_c;
+                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+                            int qi_d = CAND_TO_QUHIT[vit_c / N_CAND_M];
+                            int qi_m = CAND_TO_QUHIT[vit_c % N_CAND_M];
+                            vit_s = qi_d * 6 + qi_m;
+                            float e = candidate_errors[vit_b][vit_c];
+                            if (e < vit_bin_err[vit_gi][vit_s]) {
+                                vit_bin_err[vit_gi][vit_s] = e;
+                                /* Canonical candidate = stride-rep block's best */
+                                if (vit_b == vit_gi * stride)
+                                    vit_bin_cand[vit_gi][vit_s] = vit_c;
+                            }
+                        }
+                    }
 
-                for (int b = 0; b < active_beams; b++) {
-                    for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
-                        /* Score = -(accumulated_error + this_block_error) × triality_prob */
-                        double ext_err = beams[b].acc_error + candidate_errors[blk][c];
-                        double ext_score = cand_score[c] / (ext_err + 1e-15);
-                        extensions[n_ext].score = ext_score;
-                        extensions[n_ext].beam_idx = b;
-                        extensions[n_ext].cand_idx = c;
-                        n_ext++;
+                    /* HPC log-prior from Shor marginals */
+                    for (int v = 0; v < 6; v++) {
+                        c_tot += coarse_marg[vit_gi][v];
+                        f_tot += fine_marg  [vit_gi][v];
+                    }
+                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+                        int qi_d = vit_s / 6, qi_m = vit_s % 6;
+                        double pc = (c_tot > 1e-30)
+                                    ? coarse_marg[vit_gi][qi_d] / c_tot : 1.0/6.0;
+                        double pf = (f_tot > 1e-30)
+                                    ? fine_marg  [vit_gi][qi_m] / f_tot : 1.0/6.0;
+                        vit_log_pri[vit_gi][vit_s] =
+                            (float)(log(pc + 1e-30) + log(pf + 1e-30));
                     }
                 }
 
-                /* Top-K selection */
-                int top_k = (n_ext < N_BEAMS) ? n_ext : N_BEAMS;
-                int top_indices[N_BEAMS];
-                for (int k = 0; k < top_k; k++) {
-                    int best = -1;
-                    double best_s = -1e30;
-                    for (int e = 0; e < n_ext; e++) {
-                        if (extensions[e].score > best_s) {
-                            best_s = extensions[e].score;
-                            best = e;
+                /* ── Step B: scale_err normaliser for transition cost ── */
+                float vit_scale_err = 0.0f;
+                int   vit_scale_cnt = 0;
+                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+                        if (vit_bin_err[vit_gi][vit_s] < 1e29f) {
+                            vit_scale_err += vit_bin_err[vit_gi][vit_s];
+                            vit_scale_cnt++;
                         }
                     }
-                    top_indices[k] = best;
-                    extensions[best].score = -2e30;  /* poison */
                 }
-
-                /* Build new beams from top-K extensions using backpointers */
-                QuantBeam new_beams[N_BEAMS];
-                for (int k = 0; k < top_k; k++) {
-                    int ext_idx = top_indices[k];
-                    int src_beam = extensions[ext_idx].beam_idx;
-                    int cand = extensions[ext_idx].cand_idx;
-
-                    int hist_idx = i * N_BEAMS + k;
-                    history[hist_idx].cand_idx = cand;
-                    history[hist_idx].parent_idx = beams[src_beam].history_idx;
-
-                    new_beams[k].history_idx = hist_idx;
-                    new_beams[k].acc_error = beams[src_beam].acc_error
-                                            + candidate_errors[blk][cand];
+                vit_scale_err = (vit_scale_cnt > 0)
+                                ? vit_scale_err / (float)vit_scale_cnt : 1e-10f;
+                if (vit_scale_err < 1e-20f) vit_scale_err = 1e-20f;
+
+                /* ── Step C: Forward Viterbi pass ── */
+
+                /* Block 0 — no predecessor */
+                for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+                    float local = (vit_bin_err[0][vit_s] < 1e29f)
+                        ? vit_bin_err[0][vit_s]
+                          - VITERBI_BETA * vit_scale_err * vit_log_pri[0][vit_s]
+                        : 1e30f;
+                    vit_dp  [0][vit_s] = local;
+                    vit_back[0][vit_s] = -1;
                 }
 
-                for (int k = 0; k < top_k; k++)
-                    beams[k] = new_beams[k];
-                active_beams = top_k;
-            }
-
-            /* Trace back the best beam's selections.
-             * The beam search selects one candidate per GRAPH NODE (stride group).
-             * For stride > 1, each block within the stride group independently
-             * picks its own best candidate — using the beam's coarse/fine quhit
-             * bins as a constraint, but evaluating its own candidate_errors.
-             * This eliminates stride-aliasing: previously 96/97 blocks were
-             * forced to use a candidate chosen for 1 representative block. */
-            int curr_hist = beams[0].history_idx;
-            for (int64_t i = graph_blocks - 1; i >= 0; i--) {
-                int group_cidx;
-                if (curr_hist >= 0) {
-                    group_cidx = history[curr_hist].cand_idx;
-                    curr_hist = history[curr_hist].parent_idx;
-                } else {
-                    group_cidx = 10 * N_CAND_M + 10;
-                }
-
-                if (stride <= 1) {
-                    /* No stride group — direct assignment */
-                    best_candidate[i] = group_cidx;
-                } else {
-                    /* Per-block local optimization within the stride group.
-                     * The beam-selected candidate determines the target quhit
-                     * bins (d_bin, dmin_bin). Each block picks its own best
-                     * candidate that falls in compatible bins, or falls back
-                     * to the globally best candidate for that block. */
-                    int group_di = group_cidx / N_CAND_M;
-                    int group_mi = group_cidx % N_CAND_M;
-                    int target_d_bin = CAND_TO_QUHIT[group_di];
-                    int target_m_bin = CAND_TO_QUHIT[group_mi];
-
-                    for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
-                        /* Find best candidate in same quhit bins */
-                        float best_err = 1e30f;
-                        int best_c = group_cidx;
-
-                        for (int di = 0; di < N_CAND_D; di++) {
-                            if (CAND_TO_QUHIT[di] != target_d_bin) continue;
-                            for (int mi = 0; mi < N_CAND_M; mi++) {
-                                if (CAND_TO_QUHIT[mi] != target_m_bin) continue;
-                                int cidx = di * N_CAND_M + mi;
-                                if (candidate_errors[b][cidx] < best_err) {
-                                    best_err = candidate_errors[b][cidx];
-                                    best_c = cidx;
-                                }
-                            }
+                /* Blocks 1..graph_blocks-1 */
+                for (vit_gi = 1; vit_gi < graph_blocks; vit_gi++) {
+                    for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+                        float local;
+                        float best_pred = 1e30f;
+                        int   best_sp   = 0;
+                        int qi_d = vit_s / 6;
+                        int qi_m = vit_s % 6;
+
+                        if (vit_bin_err[vit_gi][vit_s] > 1e29f) {
+                            vit_dp  [vit_gi][vit_s] = 1e30f;
+                            vit_back[vit_gi][vit_s] = 0;
+                            continue;
                         }
-
-                        /* Also check if the block's overall best is significantly
-                         * better — if so, use it (greedy override) */
-                        float global_best = 1e30f;
-                        int global_best_c = group_cidx;
-                        for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
-                            if (candidate_errors[b][c] < global_best) {
-                                global_best = candidate_errors[b][c];
-                                global_best_c = c;
+                        local = vit_bin_err[vit_gi][vit_s]
+                              - VITERBI_BETA * vit_scale_err * vit_log_pri[vit_gi][vit_s];
+
+                        /* Min-cost predecessor with Manhattan transition penalty */
+                        for (vit_sp = 0; vit_sp < VIT_N_STATES; vit_sp++) {
+                            float prev = vit_dp[vit_gi - 1][vit_sp];
+                            if (prev > 1e29f) continue;
+                            int td = abs(qi_d - (vit_sp / 6));
+                            int tm = abs(qi_m - (vit_sp % 6));
+                            float trans = VITERBI_ALPHA * vit_scale_err * (float)(td + tm);
+                            float total = prev + trans;
+                            if (total < best_pred) {
+                                best_pred = total;
+                                best_sp   = vit_sp;
                             }
                         }
-
-                        /* Use bin-constrained choice unless the global best
-                         * is >5% better — preserves Shor coherence while
-                         * allowing escape from bad bin assignments */
-                        if (global_best < best_err * 0.95f)
-                            best_candidate[b] = global_best_c;
-                        else
-                            best_candidate[b] = best_c;
+                        vit_dp  [vit_gi][vit_s] = (best_pred < 1e29f)
+                                                   ? best_pred + local : 1e30f;
+                        vit_back[vit_gi][vit_s] = best_sp;
                     }
                 }
-            }
-
-            free(history);
 
-            /* ══════════════════════════════════════════════════════════════
-             * Phase 3.5: Born-Rule Multi-Shot Scale Refinement (Q2_K)
-             *
-             * 2D Born sampling: sample coarse quhit (d dimension) and
-             * fine quhit (dmin dimension) jointly from triality marginals.
-             * Each shot produces a (d_idx, dmin_idx) pair per block.
-             * ══════════════════════════════════════════════════════════════ */
-            {
-                #define Q2K_BORN_SHOTS 64
-
-                float beam_total_err = 0.0f;
-                for (int64_t bi = 0; bi < n_blocks; bi++)
-                    beam_total_err += candidate_errors[bi][best_candidate[bi]];
-
-                unsigned int born_rng_q2 = 271828;
-                /* Compute tail error once (blocks beyond graph coverage) */
-                float tail_err = 0.0f;
-                for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
-                    tail_err += candidate_errors[bi][best_candidate[bi]];
-
-                /* Sparse shot buffer: only track stride-sampled blocks */
-                int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int));
-
-                for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
-                    float shot_err = tail_err;
-
-                    for (int64_t gi = 0; gi < graph_blocks; gi++) {
-                        /* Born sample coarse (d) quhit */
-                        double c_total = 0.0;
-                        for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v];
-                        born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
-                        double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0;
-                        double target_c = rnd_c * c_total;
-                        double cum_c = 0.0;
-                        int qi_d = 5;
-                        for (int v = 0; v < 6; v++) {
-                            cum_c += coarse_marg[gi][v];
-                            if (cum_c > target_c) { qi_d = v; break; }
-                        }
-
-                        /* Born sample fine (dmin) quhit */
-                        double f_total = 0.0;
-                        for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v];
-                        born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
-                        double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0;
-                        double target_f = rnd_f * f_total;
-                        double cum_f = 0.0;
-                        int qi_m = 5;
-                        for (int v = 0; v < 6; v++) {
-                            cum_f += fine_marg[gi][v];
-                            if (cum_f > target_f) { qi_m = v; break; }
+                /* ── Step D: Traceback ── */
+                int *vit_path = (int *)malloc(graph_blocks * sizeof(int));
+                {
+                    int   best_s = 0;
+                    float best_f = vit_dp[graph_blocks - 1][0];
+                    for (vit_s = 1; vit_s < VIT_N_STATES; vit_s++) {
+                        if (vit_dp[graph_blocks - 1][vit_s] < best_f) {
+                            best_f = vit_dp[graph_blocks - 1][vit_s];
+                            best_s = vit_s;
                         }
+                    }
+                    vit_path[graph_blocks - 1] = best_s;
+                    for (vit_gi = graph_blocks - 2; vit_gi >= 0; vit_gi--)
+                        vit_path[vit_gi] = vit_back[vit_gi + 1][vit_path[vit_gi + 1]];
+                }
 
-                        /* Find best candidate within the sampled (d_bin, m_bin) */
-                        int64_t blk = gi * stride;
-                        float best_bin_err = 1e30f;
-                        int best_bin_cand = 10 * N_CAND_M + 10;
-                        for (int di = 0; di < N_CAND_D; di++) {
-                            if (CAND_TO_QUHIT[di] != qi_d) continue;
-                            for (int mi = 0; mi < N_CAND_M; mi++) {
-                                if (CAND_TO_QUHIT[mi] != qi_m) continue;
-                                int cidx = di * N_CAND_M + mi;
-                                if (candidate_errors[blk][cidx] < best_bin_err) {
-                                    best_bin_err = candidate_errors[blk][cidx];
-                                    best_bin_cand = cidx;
-                                }
+                /* ── Step E: Map Viterbi path → best_candidate[] ── */
+                for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+                    vit_s = vit_path[vit_gi];
+                    int qi_d = vit_s / 6;
+                    int qi_m = vit_s % 6;
+                    int64_t blk_rep = vit_gi * stride;
+
+                    /* Stride-representative block: use precomputed bin winner */
+                    if (vit_bin_cand[vit_gi][vit_s] >= 0)
+                        best_candidate[blk_rep] = vit_bin_cand[vit_gi][vit_s];
+
+                    /* Non-representative blocks in the stride group */
+                    for (vit_b = blk_rep + 1;
+                         vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
+                         vit_b++) {
+                        int vit_c;
+                        float best_e = 1e30f;
+                        int   best_c = best_candidate[blk_rep];
+                        for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+                            if (CAND_TO_QUHIT[vit_c / N_CAND_M] != qi_d) continue;
+                            if (CAND_TO_QUHIT[vit_c % N_CAND_M] != qi_m) continue;
+                            if (candidate_errors[vit_b][vit_c] < best_e) {
+                                best_e = candidate_errors[vit_b][vit_c];
+                                best_c = vit_c;
                             }
                         }
-
-                        shot_sparse[gi] = best_bin_cand;
-                        shot_err += candidate_errors[blk][best_bin_cand];
+                        best_candidate[vit_b] = best_c;
                     }
+                }
 
-                    if (shot_err < beam_total_err) {
-                        /* Only now apply the sparse updates to best_candidate */
-                        for (int64_t gi = 0; gi < graph_blocks; gi++)
-                            best_candidate[gi * stride] = shot_sparse[gi];
-                        beam_total_err = shot_err;
+                /* ── Step F: 5 % greedy override (pure MSE safety net) ── */
+                for (vit_b = 0; vit_b < n_blocks; vit_b++) {
+                    int vit_c;
+                    float cur_err = candidate_errors[vit_b][best_candidate[vit_b]];
+                    float g_best  = cur_err;
+                    int   g_cand  = best_candidate[vit_b];
+                    for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+                        if (candidate_errors[vit_b][vit_c] < g_best) {
+                            g_best = candidate_errors[vit_b][vit_c];
+                            g_cand = vit_c;
+                        }
                     }
+                    if (g_best < cur_err * 0.95f)
+                        best_candidate[vit_b] = g_cand;
                 }
 
-                free(shot_sparse);
+                free(vit_path);
+                free(vit_dp);
+                free(vit_back);
+                free(vit_bin_err);
+                free(vit_bin_cand);
+                free(vit_log_pri);
             }
 
             free(coarse_marg);
@@ -2819,7 +2718,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             hpc_destroy(graph);
         }
     } else {
-        /* OPT_MSE or single block: pick candidate with lowest raw error */
         for (int64_t blk = 0; blk < n_blocks; blk++) {
             float best_err = candidate_errors[blk][0];
             int best_idx = 0;
@@ -2834,27 +2732,80 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     }
 
     /* ══════════════════════════════════════════════════════════════════
-     * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
+     * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
+     *
+     * Transforms the tensor from a collection of isolated 256-element
+     * Q2_K superblocks into a single, continuous error-cancelling waveform.
+     *
+     * After Phase 3 has selected the optimal (d, dmin) candidate for every
+     * block, this sequential pass computes the net DC residual left by each
+     * block using a cheap round-nearest forward quantization, then feeds the
+     * negated, exponentially-decayed residual as a correction bias into the
+     * WLS solver of the immediately following block.
+     *
+     * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
+     *
+     *   dc_bias[N+1] = −DC_DECAY × R_N / QK_K      (per-element offset)
      *
-     * Like Q4_0's CF analog: the beam search / Born shots selected a
-     * grid candidate (d_grid, dmin_grid). Now we EXTRACT the exact
-     * optimal FP16 (d, dmin) via weighted least-squares, holding the
-     * sub-block Ls/Lm and quantized levels fixed.
+     * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
+     * quantizer toward codes whose reconstruction deq ≈ x′, so that
      *
-     * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
+     *   Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
      *
-     * Full analog assembly: at each iteration, EXHAUSTIVELY search
-     * all 16×16 = 256 possible (Ls[j], Lm[j]) pairs per sub-block
-     * to find the assignment that minimizes weighted reconstruction
-     * error. Then WLS-solve for the global (d, dmin). Repeat 5×.
+     * The accumulated cross-block DC collapses geometrically:
      *
-     * This guarantees every parameter is at its conditional optimum —
-     * the perfect bit analog at 2-bit resolution.
+     *   R₀, −DC_DECAY·R₀, +DC_DECAY²·R₀, …  → 0
+     *
+     * The result is written into block_dc_bias[n_blocks].  Phase 4 reads
+     * this array (safe: written sequentially before the parallel loop).
+     * ══════════════════════════════════════════════════════════════════ */
+
+    #define DC_DECAY 0.85f   /* Boundary-condition leak factor (0 = isolated, 1 = full) */
+
+    float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
+
+    if (block_dc_bias) {                     /* calloc failure ⇒ Phase 4 uses 0 bias */
+        float rolling_dc = 0.0f;             /* R_{N-1}: previous block's Σ(x − deq) */
+
+        for (int64_t blk = 0; blk < n_blocks; blk++) {
+            const float *bx  = weights + blk * QK_K;
+            int          cidx = best_candidate[blk];
+            float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
+            float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+
+            /* Bias applied to THIS block's WLS targets.  It must be the NEGATED
+             * decayed residual: targets are x′ = x − dc_bias and deq ≈ x′, so
+             * Σ(x − deq) ≈ dc_bias × QK_K = −DC_DECAY × R_{N-1}, which cancels
+             * the previous block's DC error instead of reinforcing it. */
+            float dc_bias       = -(DC_DECAY * rolling_dc) / (float)QK_K;
+            block_dc_bias[blk]  = dc_bias;
+
+            /* Quick round-nearest quant to estimate DC residual for NEXT block:
+             * quantize x′, then measure the ORIGINAL weight against that code. */
+            float dc_res = 0.0f;
+            for (int j = 0; j < N_SUB; j++) {
+                float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
+                float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
+                for (int k = 0; k < 16; k++) {
+                    float x_adj = bx[16*j + k] - dc_bias;
+                    int q = 0;   /* degenerate sub-block scale: keep code 0 */
+                    if (d_sub >= 1e-15f) {
+                        q = gguf_nearest_int((x_adj + m_sub) / d_sub);
+                        if (q < 0) q = 0;   /* clamp to the 2-bit code range */
+                        if (q > 3) q = 3;
+                    }
+                    float deq = d_sub * (float)q - m_sub;
+                    dc_res += bx[16*j + k] - deq;
+                }
+            }
+            rolling_dc = dc_res;
+        }
+    }
+
+    /* ══════════════════════════════════════════════════════════════════
+     * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
      * ══════════════════════════════════════════════════════════════════ */
 
-    /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
-     * This eliminates ~776K malloc/free cycles from the inner loop.
-     * Each thread reuses its graph via hpc_reset_for_subblock(). */
     int _n_omp_threads = 1;
     #ifdef _OPENMP
     _n_omp_threads = omp_get_max_threads();
@@ -2869,32 +2820,36 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         int cidx = best_candidate[blk];
         uint8_t Ls_blk[16], Lm_blk[16];
 
-        /* Start from HPC-selected candidate */
+        /* ── Rolling DC boundary condition ──────────────────────────────
+         * dc_adj shifts every WLS target in this block so that the net
+         * quantisation error steers toward cancelling the previous block's
+         * DC residual (written by the sequential Phase 3.9 pre-pass). */
+        float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
+
+        /* Adjusted weight view — WLS and Shor work on this array;
+         * the final error is always reported against the original block_x. */
+        float adj_block_x[QK_K];
+        {
+            int _i;
+            for (_i = 0; _i < QK_K; _i++)
+                adj_block_x[_i] = block_x[_i] - dc_adj;
+        }
+
         memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
         memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);
 
         float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
         float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
 
-        /* ── Analog assembly: iterate to convergence ──
-         * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes
-         * after 2-3 passes. Additional iterations produce negligible
-         * change in the committed FP16 values.
-         *   A) Sub-block Shor measurement to find coupled (Ls,Lm) states
-         *   B) Optimal q-value assignment
-         *   C) WLS solve for (d, dmin) */
-        for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
-
-            /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
-             * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
-             * Keep the 6 best pairs as quhit states for a 16-node graph.
-             * Run BP to jointly select the globally optimal (Ls, Lm). */
+        uint16_t prev_dm16 = 0, prev_mm16 = 0;
+        for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
+
             uint8_t state_ls[N_SUB][6];
             uint8_t state_lm[N_SUB][6];
             float state_err[N_SUB][6];
 
             for (int j = 0; j < N_SUB; j++) {
-                const float *sx = block_x + 16 * j;
+                const float *sx = adj_block_x + 16 * j;
                 for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f;
 
                 for (int try_ls = 0; try_ls <= 15; try_ls++) {
@@ -2917,7 +2872,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                             sub_err += diff * diff * w;
                         }
 
-                        /* Insert into top 6 */
                         for (int v = 0; v < 6; v++) {
                             if (sub_err < state_err[j][v]) {
                                 for (int u = 5; u > v; u--) {
@@ -2935,7 +2889,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
             }
 
-            /* Reset thread-local sub-block graph (zero allocations) */
             int _tid = 0;
             #ifdef _OPENMP
             _tid = omp_get_thread_num();
@@ -2946,19 +2899,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 float min_sub_err[N_SUB];
                 for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
 
-                /* Initialize unary potentials from local errors */
                 for (int j = 0; j < N_SUB; j++) {
                     triality_dft(&sg->locals[j]);
                     double amp_re[6];
                     double amp_norm = 0.0;
                     for (int v = 0; v < 6; v++) {
-                    /* Adaptive temperature: scale with local error spread
-                     * so Shor measurement produces meaningful interference
-                     * patterns regardless of weight magnitude */
-                    float err_spread = state_err[j][5] - state_err[j][0];
-                    float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
-                    if (sub_temp < 1e-12f) sub_temp = 1e-12f;
-                    amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
+                        float err_spread = state_err[j][5] - state_err[j][0];
+                        float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
+                        if (sub_temp < 1e-12f) sub_temp = 1e-12f;
+                        amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
                         amp_norm += amp_re[v] * amp_re[v];
                     }
                     if (amp_norm > 1e-30) {
@@ -2975,12 +2924,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     triality_update_mask(&sg->locals[j]);
                 }
 
-                /* Add coupling edges between adjacent sub-blocks */
                 for (int j = 0; j < N_SUB - 1; j++)
                     hpc_cz(sg, j, j + 1);
 
-                /* ── Shor sequential measurement on sub-block graph ──
-                 * Stack-allocated arrays: eliminates 2 calloc/free per iteration */
                 double sub_marg[N_SUB][6];
                 int sub_measured[N_SUB];
                 memset(sub_marg, 0, sizeof(sub_marg));
@@ -2988,7 +2934,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
 
                 shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
 
-                /* Extract optimal Ls/Lm from Shor marginals */
                 for (int j = 0; j < N_SUB; j++) {
                     double best_prob = -1.0;
                     int best_v = 0;
@@ -3003,7 +2948,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 }
             }
 
-            /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
             uint8_t L[QK_K];
             for (int j = 0; j < N_SUB; j++) {
                 float d_sub = dm * (float)Ls_blk[j];
@@ -3013,22 +2957,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                     continue;
                 }
                 for (int k = 0; k < 16; k++) {
-                    int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
+                    int q = gguf_nearest_int((adj_block_x[16*j+k] + m_sub) / d_sub);
                     if (q < 0) q = 0; if (q > 3) q = 3;
                     L[16*j+k] = (uint8_t)q;
                 }
             }
 
-            /* ── Step C: WLS solve for (d, dmin) ──
-             * x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
-             * Let a = Ls[j]×q[j,k], b = Lm[j]
-             * Normal equations via Cramer's rule */
             double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
             for (int j = 0; j < N_SUB; j++) {
                 float ls_f = (float)Ls_blk[j];
                 float lm_f = (float)Lm_blk[j];
                 for (int k = 0; k < 16; k++) {
-                    float x = block_x[16*j+k];
+                    float x = adj_block_x[16*j+k];
                     float w = (imat_importance) ?
                               imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                     float a = ls_f * (float)L[16*j+k];
@@ -3045,7 +2985,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             if (fabs(det) > 1e-30) {
                 double d_new  = (Sbb * Sxa - Sab * Sxb) / det;
                 double dm_new = (Sab * Sxa - Saa * Sxb) / det;
-                /* Clamp: positive and within 4× of candidate seed */
                 float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
                 float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
                 if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10))
@@ -3053,28 +2992,27 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                 if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10))
                     mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
             }
-            if (isnan(dm) || isnan(mm)) {
-                printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det);
-                exit(1);
-            }
+
+            uint16_t cur_dm16 = gguf_fp32_to_fp16(dm);
+            uint16_t cur_mm16 = gguf_fp32_to_fp16(mm);
+            if (cur_dm16 == prev_dm16 && cur_mm16 == prev_mm16) break;
+            prev_dm16 = cur_dm16;
+            prev_mm16 = cur_mm16;
         }
 
-        /* ── FP16 ULP neighborhood search for (d, dmin) ──
-         * The WLS solve found continuous-optimal (d, dmin). But FP16
-         * truncation may shift the optimum. Try ±4 ULP around both
-         * d and dmin, pick the pair with minimum reconstruction error. */
+        /* ── FP16 ULP neighborhood search: try ±8 ULP around both d and dmin ── */
         {
             uint16_t base_d16 = gguf_fp32_to_fp16(dm);
             uint16_t base_m16 = gguf_fp32_to_fp16(mm);
             uint16_t best_d16 = base_d16, best_m16 = base_m16;
             float best_ulp_err = 1e30f;
 
-            for (int dd = -2; dd <= 2; dd++) {
+            for (int dd = -8; dd <= 8; dd++) {
                 int cd16 = (int)base_d16 + dd;
                 if (cd16 < 0 || cd16 > 0x7BFF) continue;
                 float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16);
 
-                for (int dm_delta = -2; dm_delta <= 2; dm_delta++) {
+                for (int dm_delta = -8; dm_delta <= 8; dm_delta++) {
                     int cm16 = (int)base_m16 + dm_delta;
                     if (cm16 < 0 || cm16 > 0x7BFF) continue;
                     float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16);
@@ -3084,7 +3022,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
                         float d_sub = trial_dm * (float)Ls_blk[j];
                         float m_sub = trial_mm * (float)Lm_blk[j];
                         for (int k = 0; k < 16; k++) {
-                            float x = block_x[16*j+k];
+                            float x = adj_block_x[16*j+k];
                             float w = (imat_importance) ?
                                       imat_importance[blk * QK_K + 16*j+k] : 1.0f;
                             int q;
@@ -3109,21 +3047,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             mm = gguf_fp16_to_fp32(best_m16);
         }
 
-        /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
-         * The WLS solve may have shifted (d, dmin) after the last Step A.
-         * Neighborhood search ±2 around current values (25 pairs vs 256)
-         * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
         for (int j = 0; j < N_SUB; j++) {
-            const float *sx = block_x + 16 * j;
+            const float *sx = adj_block_x + 16 * j;
             float best_sub_err = 1e30f;
             uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
-            int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
-            int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
-            int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
-            int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
-            for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
+            for (int try_ls = 0; try_ls <= 15; try_ls++) {
                 float d_sub = dm * (float)try_ls;
-                for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
+                for (int try_lm = 0; try_lm <= 15; try_lm++) {
                     float m_sub = mm * (float)try_lm;
                     float sub_err = 0.0f;
                     for (int k = 0; k < 16; k++) {
@@ -3151,150 +3081,201 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
             Lm_blk[j] = best_lm;
         }
 
-        /* Store the extracted optimal FP16 (d, dmin) */
         output[blk].d    = gguf_fp32_to_fp16(dm);
         output[blk].dmin = gguf_fp32_to_fp16(mm);
 
         for (int j = 0; j < N_SUB; j++)
             output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4);
 
-        /* ── Final quantization with D₆ Hadamard Error Shaping ──
+        /* ── Final quantization: D₆ Hadamard Greedy Descent (deterministic) ──
          *
-         * Standard Q2_K rounds each weight independently: q = round((x+m)/d).
-         * But within a sub-block, weights share (d, m), so their quantization
-         * errors are CORRELATED. Independent rounding is suboptimal.
+         * The original Simulated Annealing acceptance rule is replaced by a
+         * strict greedy descent: only accept a flip if it strictly reduces the
+         * D₆ Hadamard metric (4·‖vesica‖² + DC²).  This makes error shaping
+         * fully deterministic and thread-safe (no rand() inside omp parallel),
+         * consistent with the Viterbi philosophy applied in Phase 3.
          *
-         * The D₆ fold (antipodal Hadamard from the triality quhit) decomposes
-         * the error vector into vesica (sum) and wave (difference) components:
-         *   vesica[k] = (e[k] + e[k+3]) / √2    — DC-like, accumulates in dot products
-         *   wave[k]   = (e[k] - e[k+3]) / √2    — noise-like, cancels in dot products
-         *
-         * We WANT large wave error and small vesica error. So we greedily
-         * flip rounding decisions (floor↔ceil) to minimize vesica energy,
-         * even if total element-wise error increases slightly.
-         *
-         * Process: 16 elements per sub-block, treat as 2 groups of 6 + 4 tail.
-         * Apply DFT₆-fold to each group of 6, minimize vesica component.
+         * The metric measures both:
+         *  - Vesica Piscis term: correlated error between weights i and i+QK_K/2
+         *    (steers the error toward half-wave symmetry: damps even harmonics)
+         *  - DC term: total signed error across the 256-weight superblock
+         *    (captured and propagated to the next block by Phase 3.9)
          */
         uint8_t L[QK_K];
-        for (int j = 0; j < N_SUB; j++) {
-            float d = dm * (float)(output[blk].scales[j] & 0xF);
-            if (d < 1e-15f) {
-                for (int k = 0; k < 16; k++) L[16 * j + k] = 0;
-                continue;
+        {
+            float q_cont_all[QK_K];
+            int   q_base_all[QK_K];
+            int   q_shaped_all[QK_K];
+
+            for (int i = 0; i < QK_K; i++) {
+                int   jj  = i >> 4;
+                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+                if (d_s < 1e-15f) {
+                    q_cont_all[i] = 0.0f;
+                    q_base_all[i] = 0;
+                } else {
+                    /* Quantize the DC-adjusted target */
+                    float qc = (adj_block_x[i] + m_s) / d_s;
+                    q_cont_all[i] = qc;
+                    int qr = gguf_nearest_int(qc);
+                    if (qr < 0) qr = 0; if (qr > 3) qr = 3;
+                    q_base_all[i] = qr;
+                }
             }
-            float m = mm * (float)(output[blk].scales[j] >> 4);
-            float id = 1.0f / d;
-
-            /* Step 1: Standard nearest-rounding as baseline */
-            int q_base[16];
-            float q_cont[16];  /* continuous q values before rounding */
-            for (int k = 0; k < 16; k++) {
-                q_cont[k] = (block_x[16*j+k] + m) * id;
-                q_base[k] = gguf_nearest_int(q_cont[k]);
-                if (q_base[k] < 0) q_base[k] = 0;
-                if (q_base[k] > 3) q_base[k] = 3;
+            memcpy(q_shaped_all, q_base_all, QK_K * sizeof(int));
+
+            float e_live[QK_K];
+            for (int i = 0; i < QK_K; i++) {
+                int   jj  = i >> 4;
+                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+                float deq = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
+                /* Residual against the adjusted target (DC-corrected view) */
+                e_live[i] = adj_block_x[i] - deq;
             }
 
-            /* Step 2: D₆ Hadamard Error Shaping
-             * For each 6-element group, greedily flip the rounding decision
-             * that most reduces the D₆-folded vesica error component.
-             *
-             * D₆ fold on 6-element groups: antipodal pairs (0,3), (1,4), (2,5)
-             * vesica[k] = e[k] + e[k+3]  (k=0,1,2) — DC-like, propagates
-             * wave[k]   = e[k] - e[k+3]  (k=0,1,2) — noise-like, cancels
-             *
-             * Weight vesica 4× over wave + penalize DC (sum of all 6 errors) */
-            int q_shaped[16];
-            memcpy(q_shaped, q_base, 16 * sizeof(int));
-
-            /* Process groups: [0..5], [6..11], tail [12..15] handled by D₆ metric on available pairs */
-            for (int g = 0; g < 2; g++) {
-                int g_off = g * 6;
-                if (g_off + 5 >= 16) break;
-
-                /* Multiple greedy passes — each pass finds the single best flip */
-                for (int pass = 0; pass < 6; pass++) {
-                    int best_k = -1;
-                    int best_q_alt = 0;
-                    float best_delta = 0.0f;  /* improvement = current_metric - alt_metric */
-
-                    /* Compute current group errors */
-                    float e_cur[6];
-                    for (int kk = 0; kk < 6; kk++) {
-                        int ii = g_off + kk;
-                        float deq = d * (float)q_shaped[ii] - m;
-                        e_cur[kk] = block_x[16*j+ii] - deq;
-                    }
+            float v_live[QK_K / 2];
+            float vesica_cur = 0.0f, dc_cur = 0.0f;
+            for (int i = 0; i < QK_K / 2; i++) {
+                v_live[i] = e_live[i] + e_live[i + QK_K / 2];
+                vesica_cur += v_live[i] * v_live[i];
+            }
+            for (int i = 0; i < QK_K; i++) dc_cur += e_live[i];
+            float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
 
-                    /* Current D₆ metric: vesica energy + DC² */
-                    float vesica_cur = 0.0f, dc_cur = 0.0f;
-                    for (int p = 0; p < 3; p++) {
-                        float v = e_cur[p] + e_cur[p+3];
-                        vesica_cur += v * v;
-                    }
-                    for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
-                    float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
-
-                    /* Try flipping each element */
-                    for (int k = 0; k < 6; k++) {
-                        int idx = g_off + k;
-                        int q_cur = q_shaped[idx];
-
-                        /* Try the alternative rounding */
-                        int q_try;
-                        if (q_cont[idx] - (float)q_cur >= 0) {
-                            q_try = q_cur + 1;
-                        } else {
-                            q_try = q_cur - 1;
-                        }
-                        if (q_try < 0 || q_try > 3) continue;
-
-                        /* Compute alt errors (only element k changes) */
-                        float e_alt[6];
-                        for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
-                        float deq_try = d * (float)q_try - m;
-                        e_alt[k] = block_x[16*j+idx] - deq_try;
-
-                        /* Alt D₆ metric */
-                        float vesica_alt = 0.0f, dc_alt = 0.0f;
-                        for (int p = 0; p < 3; p++) {
-                            float v = e_alt[p] + e_alt[p+3];
-                            vesica_alt += v * v;
-                        }
-                        for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
-                        float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;
-
-                        float delta = metric_cur - metric_alt;
-                        if (delta > best_delta) {
-                            best_delta = delta;
-                            best_k = k;
-                            best_q_alt = q_try;
-                        }
+            /* Deterministic greedy descent: accept only strict improvements */
+            for (int pass = 0; pass < QK_K; pass++) {
+                int   best_k     = -1;
+                int   best_q_alt = 0;
+                float best_delta = 0.0f;   /* strictly positive threshold */
+
+                for (int k = 0; k < QK_K; k++) {
+                    int jj  = k >> 4;
+                    float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+                    if (d_s < 1e-15f) continue;
+
+                    int q_cur = q_shaped_all[k];
+                    int q_try = (q_cont_all[k] - (float)q_cur >= 0.0f)
+                                ? q_cur + 1 : q_cur - 1;
+                    if (q_try < 0 || q_try > 3) continue;
+
+                    float m_s   = mm * (float)(output[blk].scales[jj] >> 4);
+                    float e_new = adj_block_x[k] - (d_s * (float)q_try - m_s);
+                    float de    = e_new - e_live[k];
+
+                    int   pi    = (k < QK_K / 2) ? k : k - QK_K / 2;
+                    float v_new = v_live[pi] + de;
+
+                    float vesica_alt = vesica_cur - v_live[pi]*v_live[pi] + v_new*v_new;
+                    float dc_alt     = dc_cur + de;
+                    float delta      = metric_cur - (4.0f * vesica_alt + dc_alt * dc_alt);
+
+                    if (delta > best_delta) {
+                        best_delta = delta;
+                        best_k     = k;
+                        best_q_alt = q_try;
                     }
+                }
 
-                    if (best_k < 0) break;  /* no improvement found */
-                    q_shaped[g_off + best_k] = best_q_alt;  /* commit the flip */
+                if (best_k < 0) break;   /* converged — no further improvement */
+
+                q_shaped_all[best_k] = best_q_alt;
+                {
+                    int   jj_c  = best_k >> 4;
+                    float d_c   = dm * (float)(output[blk].scales[jj_c] & 0xF);
+                    float m_c   = mm * (float)(output[blk].scales[jj_c] >> 4);
+                    float e_new_c = adj_block_x[best_k] - (d_c * (float)best_q_alt - m_c);
+                    float de_c    = e_new_c - e_live[best_k];
+                    int   pi_c    = (best_k < QK_K / 2) ? best_k : best_k - QK_K / 2;
+                    float v_new_c = v_live[pi_c] + de_c;
+                    vesica_cur   += v_new_c * v_new_c - v_live[pi_c] * v_live[pi_c];
+                    dc_cur       += de_c;
+                    metric_cur    = 4.0f * vesica_cur + dc_cur * dc_cur;
+                    v_live[pi_c]  = v_new_c;
+                    e_live[best_k]= e_new_c;
                 }
             }
 
-            /* Step 3: Final error comparison — only keep shaped if it improves
-             * or is within 5% of baseline (vesica shaping trades element MSE
-             * for better spectral distribution of error) */
+            /* Choose base vs shaped by comparing MSE against original weights */
             float err_base = 0.0f, err_shaped = 0.0f;
-            for (int k = 0; k < 16; k++) {
-                float x = block_x[16*j+k];
-                float w = (imat_importance) ?
-                          imat_importance[blk * QK_K + 16*j + k] : 1.0f;
-                float deq_b = d * (float)q_base[k] - m;
-                float deq_s = d * (float)q_shaped[k] - m;
-                err_base += (x - deq_b) * (x - deq_b) * w;
-                err_shaped += (x - deq_s) * (x - deq_s) * w;
+            for (int i = 0; i < QK_K; i++) {
+                int   jj  = i >> 4;
+                float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+                float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+                float w   = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
+                float deq_b = (d_s > 1e-15f) ? (d_s * (float)q_base_all[i]   - m_s) : 0.0f;
+                float deq_s = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
+                float xv    = block_x[i];   /* original weight for error report */
+                err_base   += (xv - deq_b) * (xv - deq_b) * w;
+                err_shaped += (xv - deq_s) * (xv - deq_s) * w;
+            }
+            {
+                int use_shaped = (err_shaped <= err_base);
+                for (int i = 0; i < QK_K; i++)
+                    L[i] = (uint8_t)(use_shaped ? q_shaped_all[i] : q_base_all[i]);
             }
+        }
+
+        /* ── Cross-weight error diffusion — intra-sub-block Floyd-Steinberg ──
+         *
+         * Implements cross-weight error diffusion within each 16-weight sub-block.
+         * After the greedy descent has committed quantisation codes, the residual
+         * of each weight is partially propagated forward to the next position in
+         * the same sub-block (7/16 of the error), re-quantising if the diffused
+         * target falls in a different bin.
+         *
+         * This is the "cross-weight" dimension of the error diffusion:
+         * neighbouring weights share and partially absorb each other's rounding
+         * error, shaping the within-block spectrum away from the DC component
+         * that Phase 3.9 already propagates between blocks.
+         *
+         * Staying within sub-blocks avoids scale-mismatch artefacts that would
+         * arise from diffusing across the dm * Ls[j] boundary between sub-blocks.
+         *
+         * The diffused codes are accepted only when they reduce the weighted MSE
+         * against the ORIGINAL weight (not the adjusted target), so the diffusion
+         * cannot increase the total reconstruction error.
+         */
+        {
+            int fs_j, fs_k;
+            for (fs_j = 0; fs_j < N_SUB; fs_j++) {
+                int   base  = fs_j * 16;
+                float d_s   = dm * (float)(output[blk].scales[fs_j] & 0xF);
+                float m_s   = mm * (float)(output[blk].scales[fs_j] >> 4);
+                if (d_s < 1e-15f) continue;
+
+                float carry = 0.0f;   /* FS carry from position k-1 */
+
+                for (fs_k = 0; fs_k < 16; fs_k++) {
+                    int   idx    = base + fs_k;
+                    float x_orig = block_x[idx];
+                    float x_adj  = adj_block_x[idx] + carry;  /* adjusted + diffused */
+
+                    /* Propose new code from diffused target */
+                    int q_fs = gguf_nearest_int((x_adj + m_s) / d_s);
+                    if (q_fs < 0) q_fs = 0; if (q_fs > 3) q_fs = 3;
+
+                    if (q_fs != (int)L[idx]) {
+                        /* Accept only when MSE against original weight improves */
+                        float w_imp = (imat_importance)
+                                      ? imat_importance[blk * QK_K + idx] : 1.0f;
+                        float deq_old = d_s * (float)L[idx]  - m_s;
+                        float deq_new = d_s * (float)q_fs     - m_s;
+                        float e_old   = (x_orig - deq_old) * (x_orig - deq_old) * w_imp;
+                        float e_new   = (x_orig - deq_new) * (x_orig - deq_new) * w_imp;
+                        if (e_new < e_old)
+                            L[idx] = (uint8_t)q_fs;
+                    }
 
-            int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base;
-            for (int k = 0; k < 16; k++)
-                L[16 * j + k] = (uint8_t)q_final[k];
+                    /* Propagate 7/16 of the residual (adj target vs committed code) */
+                    {
+                        float deq_final = d_s * (float)L[idx] - m_s;
+                        float residual  = (adj_block_x[idx] - deq_final);
+                        carry = (fs_k < 15) ? residual * (7.0f / 16.0f) : 0.0f;
+                    }
+                }
+            }
         }
 
         for (int j = 0; j < QK_K; j += 128) {
@@ -3315,11 +3296,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         total_err += berr;
     }
 
-    /* Free thread-local sub-block graphs */
     for (int _ti = 0; _ti < _n_omp_threads; _ti++)
         hpc_destroy(_tl_graphs[_ti]);
     free(_tl_graphs);
 
+    free(block_dc_bias);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);
@@ -3332,14 +3313,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     if (verbose) {
         float rmse = sqrtf(total_err / (float)n_elements);
 
-        /* Compute weight σ for fidelity classification */
         double w_sum2 = 0.0;
         for (int64_t i = 0; i < n_elements; i++)
             w_sum2 += (double)weights[i] * (double)weights[i];
-        float w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
+        w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
         float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f;
 
-        /* Fidelity classification */
         const char *fidelity_class;
         const char *fidelity_icon;
         if (rmse <= 1.0e-04f) {
@@ -3493,8 +3472,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
 
         if (should_quantize(ti, gguf_names[i])) {
             if (is_attention_tensor(gguf_names[i])) {
-                /* Promote attention Q/K/V/O to Q4_0 for higher precision.
-                 * Attention scores are most sensitive to quantization noise. */
                 tensor_types[i] = GGML_TYPE_Q4_0;
                 int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0;
                 tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0);
@@ -3506,18 +3483,15 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
                 tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements);
             }
         } else if (ti->n_dims >= 2) {
-            /* 2D non-quantized tensors (embeddings, output) → F16 */
             tensor_types[i] = GGML_TYPE_F16;
             tensor_sizes[i] = ti->n_elements * sizeof(uint16_t);
         } else {
-            /* 1D tensors (norms, biases) → F32 */
             tensor_types[i] = GGML_TYPE_F32;
             tensor_sizes[i] = ti->n_elements * sizeof(float);
         }
 
         tensor_offsets[i] = data_offset;
 
-        /* Align each tensor to 32 bytes */
         data_offset += tensor_sizes[i];
         data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) &
                       ~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1);
@@ -3592,7 +3566,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
         int src = tensor_src_idx[i];
         const STTensorInfo *ti = st_multi_tensor_info(mf, src);
         uint64_t dims[ST_MAX_DIMS];
-        /* GGUF uses reversed dimension order from SafeTensors/PyTorch */
         int nd = ti->n_dims;
         for (int d = 0; d < nd; d++) {
             dims[d] = (uint64_t)ti->shape[nd - 1 - d];
@@ -3622,7 +3595,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
         print_progress_bar(i, total_tensors, gguf_names[i], quant_start);
 
         if (tensor_types[i] == GGML_TYPE_Q2_K) {
-            /* ── HPC-Optimized Q2_K Quantization ── */
             float *f32_data = st_multi_tensor_to_f32(mf, src);
             if (!f32_data) {
                 fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s' to F32\n",
@@ -3633,7 +3605,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             int64_t n_elements = ti->n_elements;
             float tensor_error = 0.0f;
 
-            /* Pad to QK_K boundary */
             int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
             if (padded > n_elements) {
                 f32_data = realloc(f32_data, padded * sizeof(float));
@@ -3645,7 +3616,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             int64_t n_blocks = n_elements / QK_K;
             BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K));
 
-            /* Look up imatrix importance for this tensor */
             const float *imp = NULL;
             if (imatrix) {
                 const IMatrixEntry *ime = imatrix_find_any(imatrix,
@@ -3666,13 +3636,11 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
 
             float rmse = sqrtf(tensor_error / (float)ti->n_elements);
 
-            /* Compute weight σ for fidelity gate */
             double wss = 0.0;
             for (int64_t j = 0; j < ti->n_elements; j++)
                 wss += (double)f32_data[j] * (double)f32_data[j];
             float w_sig = (float)sqrt(wss / (double)ti->n_elements);
 
-            /* Fidelity gate: classify RMSE vs 1e-04 target */
             const char *fid;
             if      (rmse <= 1.0e-04f) fid = "★★★★ ULTRA";
             else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH";
@@ -3695,7 +3663,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             free(quant_data);
             free(f32_data);
         } else if (tensor_types[i] == GGML_TYPE_Q4_0) {
-            /* ── HPC-Optimized Q4_0 Quantization (attention tensors) ── */
             float *f32_data = st_multi_tensor_to_f32(mf, src);
             if (!f32_data) {
                 fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s' to F32\n",
@@ -3705,7 +3672,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
 
             int64_t n_elements = ti->n_elements;
 
-            /* Pad to QK4_0 boundary */
             int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
             if (padded > n_elements) {
                 f32_data = realloc(f32_data, padded * sizeof(float));
@@ -3718,7 +3684,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0));
             float tensor_error = 0.0f;
 
-            /* Look up imatrix importance for this tensor */
             const float *imp = NULL;
             if (imatrix) {
                 const IMatrixEntry *ime = imatrix_find_any(imatrix,
@@ -3739,7 +3704,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
 
             float rmse = sqrtf(tensor_error / (float)ti->n_elements);
 
-            /* Compute weight σ for fidelity gate */
             double wss4 = 0.0;
             for (int64_t j = 0; j < ti->n_elements; j++)
                 wss4 += (double)f32_data[j] * (double)f32_data[j];
@@ -3767,7 +3731,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             free(q4_data);
             free(f32_data);
         } else if (tensor_types[i] == GGML_TYPE_F16) {
-            /* ── Store as F16 (embeddings, output, 2D non-quantized) ── */
             float *f32_data = st_multi_tensor_to_f32(mf, src);
             if (!f32_data) {
                 fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s'\n",
@@ -3775,7 +3738,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
                 continue;
             }
 
-            /* Convert F32 → F16 */
             uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t));
             for (int64_t j = 0; j < ti->n_elements; j++)
                 f16_data[j] = gguf_fp32_to_fp16(f32_data[j]);
@@ -3793,7 +3755,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             free(f16_data);
             free(f32_data);
         } else {
-            /* ── Keep as F32 (1D: norms, biases) ── */
             float *f32_data = st_multi_tensor_to_f32(mf, src);
             if (!f32_data) {
                 fprintf(stderr, "\n  ERROR: Failed to convert tensor '%s'\n",
@@ -3814,7 +3775,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
             free(f32_data);
         }
 
-        /* Pad to alignment */
         gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);
     }
 
@@ -3823,8 +3783,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
     long final_size = ftell(fp);
     fclose(fp);
 
-    /* ── Final summary with Shor fidelity metrics ── */
-    /* Compute original model size (all as F32) */
     int64_t original_f32_size = 0;
     for (int i = 0; i < total_tensors; i++) {
         const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]);
@@ -3840,7 +3798,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
     float mean_mse_per_tensor = (quant_count > 0) ?
                                  total_error_sum / (float)quant_count : 0.0f;
 
-    /* Fidelity classification */
     const char *overall_fid, *overall_icon;
     if      (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)";  overall_icon = "★★★★"; }
     else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)";   overall_icon = "★★★☆"; }
@@ -3931,12 +3888,12 @@ void hexstate_init(void)
 /* Quantize a single tensor's F32 data to Q2_K using HPC optimization.
  *
  * Parameters:
- *   weights:     input F32 data (must be padded to multiple of 256)
- *   n_elements:  number of elements (must be multiple of 256)
- *   output:      output buffer (must be n_elements/256 * 84 bytes)
- *   out_error:   pointer to receive total MSE (can be NULL)
- *   opt_mode:    0=HPC, 1=MSE, 2=Hybrid (recommended)
- *   verbose:     1 for per-block diagnostics
+ * weights:     input F32 data (must be padded to multiple of 256)
+ * n_elements:  number of elements (must be multiple of 256)
+ * output:      output buffer (must be n_elements/256 * 84 bytes)
+ * out_error:   pointer to receive total MSE (can be NULL)
+ * opt_mode:    0=HPC, 1=MSE, 2=Hybrid (recommended)
+ * verbose:     1 for per-block diagnostics
  */
 void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements,
                                     void *output, float *out_error,
@@ -3967,12 +3924,12 @@ int hexstate_q2k_block_elements(void) { return QK_K; }
 
 /* HPC-optimized Q4_0 quantization for attention tensors.
  * Called from Python requantizer via ctypes.
- *   weights:     input F32 weights
- *   n_elements:  number of elements (must be multiple of 32)
- *   output:      output buffer (must be n_elements/32 * 18 bytes)
- *   out_error:   pointer to receive total MSE (can be NULL)
- *   imat_importance: optional per-element importance weights
- *   verbose:     1 for per-block diagnostics
+ * weights:     input F32 weights
+ * n_elements:  number of elements (must be multiple of 32)
+ * output:      output buffer (must be n_elements/32 * 18 bytes)
+ * out_error:   pointer to receive total MSE (can be NULL)
+ * imat_importance: optional per-element importance weights
+ * verbose:     1 for per-block diagnostics
  */
 void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
                                          void *output, float *out_error,
@@ -4220,4 +4177,4 @@ int main(int argc, char **argv)
     st_multi_close(mf);
     return result;
 }
-#endif /* HEXSTATE_LIBRARY */
+#endif /* HEXSTATE_LIBRARY */
\ No newline at end of file