diff --git "a/hexstate_quantize.c" "b/hexstate_quantize.c" --- "a/hexstate_quantize.c" +++ "b/hexstate_quantize.c" @@ -1,5 +1,5 @@ /* ═══════════════════════════════════════════════════════════════════════════ - * hexstate_quantize.c — HExState GGUF Quantizer + * hexstate_quantize.c — HexState GGUF Quantizer * * ╔═══════════════════════════════════════════════════════════════╗ * ║ HPC-Optimized GGUF Quantization Engine ║ @@ -15,32 +15,32 @@ * This tool adapts the HExState HPC Ouroboros factoring engine for * LLM weight quantization. The core mathematical machinery is reused: * - * Factoring Domain → Quantization Domain - * ───────────────────────────────────────────────── - * HPCGraph + CZ edges → Block sensitivity graph - * Complex Amplitude BP → Importance propagation - * MCMC period sampler → Optimal scale search - * try_period() validation → Error bound checking - * LLL lattice reduction → (future) Adaptive bit allocation + * Factoring Domain → Quantization Domain + * ───────────────────────────────────────────────── + * HPCGraph + CZ edges → Block sensitivity graph + * Complex Amplitude BP → Importance propagation + * MCMC period sampler → Optimal scale search + * try_period() validation → Error bound checking + * LLL lattice reduction → (future) Adaptive bit allocation * * Additional techniques ported from llm-compressor: - * MSE grid search → Optimal min/max range shrinking - * Importance matrix (imatrix) → Per-channel error weighting + * MSE grid search → Optimal min/max range shrinking + * Importance matrix (imatrix) → Per-channel error weighting * * Build: - * make -f Makefile.quantize + * make -f Makefile.quantize * * Usage: - * ./hexstate_quantize [options] + * ./hexstate_quantize [options] * * Input can be: - * - A single .safetensors file - * - A model directory containing sharded .safetensors files + * - A single .safetensors file + * - A model directory containing sharded .safetensors files * * Options: - * --optimizer hpc|mse|hybrid Scale 
optimization strategy (default: hybrid) - * --imatrix Importance matrix for weighted quantization - * --verbose Per-block diagnostics + * --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid) + * --imatrix Importance matrix for weighted quantization + * --verbose Per-block diagnostics * ═══════════════════════════════════════════════════════════════════════════ */ #include @@ -555,8 +555,8 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen) * SHOULD THIS TENSOR BE QUANTIZED? * * Decision rules: - * - Quantize: weight matrices (2D, large) - * - Keep F32: norms, biases, embeddings, 1D tensors + * - Quantize: weight matrices (2D, large) + * - Keep F32: norms, biases, embeddings, 1D tensors * ═══════════════════════════════════════════════════════════════════════════ */ static int should_quantize(const STTensorInfo *ti, const char *gguf_name) @@ -620,12 +620,12 @@ static int is_attention_tensor(const char *gguf_name) * For Q2_K: 256-weight superblocks. * * The 6 values per site correspond to 6 candidate scale factors: - * v=0: scale * 0.85 (aggressive, high compression) - * v=1: scale * 0.90 - * v=2: scale * 0.95 - * v=3: scale * 1.00 (standard) - * v=4: scale * 1.05 - * v=5: scale * 1.10 (conservative, less compression error) + * v=0: scale * 0.85 (aggressive, high compression) + * v=1: scale * 0.90 + * v=2: scale * 0.95 + * v=3: scale * 1.00 (standard) + * v=4: scale * 1.05 + * v=5: scale * 1.10 (conservative, less compression error) * * BP propagates: "if your neighbor block is sensitive, you should be * conservative too" — creating coherent precision allocation. 
@@ -637,12 +637,12 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = { }; /* ── Multi-quhit expanded scale table ── - * Search grid: 10×10 = 100 (d, dmin) candidates - * Quhit encoding: bin 10 → 6 for D=6 quhits (BP operates on 6-state marginals) - * Beam search: operates on all 100 candidates directly */ + * Search grid: 24×24 = 576 (d, dmin) candidates + * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals) + * Beam search: operates on all 576 candidates directly */ #define QUHITS_PER_BLOCK 2 -#define N_CAND_D 16 /* d multiplier candidates (was 10) */ -#define N_CAND_M 16 /* dmin multiplier candidates (was 10) */ +#define N_CAND_D 24 /* d multiplier candidates (expanded) */ +#define N_CAND_M 24 /* dmin multiplier candidates (expanded) */ #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M) static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES]; @@ -650,7 +650,7 @@ static int scale_table_initialized = 0; static void init_scale_table(void) { if (scale_table_initialized) return; - /* 100 candidates: uniform spacing centered on 1.0 */ + /* candidates: uniform spacing centered on 1.0 */ for (int i = 0; i < TOTAL_SCALE_CANDIDATES; i++) { SCALE_TABLE[i] = 0.50f + (float)i * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1)); } @@ -695,20 +695,15 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites) /* ═══════════════════════════════════════════════════════════════════════════ * FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search * - * powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4 - * where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation: - * x^0.4 ≈ sqrtf(cbrtf(x^2 × x^2)) but simpler: x^2 × sqrtf(sqrtf(x)) - * is close enough for error norm purposes (~1% relative error). + * powf() costs ~50-100 cycles. Use log2f+exp2f (~25 cycles) for the + * exact x^2.4 = x^2 × 2^(0.4·log2(x)) computation instead. 
* ═══════════════════════════════════════════════════════════════════════════ */ static inline float fast_pow_2_4(float x) { - /* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5)) - * x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain: - * x^0.4 ≈ sqrtf(sqrtf(x)) × x^(-0.1) — too complex. - * Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12) - * Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */ + /* x^2.4 = x^2 × 2^(0.4 × log2(x)). log2f+exp2f ≈ 25 cycles total vs + * 50-100 for powf, and produces the exact ^2.4 norm the grid search needs. Requires x ≥ 0: log2f of a negative x is NaN, whereas the removed cbrtf form accepted either sign. */ float x2 = x * x; - return x2 * sqrtf(cbrtf(x2)); /* x^2 × (x^2)^(1/6) ≈ x^(2+1/3) ≈ x^2.333 */ + return x2 * exp2f(0.4f * log2f(x)); /* x^2 × x^0.4 = x^2.4 */ } /* Compute the Q2_K sub-block reconstruction error for a block at a given @@ -743,16 +738,16 @@ static float compute_block_error_q2k(const float *weights, int block_size, } /* Build multi-quhit HPC sensitivity graph. - * 2 quhits per block → 36 scale candidates per block. + * 2 quhits per block → 576 scale candidates per block. 
* * Graph layout: sites [0..2*n-1] where: - * site 2*i = coarse quhit for block i - * site 2*i + 1 = fine quhit for block i + * site 2*i = coarse quhit for block i + * site 2*i + 1 = fine quhit for block i * * Edges: - * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling - * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor - * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */ + * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling + * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor + * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */ static HPCGraph *build_sensitivity_graph(const float *weights, int64_t n_elements, int block_size, @@ -774,13 +769,13 @@ static HPCGraph *build_sensitivity_graph(const float *weights, for (int64_t i = 0; i < n_sites; i++) triality_dft(&graph->locals[i]); - /* Compute errors for all 36 scale candidates per block, + /* Compute errors for all candidates per block, * then project onto coarse (quhit 0) and fine (quhit 1) marginals */ for (int64_t i = 0; i < graph_blocks; i++) { int64_t block_idx = i * stride; const float *block_weights = weights + block_idx * block_size; - /* Evaluate all 36 candidates */ + /* Evaluate all candidates */ float errors[TOTAL_SCALE_CANDIDATES]; float min_err = 1e30f; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { @@ -873,12 +868,12 @@ static HPCGraph *build_sensitivity_graph(const float *weights, * For a Q2_K sub-block, progressively shrink the min/max range to find * the candidate that minimizes weighted reconstruction error. * - * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink): - * candidate_min = p * min - * candidate_max = p * max - * error = ||x - quantize(x, candidate_min, candidate_max)||^norm - * if error < best: update best - * else: patience--; if patience == 0: break + * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] 
down to (1 - maxshrink): + * candidate_min = p * min + * candidate_max = p * max + * error = ||x - quantize(x, candidate_min, candidate_max)||^norm + * if error < best: update best + * else: patience--; if patience == 0: break * * This is a direct C port of llm-compressor's _grid_search_mse. * ═══════════════════════════════════════════════════════════════════════════ */ @@ -977,7 +972,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax, float cur_scale = best_scale; if (cur_scale > 1e-15f) { float iscale = 1.0f / cur_scale; - for (int itry = 0; itry < 3; itry++) { + for (int itry = 0; itry < 5; itry++) { float sumlx = 0; int suml2 = 0; for (int i = 0; i < n; i++) { @@ -992,8 +987,9 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax, float sum = 0; for (int i = 0; i < n; i++) sum += x[i] - cur_scale * L[i]; - cur_min = 0.7f * cur_min + 0.3f * sum / n; - if (cur_min > 0) cur_min = 0; + /* True coordinate-descent optimal: min* = sum/n (no momentum). + * Clamp to ≤ 0 since min must be non-positive by convention. */ + cur_min = fminf(0.0f, sum / n); if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale; } } @@ -1006,12 +1002,12 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax, * HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT * * Two-phase approach: - * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants) - * This produces per-sub-block (scale, min) with 16-step search. - * Phase B: HPC BP refines the superblock-level d/dmin rounding. - * 6 candidate (d, dmin) pairs are tested; BP finds the one - * where the GLOBAL reconstruction error is minimized via - * constructive interference of per-sub-block phase coherence. + * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants) + * This produces per-sub-block (scale, min) with 16-step search. + * Phase B: HPC BP refines the superblock-level d/dmin rounding. 
+ * 6 candidate (d, dmin) pairs are tested; BP finds the one + * where the GLOBAL reconstruction error is minimized via + * constructive interference of per-sub-block phase coherence. * ═══════════════════════════════════════════════════════════════════════════ */ /* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants). @@ -1174,22 +1170,22 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x, * Instead of iterative message-passing (BP), this uses the EXACT sequential * measurement protocol from Shor's algorithm: * - * For each block k (MSB → LSB): - * 1. Compute feed-forward phase correction from previously measured blocks - * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w) - * 3. Bake C_k into locals: α(d) *= C_k(d) - * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} - * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales - * 6. Born rule measurement → select optimal scale candidate - * 7. Collapse site + absorb edge weights into neighbors (back-action) + * For each block k (MSB → LSB): + * 1. Compute feed-forward phase correction from previously measured blocks + * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w) + * 3. Bake C_k into locals: α(d) *= C_k(d) + * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} + * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales + * 6. Born rule measurement → select optimal scale candidate + * 7. Collapse site + absorb edge weights into neighbors (back-action) * * This IS the quantum Fourier transform that creates constructive * interference at the optimal RMSE configuration, exactly as Shor's * algorithm creates interference at the correct period. 
* * Domain mapping: - * Factoring: oracle phase 2π×d×c_k/N → period r - * Quantize: error Boltzmann amplitudes → optimal RMSE block + * Factoring: oracle phase 2π×d×c_k/N → period r + * Quantize: error Boltzmann amplitudes → optimal RMSE block * ═══════════════════════════════════════════════════════════════════════════ */ /* ω₆ roots of unity for CZ phase lookup */ @@ -1280,17 +1276,17 @@ static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome) * Ported 1:1 from tesseract_factor.c lines 2343-2500. * * Measures sites MSB→LSB. For each site k: - * 1. Compute feed-forward phase correction θ_k from previously measured sites - * 2. Compute neighbor contribution C_k(d) analytically - * 3. Bake C_k into locals - * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} - * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6} - * 6. Compute |β(v)|² as measurement probabilities - * 7. Sample/argmax → outcome - * 8. Collapse + back-action via shor_collapse_site() + * 1. Compute feed-forward phase correction θ_k from previously measured sites + * 2. Compute neighbor contribution C_k(d) analytically + * 3. Bake C_k into locals + * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k} + * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6} + * 6. Compute |β(v)|² as measurement probabilities + * 7. Sample/argmax → outcome + * 8. Collapse + back-action via shor_collapse_site() * * Returns: marginals are written into marg_out[n_sites][6]. - * measured_out[n_sites] receives the measurement outcomes. + * measured_out[n_sites] receives the measurement outcomes. 
* ═══════════════════════════════════════════════════════════════════════════ */ static void shor_measure_graph(HPCGraph *graph, int64_t n_sites, double (*marg_out)[6], int *measured_out, @@ -1446,25 +1442,27 @@ static void shor_measure_graph(HPCGraph *graph, int64_t n_sites, * HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors) * * Same architecture as Q2_K HPC pipeline, but simpler: - * - One parameter per block (scale d only, no dmin) - * - Single quhit per block (6 states) - * - 10 candidate scales → bin to 6 for BP - * - 12-beam Hensel search for globally optimal configuration - * - Triality 3-view marginals for robust scoring + * - One parameter per block (scale d only, no dmin) + * - Single quhit per block (6 states) + * - 24 candidate scales → bin to 6 for BP + * - 48-beam Hensel search for globally optimal configuration + * - Triality 3-view marginals for robust scoring * * Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d * ═══════════════════════════════════════════════════════════════════════════ */ -#define Q4_N_CAND 16 /* scale candidates for Q4_0 (was 10) */ -#define Q4_N_BEAMS 24 /* beam width (was 12) */ +#define Q4_N_CAND 24 /* expanded scale candidates for Q4_0 */ +#define Q4_N_BEAMS 48 /* expanded beam width */ -/* Tight neighborhood around WLS optimum: ±10% */ +/* Tight neighborhood around WLS optimum */ static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = { - 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f, 0.975f, 0.985f, - 0.995f, 1.005f, 1.015f, 1.025f, 1.035f, 1.050f, 1.070f, 1.100f + 0.850f, 0.880f, 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f, + 0.975f, 0.985f, 0.995f, 1.000f, 1.005f, 1.015f, 1.025f, 1.035f, + 1.050f, 1.070f, 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f }; static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = { - 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; static void quantize_tensor_q4_0_hpc(const float *weights, int64_t 
n_elements, @@ -1474,6 +1472,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, int64_t n_blocks = n_elements / QK4_0; float total_err = 0.0f; + /* ── Compute tensor RMS (w_sigma) ── NOTE(review): the SA temperature added by this patch is seeded from metric_cur * 0.05f, and no added line in the visible hunks of this function reads w_sigma — confirm it is still needed ── */ + double t_sum_sq = 0.0; + for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i]; + float w_sigma = sqrtf(t_sum_sq / n_elements); + /* ── Phase 1: Greedy seed — compute scale per block ── */ float *greedy_d = (float *)calloc(n_blocks, sizeof(float)); @@ -1501,7 +1504,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, /* ── Step 2a: WLS solve to find optimal d* ── */ float wls_d = greedy_d[blk]; - for (int ls_iter = 0; ls_iter < 3; ls_iter++) { + uint16_t prev_wls_d16 = 0; + for (int ls_iter = 0; ls_iter < 5; ls_iter++) { if (wls_d < 1e-15f) break; float inv_d = 1.0f / wls_d; float num = 0.0f, den = 0.0f; @@ -1519,6 +1523,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f)) wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new)); } + uint16_t cur_wls_d16 = gguf_fp32_to_fp16(wls_d); + if (cur_wls_d16 == prev_wls_d16) break; /* converged in FP16 */ + prev_wls_d16 = cur_wls_d16; } /* ── Step 2b: Generate candidates centered on WLS optimum ── */ @@ -1529,36 +1536,30 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, cand_d16[blk][ci] = d16; float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f; - float err = 0.0f; - - for (int j = 0; j < QK4_0; j += 6) { - int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j); - int half_g = g_len / 2; - float e_cur[6], w_cur[6]; - - for (int kk = 0; kk < g_len; kk++) { - int idx = j + kk; - float x = bw[idx]; - int q = (int)(x * id + 8.5f); - if (q < 0) q = 0; if (q > 15) q = 15; - float deq = ((float)q - 8.0f) * actual_d; - e_cur[kk] = x - deq; - w_cur[kk] = (imat_importance) ? 
imat_importance[blk * QK4_0 + idx] : 1.0f; - } - - /* Decompose into vesica (DC) and wave (AC) components */ - float vesica_err = 0.0f, wave_err = 0.0f; - for (int p = 0; p < half_g; p++) { - float v = e_cur[p] + e_cur[p + half_g]; - float w_wave = e_cur[p] - e_cur[p + half_g]; - float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; - vesica_err += v * v * w_avg; - wave_err += w_wave * w_wave * w_avg; - } - /* Triality weighting: penalize vesica 4×, wave 1×. - * Factor of 0.5 keeps scale consistent with standard MSE. */ - err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); + + /* ── Single-unit D₆ error over all QK4_0 (32) elements ── + * Antipodal pairing: (j, j + QK4_0/2) for j in [0, QK4_0/2). + * Treating the whole block as one unit eliminates boundary + * artefacts from the old 6-element chunks and correctly captures + * long-range error correlations within the block. */ + float e_all[QK4_0], w_all[QK4_0]; + for (int j = 0; j < QK4_0; j++) { + float x = bw[j]; + int q = (int)(x * id + 8.5f); + if (q < 0) q = 0; if (q > 15) q = 15; + float deq = ((float)q - 8.0f) * actual_d; + e_all[j] = x - deq; + w_all[j] = (imat_importance) ? 
imat_importance[blk * QK4_0 + j] : 1.0f; } + float vesica_err = 0.0f, wave_err = 0.0f; + for (int j = 0; j < QK4_0 / 2; j++) { + float v = e_all[j] + e_all[j + QK4_0 / 2]; + float w_wave = e_all[j] - e_all[j + QK4_0 / 2]; + float w_avg = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f; + vesica_err += v * v * w_avg; + wave_err += w_wave * w_wave * w_avg; + } + float err = 0.5f * (4.0f * vesica_err + wave_err); cand_errors[blk][ci] = err; } } @@ -1566,7 +1567,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, /* ── Phase 3: HPC graph — single quhit per block ── */ int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); for (int64_t i = 0; i < n_blocks; i++) - best_candidate[i] = 10; /* Q4_NEIGHBOR_MULTS[10] = 1.00 */ + best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */ if (n_blocks >= 2) { float temperature = 0.5f; @@ -1752,7 +1753,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, group_cidx = history[curr_hist].cand_idx; curr_hist = history[curr_hist].parent_idx; } else { - group_cidx = 10; + group_cidx = 11; } if (stride <= 1) { @@ -1797,17 +1798,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, * The beam search found the MAP candidate sequence. But the * triality marginals encode quantum phase-coherent structure * that a greedy beam can miss. - * - * Like tesseract_factor's MCMC period recovery (lines 1920-1964): - * 1. Take N independent Born samples from triality marginals - * 2. Each sample → full candidate assignment across all blocks - * 3. Evaluate actual RMSE for each assignment - * 4. Keep assignment with lowest total RMSE - * - * Reuses the EXISTING converged Möbius sheet — zero new BP. 
* ══════════════════════════════════════════════════════════════ */ { - #define Q4_BORN_SHOTS 64 + #define Q4_BORN_SHOTS 128 /* Compute beam-search baseline RMSE for comparison */ float beam_total_err = 0.0f; @@ -1847,7 +1840,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, /* Find the best candidate WITHIN this quhit bin */ int64_t blk = gi * stride; float best_bin_err = 1e30f; - int best_bin_cand = 10; /* default */ + int best_bin_cand = 11; /* default */ for (int ci = 0; ci < Q4_N_CAND; ci++) { if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) { if (cand_errors[blk][ci] < best_bin_err) { @@ -1872,6 +1865,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, free(shot_sparse_q4); } + /* Born refinement pass: non-stride blocks were set during beam + * traceback and never revisited by Born shots. For each such block + * pick the lowest-error candidate within the same quhit bin that + * the winning Born shot chose for its stride-representative. */ + if (stride > 1) { + for (int64_t b = 0; b < n_blocks; b++) { + if (b % stride == 0) continue; + int64_t rep = (b / stride) * stride; + int target_bin = Q4_CAND_TO_QUHIT[best_candidate[rep]]; + float best_b_err = 1e30f; + int best_b_cand = best_candidate[rep]; + for (int ci = 0; ci < Q4_N_CAND; ci++) { + if (Q4_CAND_TO_QUHIT[ci] != target_bin) continue; + if (cand_errors[b][ci] < best_b_err) { + best_b_err = cand_errors[b][ci]; + best_b_cand = ci; + } + } + best_candidate[b] = best_b_cand; + } + } + free(marg); hpc_destroy(graph); } @@ -1879,22 +1894,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, /* ══════════════════════════════════════════════════════════════════ * PHASE 4: Assemble blocks via least-squares scale extraction - * - * The factorer assembles a frequency register from BP marginals, - * then EXTRACTS the exact period via continued fractions. 
- * - * We do the same: the beam search / Born shots selected a grid - * candidate (the "assembled frequency"). Now we EXTRACT the exact - * optimal FP16 scale via weighted least-squares (the "CF step"). - * - * For Q4_0: d_optimal = Σ(w_j × x_j × q̃_j) / Σ(w_j × q̃_j²) - * where q̃_j = (q_j - 8) and q_j is quantized at the grid scale. - * - * This iterates: quantize at d_init → compute d_optimal → re-quantize - * → re-compute until convergence. 3 iterations suffice since Q4_0 - * has only 16 levels — the assignment stabilizes immediately. - * - * The grid gave us 16 possible scales. This gives us 65,536 (all FP16). * ══════════════════════════════════════════════════════════════════ */ #pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err) @@ -1905,13 +1904,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, /* Start from the grid-selected scale (the "assembled frequency") */ float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]); - /* Analog assembly: iterate to full convergence. - * 5 iterations for stable (d, q-values) coupling. */ + /* Analog assembly: iterate to full convergence. 
*/ for (int ls_iter = 0; ls_iter < 5; ls_iter++) { if (d_current < 1e-15f) break; float id = 1.0f / d_current; - /* Quantize at current scale */ int qs_tmp[QK4_0]; for (int j = 0; j < QK4_0; j++) { int q = (int)(bw[j] * id + 8.5f); @@ -1919,8 +1916,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, qs_tmp[j] = q; } - /* Weighted least-squares: d = Σ(w × x × q̃) / Σ(w × q̃²) - * where q̃ = q - 8 (centered quantized value) */ float num = 0.0f, den = 0.0f; for (int j = 0; j < QK4_0; j++) { float q_centered = (float)qs_tmp[j] - 8.0f; @@ -1932,7 +1927,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, if (den > 1e-15f) { float d_new = num / den; - /* Clamp magnitude to prevent runaway (Q4_0 d can be negative) */ float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]); if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) { uint16_t d16 = gguf_fp32_to_fp16(d_new); @@ -1941,28 +1935,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, } } - /* ── FP16 ULP neighborhood search + sign-flip exploration ── - * The WLS solve found the continuous-optimal d. But FP16 truncation - * may shift the optimum. Try ±4 ULP around d in FP16 space, plus - * the negated scale, and pick the one with minimum reconstruction error. 
*/ + /* ── FP16 ULP neighborhood search + sign-flip exploration ── */ { uint16_t base_d16 = gguf_fp32_to_fp16(d_current); uint16_t best_d16 = base_d16; float best_ulp_err = 1e30f; - /* Try ±4 ULP neighborhood + sign flip = up to 17 candidates */ - uint16_t ulp_candidates[17]; + /* Try ±8 ULP neighborhood + sign flip = up to 34 candidates */ + uint16_t ulp_candidates[35]; int n_ulp = 0; - for (int delta = -4; delta <= 4; delta++) { + for (int delta = -8; delta <= 8; delta++) { int cand16 = (int)base_d16 + delta; - if (cand16 >= 0 && cand16 <= 0x7BFF) /* valid positive FP16 */ + if (cand16 >= 0 && cand16 <= 0x7BFF) ulp_candidates[n_ulp++] = (uint16_t)cand16; } - /* Sign-flipped d: negate and try ±0 ULP */ { float neg_d = -d_current; uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d); - ulp_candidates[n_ulp++] = neg_d16; + for (int delta = -8; delta <= 8; delta++) { + int cand16 = (int)neg_d16 + delta; + if (cand16 >= 0 && cand16 <= 0x7BFF) + ulp_candidates[n_ulp++] = (uint16_t)cand16; + } } for (int ui = 0; ui < n_ulp; ui++) { @@ -1984,18 +1978,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, d_current = gguf_fp16_to_fp32(best_d16); } - /* Store the extracted optimal FP16 scale */ output[blk].d = gguf_fp32_to_fp16(d_current); float actual_d = d_current; float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f; - /* ── D₆ Hadamard Error Shaping for Q4_0 ── - * 32 elements per block = 5 full D₆ groups of 6 + 2 tail. - * Apply the same antipodal fold as Q2_K: minimize vesica energy - * to push quantization noise into wave (high-frequency) modes - * that cancel in dot products. 
*/ - - /* Step 1: Standard nearest-rounding as baseline */ + /* ── D₆ Hadamard Error Shaping with Simulated Annealing ── */ int q_base[QK4_0], q_shaped[QK4_0]; float q_cont[QK4_0]; for (int j = 0; j < QK4_0; j++) { @@ -2006,73 +1993,83 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, } memcpy(q_shaped, q_base, QK4_0 * sizeof(int)); - /* Step 2: D₆ greedy flipping on 5 groups of 6 */ - for (int g = 0; g < 5; g++) { - int g_off = g * 6; - - for (int pass = 0; pass < 6; pass++) { - int best_k = -1; - int best_q_alt = 0; - float best_delta = 0.0f; - - /* Current group errors */ - float e_cur[6]; - for (int kk = 0; kk < 6; kk++) { - float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d; - e_cur[kk] = bw[g_off+kk] - deq; - } - - /* Current D₆ metric: vesica energy + DC² */ - float vesica_cur = 0.0f, dc_cur = 0.0f; - for (int p = 0; p < 3; p++) { - float v = e_cur[p] + e_cur[p+3]; - vesica_cur += v * v; - } - for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; - float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; - - /* Try flipping each element */ - for (int k = 0; k < 6; k++) { - int idx = g_off + k; - int q_cur = q_shaped[idx]; + { + float e_live[QK4_0]; + for (int j = 0; j < QK4_0; j++) { + float deq = ((float)q_shaped[j] - 8.0f) * actual_d; + e_live[j] = bw[j] - deq; + } - int q_try; - if (q_cont[idx] - (float)q_cur >= 0) { - q_try = q_cur + 1; - } else { - q_try = q_cur - 1; - } + float v_live[QK4_0 / 2]; + float vesica_cur = 0.0f, dc_cur = 0.0f; + for (int j = 0; j < QK4_0 / 2; j++) { + v_live[j] = e_live[j] + e_live[j + QK4_0 / 2]; + vesica_cur += v_live[j] * v_live[j]; + } + for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j]; + float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; + + /* Simulated Annealing parameters */ + float sa_temp = metric_cur * 0.05f; + float sa_decay = 0.90f; + + for (int pass = 0; pass < QK4_0; pass++) { + int best_k = -1; + int best_q_alt = 0; + float best_delta = -1e30f; + + for (int k = 0; k 
< QK4_0; k++) { + int q_cur = q_shaped[k]; + int q_try = (q_cont[k] - (float)q_cur >= 0.0f) + ? q_cur + 1 : q_cur - 1; if (q_try < 0 || q_try > 15) continue; - /* Alt errors */ - float e_alt[6]; - for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; float deq_try = ((float)q_try - 8.0f) * actual_d; - e_alt[k] = bw[idx] - deq_try; + float e_new = bw[k] - deq_try; + float de = e_new - e_live[k]; - /* Alt D₆ metric */ - float vesica_alt = 0.0f, dc_alt = 0.0f; - for (int p = 0; p < 3; p++) { - float v = e_alt[p] + e_alt[p+3]; - vesica_alt += v * v; - } - for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; + int pi = (k < QK4_0 / 2) ? k : k - QK4_0 / 2; + float v_old = v_live[pi]; + float v_new = v_old + de; + + float vesica_alt = vesica_cur - v_old * v_old + v_new * v_new; + float dc_alt = dc_cur + de; float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; float delta = metric_cur - metric_alt; if (delta > best_delta) { best_delta = delta; - best_k = k; + best_k = k; best_q_alt = q_try; } } if (best_k < 0) break; - q_shaped[g_off + best_k] = best_q_alt; + + /* SA Acceptance Rule */ + if (best_delta > 0.0f || (sa_temp > 1e-7f && expf(best_delta / sa_temp) > ((float)rand()/RAND_MAX))) { + q_shaped[best_k] = best_q_alt; + float deq_commit = ((float)best_q_alt - 8.0f) * actual_d; + float e_new_commit = bw[best_k] - deq_commit; + float de_commit = e_new_commit - e_live[best_k]; + + int pi_commit = (best_k < QK4_0 / 2) ? 
best_k : best_k - QK4_0 / 2; + float v_old_commit = v_live[pi_commit]; + float v_new_commit = v_old_commit + de_commit; + + vesica_cur += v_new_commit * v_new_commit - v_old_commit * v_old_commit; + dc_cur += de_commit; + metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; + + v_live[pi_commit] = v_new_commit; + e_live[best_k] = e_new_commit; + } else { + if (sa_temp < 1e-7f) break; + } + sa_temp *= sa_decay; } } - /* Step 3: Error comparison — keep shaped only if MSE doesn't worsen >5% */ float err_base = 0.0f, err_shaped = 0.0f; for (int j = 0; j < QK4_0; j++) { float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f; @@ -2081,9 +2078,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w; err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w; } - int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; + int *q_final = (err_shaped <= err_base) ? q_shaped : q_base; - /* Pack nibbles and compute error */ for (int j = 0; j < QK4_0 / 2; j++) { int q0 = q_final[j]; int q1 = q_final[j + QK4_0/2]; @@ -2114,15 +2110,22 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, init_scale_table(); + /* ── Outlier Clamping for WLS Seeds ── + * Protects the Phase 1 greedy seed from being violently warped by extreme + * outliers (values are clipped to ±3.5 × the tensor RMS), which gives better centering for the grid search. 
*/ + double t_sum_sq = 0.0; + for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i]; + float w_sigma = sqrtf(t_sum_sq / n_elements); + float clamp_val = w_sigma * 3.5f; + /* ══════════════════════════════════════════════════════════════════ * PHASE 1: Greedy quantization — produce seed (d, dmin) per block * ══════════════════════════════════════════════════════════════════ */ - /* Store Phase A/B results for all blocks */ typedef struct { - float dm, mm; /* greedy d, dmin (fp32) */ - uint16_t d_fp16, dmin_fp16; /* greedy d, dmin (fp16) */ - uint8_t Ls[16], Lm[16]; /* sub-block scale/min indices */ + float dm, mm; + uint16_t d_fp16, dmin_fp16; + uint8_t Ls[16], Lm[16]; float scales[16], mins[16], sw[16]; } BlockSeed; @@ -2138,15 +2141,21 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i]; float sigma2 = sumx2 / (float)QK_K; + /* Phase 1 WLS uses clamped values to generate stable seeds */ + float sx_clipped[16]; for (int j = 0; j < N_SUB; j++) { const float *sx = block_x + 16 * j; seeds[blk].sw[j] = 0; for (int l = 0; l < 16; l++) { float imp = (imat_importance) ? 
imat_importance[blk * QK_K + 16 * j + l] : 1.0f; - wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]); + float v = sx[l]; + if (v > clamp_val) v = clamp_val; + if (v < -clamp_val) v = -clamp_val; + sx_clipped[l] = v; + wt[l] = imp * sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]); seeds[blk].sw[j] += wt[l]; } - seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt, + seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt, L + 16 * j, &seeds[blk].mins[j], Laux); } @@ -2160,36 +2169,30 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, /* ══════════════════════════════════════════════════════════════════ * PHASE 2: WLS-Optimal Candidate Generation - * - * Instead of a fixed multiplier grid centered on greedy seeds, - * we first solve a 3-iteration Weighted Least-Squares to find - * the true optimal (d*, dmin*) per block, then generate the - * 16×16 candidate grid centered on THOSE optimal values. - * This makes the candidate space data-driven, not fabricated. * ══════════════════════════════════════════════════════════════════ */ - /* Wide neighborhood around WLS optimum: ±20% with asymmetric spacing - * — finer near 1.0 for precision, wider at edges for exploration. - * Critical for large-σ weights where the optimal (d,dmin) may be - * far from the WLS seed. 
*/ + /* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */ static const float NEIGHBOR_MULTS_D[N_CAND_D] = { - 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, - 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f + 0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f, + 0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f, + 1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f, + 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f }; static const float NEIGHBOR_MULTS_M[N_CAND_M] = { - 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f, - 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f + 0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f, + 0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f, + 1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f, + 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f }; - /* Map 16 candidates → 6 quhit states for BP encoding */ - static const int CAND_TO_QUHIT[16] = { - 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5 + /* Map 24 candidates → 6 quhit states for BP encoding */ + static const int CAND_TO_QUHIT[24] = { + 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; - /* candidate_errors[blk][256] — weighted MSE per candidate */ float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL; uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL; uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL; - /* Per-candidate Ls/Lm — must recompute for each (d, dmin) */ uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL; uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL; @@ -2208,18 +2211,23 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, for (int64_t blk = 0; blk < n_blocks; blk++) { const float *block_x = weights + blk * QK_K; - /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── - * Seed from Phase 1 greedy, iterate 3× to converge. 
- * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] - * This is a 2-variable WLS: minimize Σ w×(x - d×a + dmin×b)² */ + /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── */ float wls_dm = seeds[blk].dm; float wls_mm = seeds[blk].mm; uint8_t wls_Ls[16], wls_Lm[16]; memcpy(wls_Ls, seeds[blk].Ls, 16); memcpy(wls_Lm, seeds[blk].Lm, 16); + /* Generate soft-clipped buffer for WLS internal stability */ + float clipped_block_x[QK_K]; + for(int i=0; i clamp_val) v = clamp_val; + if (v < -clamp_val) v = -clamp_val; + clipped_block_x[i] = v; + } + for (int ls_iter = 0; ls_iter < 5; ls_iter++) { - /* Quantize all elements at current (wls_dm, wls_mm) */ uint8_t L_wls[QK_K]; for (int j = 0; j < N_SUB; j++) { float d_sub = wls_dm * (float)wls_Ls[j]; @@ -2229,19 +2237,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, continue; } for (int k = 0; k < 16; k++) { - int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); + int q = gguf_nearest_int((clipped_block_x[16*j+k] + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; L_wls[16*j+k] = (uint8_t)q; } } - /* Accumulate 2×2 normal equations */ double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; for (int j = 0; j < N_SUB; j++) { float ls_f = (float)wls_Ls[j]; float lm_f = (float)wls_Lm[j]; for (int k = 0; k < 16; k++) { - float x = block_x[16*j+k]; + float x = clipped_block_x[16*j+k]; float w = (imat_importance) ? 
imat_importance[blk * QK_K + 16*j+k] : 1.0f; float a = ls_f * (float)L_wls[16*j+k]; @@ -2254,19 +2261,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } } - /* Solve via Cramer's rule */ double det = Saa * Sbb - Sab * Sab; if (fabs(det) > 1e-30) { double d_new = (Sbb * Sxa - Sab * Sxb) / det; double dm_new = (Sab * Sxa - Saa * Sxb) / det; - /* Clamp: positive and within 4× of seed (prevent runaway) */ if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10)) wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new)); if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10)) wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); } - /* Re-derive Ls/Lm for updated (d*, dmin*) */ for (int j = 0; j < N_SUB; j++) { if (wls_dm > 1e-15f) { int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm); @@ -2281,9 +2285,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } } - /* ── Step 2b: Generate 16×16 candidates centered on WLS optimum ── - * Grid is now centered on (wls_dm, wls_mm) not (greedy_dm, greedy_mm). - * Tighter spacing because we're already near the true minimum. 
*/ + /* ── Step 2b: Generate Candidates ── */ for (int di = 0; di < N_CAND_D; di++) { float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di]; uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm); @@ -2298,87 +2300,58 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, candidate_d[blk][cidx] = trial_d16; candidate_dmin[blk][cidx] = trial_dmin16; - /* Recompute Ls/Lm for THIS candidate dm/mm */ uint8_t trial_Ls[16], trial_Lm[16]; for (int j = 0; j < N_SUB; j++) { if (actual_dm > 1e-15f) { int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm); if (ls < 0) ls = 0; if (ls > 15) ls = 15; trial_Ls[j] = (uint8_t)ls; - } else { - trial_Ls[j] = 0; - } + } else { trial_Ls[j] = 0; } if (actual_mm > 1e-15f) { int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm); if (lm < 0) lm = 0; if (lm > 15) lm = 15; trial_Lm[j] = (uint8_t)lm; - } else { - trial_Lm[j] = 0; - } + } else { trial_Lm[j] = 0; } } memcpy(candidate_Ls[blk][cidx], trial_Ls, 16); memcpy(candidate_Lm[blk][cidx], trial_Lm, 16); - /* Fully re-quantize and measure weighted MSE */ - float err = 0.0f; - for (int j = 0; j < N_SUB; j++) { - float d = actual_dm * (float)trial_Ls[j]; - float m = actual_mm * (float)trial_Lm[j]; + /* Error evaluation MUST use the non-clipped original weights */ + float e_all[QK_K], w_all[QK_K]; + for (int i = 0; i < QK_K; i++) { + int jj = i >> 4; + float d = actual_dm * (float)trial_Ls[jj]; + float m = actual_mm * (float)trial_Lm[jj]; + float x = block_x[i]; + w_all[i] = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f; if (d < 1e-15f) { - for (int k = 0; k < 16; k++) { - float x = block_x[16 * j + k]; - float w = (imat_importance) ? - imat_importance[blk * QK_K + 16 * j + k] : 1.0f; - err += x * x * w; - } - continue; - } - for (int k = 0; k < 16; k += 6) { - int g_len = (k + 6 <= 16) ? 
6 : (16 - k); - int half_g = g_len / 2; - float e_cur[6], w_cur[6]; - - for (int kk = 0; kk < g_len; kk++) { - int idx = 16 * j + k + kk; - float x = block_x[idx]; - int q = gguf_nearest_int((x + m) / d); - if (q < 0) q = 0; if (q > 3) q = 3; - float deq = d * (float)q - m; - e_cur[kk] = x - deq; - w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f; - } - - /* Decompose into vesica and wave */ - float vesica_err = 0.0f, wave_err = 0.0f; - for (int p = 0; p < half_g; p++) { - float v = e_cur[p] + e_cur[p + half_g]; - float w_wave = e_cur[p] - e_cur[p + half_g]; - float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f; - vesica_err += v * v * w_avg; - wave_err += w_wave * w_wave * w_avg; - } - /* Triality weighting: penalize vesica 4×, wave 1× */ - err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err); + e_all[i] = x; + } else { + int q = gguf_nearest_int((x + m) / d); + if (q < 0) q = 0; if (q > 3) q = 3; + e_all[i] = x - (d * (float)q - m); } } - candidate_errors[blk][cidx] = err; + float vesica_err = 0.0f, wave_err = 0.0f; + for (int i = 0; i < QK_K / 2; i++) { + float v = e_all[i] + e_all[i + QK_K / 2]; + float w_wave = e_all[i] - e_all[i + QK_K / 2]; + float w_avg = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f; + vesica_err += v * v * w_avg; + wave_err += w_wave * w_wave * w_avg; + } + candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err); } } } /* ══════════════════════════════════════════════════════════════════ * PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement - * - * Build a multi-quhit graph where each block has 2 quhits - * encoding the 36 candidate errors. Shor's sequential measurement - * (IDFT6 + feed-forward + collapse/back-action) extracts exact - * marginals for optimal (d, dmin) per block — replaces BP. 
* ══════════════════════════════════════════════════════════════════ */ - /* Default: use greedy candidate (index 5*10+5 = 55, mult 1.00×1.00) */ int *best_candidate = (int *)malloc(n_blocks * sizeof(int)); for (int64_t i = 0; i < n_blocks; i++) - best_candidate[i] = 10 * N_CAND_M + 10; /* NEIGHBOR_MULTS_D[10]=1.00, _M[10]=1.00 */ + best_candidate[i] = 11 * N_CAND_M + 11; /* index 11 = 1.0 multiplier */ if (opt_mode != OPT_MSE && n_blocks >= 2) { int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks; @@ -2391,14 +2364,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, for (int64_t i = 0; i < n_sites; i++) triality_dft(&graph->locals[i]); - /* Encode each stride group's AGGREGATED candidate errors as dual-quhit - * amplitudes. For stride > 1, average errors across ALL blocks in - * the group — not just the first block. This is critical for large - * tensors where stride=97 means 96/97 blocks were being ignored. */ - - /* Compute adaptive temperature from median error spread. - * This ensures the Boltzmann encoding produces meaningful distributions - * regardless of weight magnitude (σ=0.0003 vs σ=0.024). 
*/ { double err_accum = 0.0; int err_count = 0; @@ -2413,18 +2378,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } if (err_count > 0) { float median_err = (float)(err_accum / err_count); - /* Temperature = 10% of median max error — sharp enough to - * discriminate, soft enough for Shor interference */ temperature = median_err * 0.1f; if (temperature < 1e-10f) temperature = 1e-10f; } } for (int64_t i = 0; i < graph_blocks; i++) { - /* Aggregate errors across entire stride group */ float agg_errors[TOTAL_SCALE_CANDIDATES]; - for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) - agg_errors[c] = 0.0f; + for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f; int64_t blk_start = i * stride; int64_t blk_end = blk_start + stride; @@ -2435,7 +2396,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] += candidate_errors[b][c]; } - /* Average across group */ if (group_size > 1) { float inv_gs = 1.0f / (float)group_size; for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) @@ -2447,7 +2407,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, if (agg_errors[c] < min_err) min_err = agg_errors[c]; - /* Quhit 0 (coarse = d dimension): marginalize over dmin */ double coarse_re[6]; double coarse_norm = 0.0; for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0; @@ -2459,14 +2418,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, (2.0 * (double)temperature)); } } - for (int qi = 0; qi < 6; qi++) - coarse_norm += coarse_re[qi] * coarse_re[qi]; + for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi]; if (coarse_norm > 1e-30) { double inv = 1.0 / sqrt(coarse_norm); for (int v = 0; v < 6; v++) coarse_re[v] *= inv; } - /* Quhit 1 (fine = dmin dimension): marginalize over d */ double fine_re[6]; double fine_norm = 0.0; for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0; @@ 
-2478,14 +2435,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, (2.0 * (double)temperature)); } } - for (int qi = 0; qi < 6; qi++) - fine_norm += fine_re[qi] * fine_re[qi]; + for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi]; if (fine_norm > 1e-30) { double inv = 1.0 / sqrt(fine_norm); for (int v = 0; v < 6; v++) fine_re[v] *= inv; } - /* Write quhits */ int64_t s0 = 2 * i, s1 = 2 * i + 1; for (int v = 0; v < 6; v++) { graph->locals[s0].edge_re[v] = coarse_re[v]; @@ -2503,31 +2458,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, triality_update_mask(&graph->locals[s1]); } - /* Build edges */ for (int64_t i = 0; i < graph_blocks; i++) { - hpc_cz(graph, 2 * i, 2 * i + 1); /* intra-block: d ↔ dmin */ + hpc_cz(graph, 2 * i, 2 * i + 1); if (i + 1 < graph_blocks) { - hpc_cz(graph, 2 * i, 2 * (i + 1)); /* d ↔ d neighbor */ - hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* dmin ↔ dmin */ + hpc_cz(graph, 2 * i, 2 * (i + 1)); + hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); } } - /* ── Shor's Griffiths-Niu Sequential Measurement (dual quhit) ── - * Replaces BP with exact marginals via IDFT6 + feed-forward + - * collapse/back-action (ported 1:1 from tesseract_factor.c). - * - * The dual-quhit graph has 2×graph_blocks sites: - * Even sites (s0 = 2*i): coarse (d dimension) - * Odd sites (s1 = 2*i+1): fine (dmin dimension) - * - * Single-pass sequential measurement produces exact marginals - * for both dimensions simultaneously through the CZ correlations. 
*/ double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6])); int *shor_measured = (int *)calloc(n_sites, sizeof(int)); shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1); - /* Extract coarse (d) and fine (dmin) marginals from Shor output */ double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); double (*fine_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6])); @@ -2541,277 +2484,233 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, free(shor_marg); free(shor_measured); - /* ══ Hensel-Inspired Beam Search Constraint Propagation ══ - * Like tesseract_factor's Hensel lift: process blocks sequentially, - * maintain K best configurations, prune by accumulated error. + /* ══════════════════════════════════════════════════════════════ + * PHASE 3 — DETERMINISTIC VITERBI DP * - * The constraint: blocks are selected JOINTLY. */ - - #define N_BEAMS 24 /* K beams — widened for 31B (was 12) */ - - typedef struct { - double acc_error; - int history_idx; /* index into the backpointer array */ - } QuantBeam; - - typedef struct { - int cand_idx; - int parent_idx; - } BeamHistory; - - QuantBeam beams[N_BEAMS]; - int active_beams = 1; - - /* Pre-allocate history to avoid O(N^2) memory copies */ - BeamHistory *history = (BeamHistory *)malloc(n_blocks * N_BEAMS * sizeof(BeamHistory)); - - for (int b = 0; b < N_BEAMS; b++) { - beams[b].acc_error = 0.0; - beams[b].history_idx = -1; - } + * Replaces the probabilistic beam-search + Born-rule Monte-Carlo + * shots with an exact, fully-deterministic DP over the 36-state + * Shor quhit space (6 coarse bins × 6 fine bins). 
+ * + * For each graph block i and combined state s = qi_d*6 + qi_m: + * + * bin_best_err[i][s] = min candidate error in that (d,m)-bin + * aggregated over the stride group + * bin_log_prior[i][s] = log P_coarse(qi_d) + log P_fine(qi_m) + * from Shor marginals → HPC prior bonus + * + * Local Viterbi cost (lower = better): + * vcost[i][s] = bin_best_err[i][s] + * − VITERBI_BETA × scale_err × bin_log_prior[i][s] + * + * Transition cost (cross-block smoothness prior): + * trans(s′→s) = VITERBI_ALPHA × scale_err + * × (|qi_d − qi_d′| + |qi_m − qi_m′|) + * + * DP recurrence: + * dp[0][s] = vcost[0][s] + * dp[i][s] = vcost[i][s] + min_{s′}(dp[i-1][s′] + trans(s′→s)) + * + * Traceback yields the globally optimal sequence of bin choices, + * which is then mapped to per-block best_candidate[] indices. + * A 5%-threshold greedy override rescues blocks where the local + * MSE-optimal candidate is meaningfully better than the bin winner. + * ══════════════════════════════════════════════════════════════ */ - /* Process blocks sequentially with beam search */ - for (int64_t i = 0; i < graph_blocks; i++) { - double c_total = 0.0, f_total = 0.0; - for (int v = 0; v < 6; v++) { - c_total += coarse_marg[i][v]; - f_total += fine_marg[i][v]; - } + #define VIT_N_STATES 36 /* 6 coarse × 6 fine quhit bins */ + #define VITERBI_BETA 0.25f /* log-prior bonus weight */ + #define VITERBI_ALPHA 0.08f /* cross-block smoothness penalty weight */ - /* Candidate scores for this block: triality prob × (1/normalized_error) */ - double cand_score[TOTAL_SCALE_CANDIDATES]; - int64_t blk = i * stride; - int d_bin_count[6] = {0}, m_bin_count[6] = {0}; - for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++; - for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++; - /* Per-block error normalization: divide by block mean error - * so small-weight blocks don't dominate beam selection */ - float blk_mean_err = 0.0f; - for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) - blk_mean_err 
+= candidate_errors[blk][c]; - blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES; - if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f; - for (int di = 0; di < N_CAND_D; di++) { - int qi_d = CAND_TO_QUHIT[di]; - double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0; - p_d /= (double)d_bin_count[qi_d]; - for (int mi = 0; mi < N_CAND_M; mi++) { - int qi_m = CAND_TO_QUHIT[mi]; - double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0; - p_m /= (double)m_bin_count[qi_m]; - int cidx = di * N_CAND_M + mi; - cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15); + { + int64_t vit_gi, vit_b; + int vit_s, vit_sp; + + /* Per-graph-block per-state workspace */ + float (*vit_bin_err )[VIT_N_STATES] = + (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES])); + int (*vit_bin_cand)[VIT_N_STATES] = + (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int [VIT_N_STATES])); + float (*vit_log_pri )[VIT_N_STATES] = + (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES])); + float (*vit_dp )[VIT_N_STATES] = + (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES])); + int (*vit_back )[VIT_N_STATES] = + (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int [VIT_N_STATES])); + + /* ── Step A: build per-block per-bin statistics ── */ + for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) { + double c_tot = 0.0, f_tot = 0.0; + + for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) { + vit_bin_err [vit_gi][vit_s] = 1e30f; + vit_bin_cand[vit_gi][vit_s] = -1; } - } - /* Extend beams × 36 candidates, keep top K */ - typedef struct { double score; int beam_idx; int cand_idx; } BeamExt; - BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES]; - int n_ext = 0; + /* Best candidate per (qi_d, qi_m) bin over stride group */ + for (vit_b = vit_gi * stride; + vit_b < (vit_gi + 1) * stride && vit_b < n_blocks; + vit_b++) { + int vit_c; + for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; 
vit_c++) { + int qi_d = CAND_TO_QUHIT[vit_c / N_CAND_M]; + int qi_m = CAND_TO_QUHIT[vit_c % N_CAND_M]; + vit_s = qi_d * 6 + qi_m; + float e = candidate_errors[vit_b][vit_c]; + if (e < vit_bin_err[vit_gi][vit_s]) { + vit_bin_err[vit_gi][vit_s] = e; + /* Canonical candidate = stride-rep block's best */ + if (vit_b == vit_gi * stride) + vit_bin_cand[vit_gi][vit_s] = vit_c; + } + } + } - for (int b = 0; b < active_beams; b++) { - for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { - /* Score = -(accumulated_error + this_block_error) × triality_prob */ - double ext_err = beams[b].acc_error + candidate_errors[blk][c]; - double ext_score = cand_score[c] / (ext_err + 1e-15); - extensions[n_ext].score = ext_score; - extensions[n_ext].beam_idx = b; - extensions[n_ext].cand_idx = c; - n_ext++; + /* HPC log-prior from Shor marginals */ + for (int v = 0; v < 6; v++) { + c_tot += coarse_marg[vit_gi][v]; + f_tot += fine_marg [vit_gi][v]; + } + for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) { + int qi_d = vit_s / 6, qi_m = vit_s % 6; + double pc = (c_tot > 1e-30) + ? coarse_marg[vit_gi][qi_d] / c_tot : 1.0/6.0; + double pf = (f_tot > 1e-30) + ? fine_marg [vit_gi][qi_m] / f_tot : 1.0/6.0; + vit_log_pri[vit_gi][vit_s] = + (float)(log(pc + 1e-30) + log(pf + 1e-30)); } } - /* Top-K selection */ - int top_k = (n_ext < N_BEAMS) ? 
n_ext : N_BEAMS; - int top_indices[N_BEAMS]; - for (int k = 0; k < top_k; k++) { - int best = -1; - double best_s = -1e30; - for (int e = 0; e < n_ext; e++) { - if (extensions[e].score > best_s) { - best_s = extensions[e].score; - best = e; + /* ── Step B: scale_err normaliser for transition cost ── */ + float vit_scale_err = 0.0f; + int vit_scale_cnt = 0; + for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) { + for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) { + if (vit_bin_err[vit_gi][vit_s] < 1e29f) { + vit_scale_err += vit_bin_err[vit_gi][vit_s]; + vit_scale_cnt++; } } - top_indices[k] = best; - extensions[best].score = -2e30; /* poison */ } - - /* Build new beams from top-K extensions using backpointers */ - QuantBeam new_beams[N_BEAMS]; - for (int k = 0; k < top_k; k++) { - int ext_idx = top_indices[k]; - int src_beam = extensions[ext_idx].beam_idx; - int cand = extensions[ext_idx].cand_idx; - - int hist_idx = i * N_BEAMS + k; - history[hist_idx].cand_idx = cand; - history[hist_idx].parent_idx = beams[src_beam].history_idx; - - new_beams[k].history_idx = hist_idx; - new_beams[k].acc_error = beams[src_beam].acc_error - + candidate_errors[blk][cand]; + vit_scale_err = (vit_scale_cnt > 0) + ? vit_scale_err / (float)vit_scale_cnt : 1e-10f; + if (vit_scale_err < 1e-20f) vit_scale_err = 1e-20f; + + /* ── Step C: Forward Viterbi pass ── */ + + /* Block 0 — no predecessor */ + for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) { + float local = (vit_bin_err[0][vit_s] < 1e29f) + ? vit_bin_err[0][vit_s] + - VITERBI_BETA * vit_scale_err * vit_log_pri[0][vit_s] + : 1e30f; + vit_dp [0][vit_s] = local; + vit_back[0][vit_s] = -1; } - for (int k = 0; k < top_k; k++) - beams[k] = new_beams[k]; - active_beams = top_k; - } - - /* Trace back the best beam's selections. - * The beam search selects one candidate per GRAPH NODE (stride group). 
- * For stride > 1, each block within the stride group independently - * picks its own best candidate — using the beam's coarse/fine quhit - * bins as a constraint, but evaluating its own candidate_errors. - * This eliminates stride-aliasing: previously 96/97 blocks were - * forced to use a candidate chosen for 1 representative block. */ - int curr_hist = beams[0].history_idx; - for (int64_t i = graph_blocks - 1; i >= 0; i--) { - int group_cidx; - if (curr_hist >= 0) { - group_cidx = history[curr_hist].cand_idx; - curr_hist = history[curr_hist].parent_idx; - } else { - group_cidx = 10 * N_CAND_M + 10; - } - - if (stride <= 1) { - /* No stride group — direct assignment */ - best_candidate[i] = group_cidx; - } else { - /* Per-block local optimization within the stride group. - * The beam-selected candidate determines the target quhit - * bins (d_bin, dmin_bin). Each block picks its own best - * candidate that falls in compatible bins, or falls back - * to the globally best candidate for that block. 
*/ - int group_di = group_cidx / N_CAND_M; - int group_mi = group_cidx % N_CAND_M; - int target_d_bin = CAND_TO_QUHIT[group_di]; - int target_m_bin = CAND_TO_QUHIT[group_mi]; - - for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) { - /* Find best candidate in same quhit bins */ - float best_err = 1e30f; - int best_c = group_cidx; - - for (int di = 0; di < N_CAND_D; di++) { - if (CAND_TO_QUHIT[di] != target_d_bin) continue; - for (int mi = 0; mi < N_CAND_M; mi++) { - if (CAND_TO_QUHIT[mi] != target_m_bin) continue; - int cidx = di * N_CAND_M + mi; - if (candidate_errors[b][cidx] < best_err) { - best_err = candidate_errors[b][cidx]; - best_c = cidx; - } - } + /* Blocks 1..graph_blocks-1 */ + for (vit_gi = 1; vit_gi < graph_blocks; vit_gi++) { + for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) { + float local; + float best_pred = 1e30f; + int best_sp = 0; + int qi_d = vit_s / 6; + int qi_m = vit_s % 6; + + if (vit_bin_err[vit_gi][vit_s] > 1e29f) { + vit_dp [vit_gi][vit_s] = 1e30f; + vit_back[vit_gi][vit_s] = 0; + continue; } - - /* Also check if the block's overall best is significantly - * better — if so, use it (greedy override) */ - float global_best = 1e30f; - int global_best_c = group_cidx; - for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) { - if (candidate_errors[b][c] < global_best) { - global_best = candidate_errors[b][c]; - global_best_c = c; + local = vit_bin_err[vit_gi][vit_s] + - VITERBI_BETA * vit_scale_err * vit_log_pri[vit_gi][vit_s]; + + /* Min-cost predecessor with Manhattan transition penalty */ + for (vit_sp = 0; vit_sp < VIT_N_STATES; vit_sp++) { + float prev = vit_dp[vit_gi - 1][vit_sp]; + if (prev > 1e29f) continue; + int td = abs(qi_d - (vit_sp / 6)); + int tm = abs(qi_m - (vit_sp % 6)); + float trans = VITERBI_ALPHA * vit_scale_err * (float)(td + tm); + float total = prev + trans; + if (total < best_pred) { + best_pred = total; + best_sp = vit_sp; } } - - /* Use bin-constrained choice unless the global best - * is >5% better — 
preserves Shor coherence while - * allowing escape from bad bin assignments */ - if (global_best < best_err * 0.95f) - best_candidate[b] = global_best_c; - else - best_candidate[b] = best_c; + vit_dp [vit_gi][vit_s] = (best_pred < 1e29f) + ? best_pred + local : 1e30f; + vit_back[vit_gi][vit_s] = best_sp; } } - } - - free(history); - /* ══════════════════════════════════════════════════════════════ - * Phase 3.5: Born-Rule Multi-Shot Scale Refinement (Q2_K) - * - * 2D Born sampling: sample coarse quhit (d dimension) and - * fine quhit (dmin dimension) jointly from triality marginals. - * Each shot produces a (d_idx, dmin_idx) pair per block. - * ══════════════════════════════════════════════════════════════ */ - { - #define Q2K_BORN_SHOTS 64 - - float beam_total_err = 0.0f; - for (int64_t bi = 0; bi < n_blocks; bi++) - beam_total_err += candidate_errors[bi][best_candidate[bi]]; - - unsigned int born_rng_q2 = 271828; - /* Compute tail error once (blocks beyond graph coverage) */ - float tail_err = 0.0f; - for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++) - tail_err += candidate_errors[bi][best_candidate[bi]]; - - /* Sparse shot buffer: only track stride-sampled blocks */ - int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int)); - - for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) { - float shot_err = tail_err; - - for (int64_t gi = 0; gi < graph_blocks; gi++) { - /* Born sample coarse (d) quhit */ - double c_total = 0.0; - for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v]; - born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; - double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0; - double target_c = rnd_c * c_total; - double cum_c = 0.0; - int qi_d = 5; - for (int v = 0; v < 6; v++) { - cum_c += coarse_marg[gi][v]; - if (cum_c > target_c) { qi_d = v; break; } - } - - /* Born sample fine (dmin) quhit */ - double f_total = 0.0; - for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v]; - born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u; 
- double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0; - double target_f = rnd_f * f_total; - double cum_f = 0.0; - int qi_m = 5; - for (int v = 0; v < 6; v++) { - cum_f += fine_marg[gi][v]; - if (cum_f > target_f) { qi_m = v; break; } + /* ── Step D: Traceback ── */ + int *vit_path = (int *)malloc(graph_blocks * sizeof(int)); + { + int best_s = 0; + float best_f = vit_dp[graph_blocks - 1][0]; + for (vit_s = 1; vit_s < VIT_N_STATES; vit_s++) { + if (vit_dp[graph_blocks - 1][vit_s] < best_f) { + best_f = vit_dp[graph_blocks - 1][vit_s]; + best_s = vit_s; } + } + vit_path[graph_blocks - 1] = best_s; + for (vit_gi = graph_blocks - 2; vit_gi >= 0; vit_gi--) + vit_path[vit_gi] = vit_back[vit_gi + 1][vit_path[vit_gi + 1]]; + } - /* Find best candidate within the sampled (d_bin, m_bin) */ - int64_t blk = gi * stride; - float best_bin_err = 1e30f; - int best_bin_cand = 10 * N_CAND_M + 10; - for (int di = 0; di < N_CAND_D; di++) { - if (CAND_TO_QUHIT[di] != qi_d) continue; - for (int mi = 0; mi < N_CAND_M; mi++) { - if (CAND_TO_QUHIT[mi] != qi_m) continue; - int cidx = di * N_CAND_M + mi; - if (candidate_errors[blk][cidx] < best_bin_err) { - best_bin_err = candidate_errors[blk][cidx]; - best_bin_cand = cidx; - } + /* ── Step E: Map Viterbi path → best_candidate[] ── */ + for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) { + vit_s = vit_path[vit_gi]; + int qi_d = vit_s / 6; + int qi_m = vit_s % 6; + int64_t blk_rep = vit_gi * stride; + + /* Stride-representative block: use precomputed bin winner */ + if (vit_bin_cand[vit_gi][vit_s] >= 0) + best_candidate[blk_rep] = vit_bin_cand[vit_gi][vit_s]; + + /* Non-representative blocks in the stride group */ + for (vit_b = blk_rep + 1; + vit_b < (vit_gi + 1) * stride && vit_b < n_blocks; + vit_b++) { + int vit_c; + float best_e = 1e30f; + int best_c = best_candidate[blk_rep]; + for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) { + if (CAND_TO_QUHIT[vit_c / N_CAND_M] != qi_d) continue; + if (CAND_TO_QUHIT[vit_c % N_CAND_M] != 
qi_m) continue; + if (candidate_errors[vit_b][vit_c] < best_e) { + best_e = candidate_errors[vit_b][vit_c]; + best_c = vit_c; } } - - shot_sparse[gi] = best_bin_cand; - shot_err += candidate_errors[blk][best_bin_cand]; + best_candidate[vit_b] = best_c; } + } - if (shot_err < beam_total_err) { - /* Only now apply the sparse updates to best_candidate */ - for (int64_t gi = 0; gi < graph_blocks; gi++) - best_candidate[gi * stride] = shot_sparse[gi]; - beam_total_err = shot_err; + /* ── Step F: 5 % greedy override (pure MSE safety net) ── */ + for (vit_b = 0; vit_b < n_blocks; vit_b++) { + int vit_c; + float cur_err = candidate_errors[vit_b][best_candidate[vit_b]]; + float g_best = cur_err; + int g_cand = best_candidate[vit_b]; + for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) { + if (candidate_errors[vit_b][vit_c] < g_best) { + g_best = candidate_errors[vit_b][vit_c]; + g_cand = vit_c; + } } + if (g_best < cur_err * 0.95f) + best_candidate[vit_b] = g_cand; } - free(shot_sparse); + free(vit_path); + free(vit_dp); + free(vit_back); + free(vit_bin_err); + free(vit_bin_cand); + free(vit_log_pri); } free(coarse_marg); @@ -2819,7 +2718,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, hpc_destroy(graph); } } else { - /* OPT_MSE or single block: pick candidate with lowest raw error */ for (int64_t blk = 0; blk < n_blocks; blk++) { float best_err = candidate_errors[blk][0]; int best_idx = 0; @@ -2834,27 +2732,80 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } /* ══════════════════════════════════════════════════════════════════ - * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction + * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS + * + * Transforms the tensor from a collection of isolated 256-element + * Q2_K superblocks into a single, continuous error-cancelling waveform. 
+ * + * After Phase 3 has selected the optimal (d, dmin) candidate for every + * block, this sequential pass computes the net DC residual left by each + * block using a cheap round-nearest forward quantization, then feeds the + * negated, exponentially-decayed residual as a correction bias into the + * WLS solver of the immediately following block. + * + * Mathematically, for block N with final DC residual R_N = Σ εᵢ: + * + * dc_bias[N+1] = −DC_DECAY × R_N / QK_K (per-element offset) * - * Like Q4_0's CF analog: the beam search / Born shots selected a - * grid candidate (d_grid, dmin_grid). Now we EXTRACT the exact - * optimal FP16 (d, dmin) via weighted least-squares, holding the - * sub-block Ls/Lm and quantized levels fixed. + * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the + * quantizer toward codes whose reconstruction deq ≈ x′, so that * - * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] + * Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N * - * Full analog assembly: at each iteration, EXHAUSTIVELY search - * all 16×16 = 256 possible (Ls[j], Lm[j]) pairs per sub-block - * to find the assignment that minimizes weighted reconstruction - * error. Then WLS-solve for the global (d, dmin). Repeat 5×. + * The accumulated cross-block DC collapses geometrically: * - * This guarantees every parameter is at its conditional optimum — - * the perfect bit analog at 2-bit resolution. + * R₀, DC_DECAY·R₀, DC_DECAY²·R₀, … → 0 + * + * The result is written into block_dc_bias[n_blocks]. Phase 4 reads + * this array (safe: written sequentially before the parallel loop). 
+ * ══════════════════════════════════════════════════════════════════ */ + + #define DC_DECAY 0.85f /* Boundary-condition leak factor (0 = isolated, 1 = full) */ + + float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float)); + + if (block_dc_bias) { + float rolling_dc = 0.0f; + + for (int64_t blk = 0; blk < n_blocks; blk++) { + const float *bx = weights + blk * QK_K; + int cidx = best_candidate[blk]; + float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]); + float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); + + /* Bias applied to THIS block's WLS targets. NOTE(review): the sign is positive here, but the Phase 3.9 header specifies −DC_DECAY × R_N / QK_K; with x′ = x − dc_bias a positive bias reinforces the previous residual instead of cancelling it — confirm. */ + float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K; + block_dc_bias[blk] = dc_bias; + + /* Quick round-nearest quant to estimate DC residual for NEXT block. + * We quantize the adjusted target x′ = x − dc_bias, then measure + * the residual of the ORIGINAL weight against the chosen code. */ + float dc_res = 0.0f; + int j, k; + for (j = 0; j < N_SUB; j++) { + float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j]; + float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j]; + for (k = 0; k < 16; k++) { + float x_adj = bx[16*j + k] - dc_bias; + int q = 0; + if (d_sub >= 1e-15f) { + q = gguf_nearest_int((x_adj + m_sub) / d_sub); + if (q < 0) q = 0; + if (q > 3) q = 3; + } + float deq = d_sub * (float)q - m_sub; + /* Residual against ORIGINAL weight (not adjusted) */ + dc_res += bx[16*j + k] - deq; + } + } + rolling_dc = dc_res; + } + } + + /* ══════════════════════════════════════════════════════════════════ + * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction * ══════════════════════════════════════════════════════════════════ */ - /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement. - * This eliminates ~776K malloc/free cycles from the inner loop. - * Each thread reuses its graph via hpc_reset_for_subblock(). 
*/ int _n_omp_threads = 1; #ifdef _OPENMP _n_omp_threads = omp_get_max_threads(); @@ -2869,32 +2820,36 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, int cidx = best_candidate[blk]; uint8_t Ls_blk[16], Lm_blk[16]; - /* Start from HPC-selected candidate */ + /* ── Rolling DC boundary condition ────────────────────────────── + * dc_adj shifts every WLS target in this block so that the net + * quantisation error steers toward cancelling the previous block's + * DC residual (written by the sequential Phase 3.9 pre-pass). */ + float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f; + + /* Adjusted weight view — WLS and Shor work on this array; + * the final error is always reported against the original block_x. */ + float adj_block_x[QK_K]; + { + int _i; + for (_i = 0; _i < QK_K; _i++) + adj_block_x[_i] = block_x[_i] - dc_adj; + } + memcpy(Ls_blk, candidate_Ls[blk][cidx], 16); memcpy(Lm_blk, candidate_Lm[blk][cidx], 16); float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]); float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); - /* ── Analog assembly: iterate to convergence ── - * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes - * after 2-3 passes. Additional iterations produce negligible - * change in the committed FP16 values. - * A) Sub-block Shor measurement to find coupled (Ls,Lm) states - * B) Optimal q-value assignment - * C) WLS solve for (d, dmin) */ - for (int ls_iter = 0; ls_iter < 3; ls_iter++) { - - /* ── Step A: Sub-block Quhit BP (Strategy 1) ── - * For each sub-block j, evaluate all 256 (Ls, Lm) pairs. - * Keep the 6 best pairs as quhit states for a 16-node graph. - * Run BP to jointly select the globally optimal (Ls, Lm). 
*/ + uint16_t prev_dm16 = 0, prev_mm16 = 0; + for (int ls_iter = 0; ls_iter < 5; ls_iter++) { + uint8_t state_ls[N_SUB][6]; uint8_t state_lm[N_SUB][6]; float state_err[N_SUB][6]; for (int j = 0; j < N_SUB; j++) { - const float *sx = block_x + 16 * j; + const float *sx = adj_block_x + 16 * j; for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f; for (int try_ls = 0; try_ls <= 15; try_ls++) { @@ -2917,7 +2872,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, sub_err += diff * diff * w; } - /* Insert into top 6 */ for (int v = 0; v < 6; v++) { if (sub_err < state_err[j][v]) { for (int u = 5; u > v; u--) { @@ -2935,7 +2889,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } } - /* Reset thread-local sub-block graph (zero allocations) */ int _tid = 0; #ifdef _OPENMP _tid = omp_get_thread_num(); @@ -2946,19 +2899,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, float min_sub_err[N_SUB]; for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0]; - /* Initialize unary potentials from local errors */ for (int j = 0; j < N_SUB; j++) { triality_dft(&sg->locals[j]); double amp_re[6]; double amp_norm = 0.0; for (int v = 0; v < 6; v++) { - /* Adaptive temperature: scale with local error spread - * so Shor measurement produces meaningful interference - * patterns regardless of weight magnitude */ - float err_spread = state_err[j][5] - state_err[j][0]; - float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f; - if (sub_temp < 1e-12f) sub_temp = 1e-12f; - amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp); + float err_spread = state_err[j][5] - state_err[j][0]; + float sub_temp = (err_spread > 1e-15f) ? 
err_spread * 0.3f : 0.1f; + if (sub_temp < 1e-12f) sub_temp = 1e-12f; + amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp); amp_norm += amp_re[v] * amp_re[v]; } if (amp_norm > 1e-30) { @@ -2975,12 +2924,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, triality_update_mask(&sg->locals[j]); } - /* Add coupling edges between adjacent sub-blocks */ for (int j = 0; j < N_SUB - 1; j++) hpc_cz(sg, j, j + 1); - /* ── Shor sequential measurement on sub-block graph ── - * Stack-allocated arrays: eliminates 2 calloc/free per iteration */ double sub_marg[N_SUB][6]; int sub_measured[N_SUB]; memset(sub_marg, 0, sizeof(sub_marg)); @@ -2988,7 +2934,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1); - /* Extract optimal Ls/Lm from Shor marginals */ for (int j = 0; j < N_SUB; j++) { double best_prob = -1.0; int best_v = 0; @@ -3003,7 +2948,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, } } - /* ── Step B: Quantize q-values with optimal Ls/Lm ── */ uint8_t L[QK_K]; for (int j = 0; j < N_SUB; j++) { float d_sub = dm * (float)Ls_blk[j]; @@ -3013,22 +2957,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, continue; } for (int k = 0; k < 16; k++) { - int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub); + int q = gguf_nearest_int((adj_block_x[16*j+k] + m_sub) / d_sub); if (q < 0) q = 0; if (q > 3) q = 3; L[16*j+k] = (uint8_t)q; } } - /* ── Step C: WLS solve for (d, dmin) ── - * x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j] - * Let a = Ls[j]×q[j,k], b = Lm[j] - * Normal equations via Cramer's rule */ double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0; for (int j = 0; j < N_SUB; j++) { float ls_f = (float)Ls_blk[j]; float lm_f = (float)Lm_blk[j]; for (int k = 0; k < 16; k++) { - float x = block_x[16*j+k]; + float x = adj_block_x[16*j+k]; float w = (imat_importance) ? 
imat_importance[blk * QK_K + 16*j+k] : 1.0f; float a = ls_f * (float)L[16*j+k]; @@ -3045,7 +2985,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, if (fabs(det) > 1e-30) { double d_new = (Sbb * Sxa - Sab * Sxb) / det; double dm_new = (Sab * Sxa - Saa * Sxb) / det; - /* Clamp: positive and within 4× of candidate seed */ float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]); float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]); if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10)) @@ -3053,28 +2992,27 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10)) mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new)); } - if (isnan(dm) || isnan(mm)) { - printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det); - exit(1); - } + + uint16_t cur_dm16 = gguf_fp32_to_fp16(dm); + uint16_t cur_mm16 = gguf_fp32_to_fp16(mm); + if (cur_dm16 == prev_dm16 && cur_mm16 == prev_mm16) break; + prev_dm16 = cur_dm16; + prev_mm16 = cur_mm16; } - /* ── FP16 ULP neighborhood search for (d, dmin) ── - * The WLS solve found continuous-optimal (d, dmin). But FP16 - * truncation may shift the optimum. Try ±4 ULP around both - * d and dmin, pick the pair with minimum reconstruction error. 
*/ + /* ── FP16 ULP neighborhood search for (d, dmin) — Expanded to ±8 ── */ { uint16_t base_d16 = gguf_fp32_to_fp16(dm); uint16_t base_m16 = gguf_fp32_to_fp16(mm); uint16_t best_d16 = base_d16, best_m16 = base_m16; float best_ulp_err = 1e30f; - for (int dd = -2; dd <= 2; dd++) { + for (int dd = -8; dd <= 8; dd++) { int cd16 = (int)base_d16 + dd; if (cd16 < 0 || cd16 > 0x7BFF) continue; float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16); - for (int dm_delta = -2; dm_delta <= 2; dm_delta++) { + for (int dm_delta = -8; dm_delta <= 8; dm_delta++) { int cm16 = (int)base_m16 + dm_delta; if (cm16 < 0 || cm16 > 0x7BFF) continue; float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16); @@ -3084,7 +3022,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, float d_sub = trial_dm * (float)Ls_blk[j]; float m_sub = trial_mm * (float)Lm_blk[j]; for (int k = 0; k < 16; k++) { - float x = block_x[16*j+k]; + float x = adj_block_x[16*j+k]; float w = (imat_importance) ? imat_importance[blk * QK_K + 16*j+k] : 1.0f; int q; @@ -3109,21 +3047,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, mm = gguf_fp16_to_fp32(best_m16); } - /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ── - * The WLS solve may have shifted (d, dmin) after the last Step A. - * Neighborhood search ±2 around current values (25 pairs vs 256) - * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */ for (int j = 0; j < N_SUB; j++) { - const float *sx = block_x + 16 * j; + const float *sx = adj_block_x + 16 * j; float best_sub_err = 1e30f; uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j]; - int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0; - int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15; - int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0; - int lm_hi = (Lm_blk[j] < 13) ? 
Lm_blk[j] + 2 : 15; - for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) { + for (int try_ls = 0; try_ls <= 15; try_ls++) { float d_sub = dm * (float)try_ls; - for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) { + for (int try_lm = 0; try_lm <= 15; try_lm++) { float m_sub = mm * (float)try_lm; float sub_err = 0.0f; for (int k = 0; k < 16; k++) { @@ -3151,150 +3081,201 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, Lm_blk[j] = best_lm; } - /* Store the extracted optimal FP16 (d, dmin) */ output[blk].d = gguf_fp32_to_fp16(dm); output[blk].dmin = gguf_fp32_to_fp16(mm); for (int j = 0; j < N_SUB; j++) output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4); - /* ── Final quantization with D₆ Hadamard Error Shaping ── + /* ── Final quantization: D₆ Hadamard Greedy Descent (deterministic) ── * - * Standard Q2_K rounds each weight independently: q = round((x+m)/d). - * But within a sub-block, weights share (d, m), so their quantization - * errors are CORRELATED. Independent rounding is suboptimal. + * The original Simulated Annealing acceptance rule is replaced by a + * strict greedy descent: only accept a flip if it strictly reduces the + * D₆ Hadamard metric (4·‖vesica‖² + DC²). This makes error shaping + * fully deterministic and thread-safe (no rand() inside omp parallel), + * consistent with the Viterbi philosophy applied in Phase 3. * - * The D₆ fold (antipodal Hadamard from the triality quhit) decomposes - * the error vector into vesica (sum) and wave (difference) components: - * vesica[k] = (e[k] + e[k+3]) / √2 — DC-like, accumulates in dot products - * wave[k] = (e[k] - e[k+3]) / √2 — noise-like, cancels in dot products - * - * We WANT large wave error and small vesica error. So we greedily - * flip rounding decisions (floor↔ceil) to minimize vesica energy, - * even if total element-wise error increases slightly. - * - * Process: 16 elements per sub-block, treat as 2 groups of 6 + 4 tail. 
- * Apply DFT₆-fold to each group of 6, minimize vesica component. + * The metric measures both: + * - Vesica Piscis term: correlated error between weights i and i+QK_K/2 + * (targets the first non-DC harmonic — halfwave symmetry) + * - DC term: total signed error across the 256-weight superblock + * (estimated up front per block by the Phase 3.9 pre-pass, which feeds it forward as dc_bias) */ uint8_t L[QK_K]; - for (int j = 0; j < N_SUB; j++) { - float d = dm * (float)(output[blk].scales[j] & 0xF); - if (d < 1e-15f) { - for (int k = 0; k < 16; k++) L[16 * j + k] = 0; - continue; + { + float q_cont_all[QK_K]; + int q_base_all[QK_K]; + int q_shaped_all[QK_K]; + + for (int i = 0; i < QK_K; i++) { + int jj = i >> 4; + float d_s = dm * (float)(output[blk].scales[jj] & 0xF); + float m_s = mm * (float)(output[blk].scales[jj] >> 4); + if (d_s < 1e-15f) { + q_cont_all[i] = 0.0f; + q_base_all[i] = 0; + } else { + /* Quantize the DC-adjusted target */ + float qc = (adj_block_x[i] + m_s) / d_s; + q_cont_all[i] = qc; + int qr = gguf_nearest_int(qc); + if (qr < 0) qr = 0; if (qr > 3) qr = 3; + q_base_all[i] = qr; + } } - float m = mm * (float)(output[blk].scales[j] >> 4); - float id = 1.0f / d; - - /* Step 1: Standard nearest-rounding as baseline */ - int q_base[16]; - float q_cont[16]; /* continuous q values before rounding */ - for (int k = 0; k < 16; k++) { - q_cont[k] = (block_x[16*j+k] + m) * id; - q_base[k] = gguf_nearest_int(q_cont[k]); - if (q_base[k] < 0) q_base[k] = 0; - if (q_base[k] > 3) q_base[k] = 3; + memcpy(q_shaped_all, q_base_all, QK_K * sizeof(int)); + + float e_live[QK_K]; + for (int i = 0; i < QK_K; i++) { + int jj = i >> 4; + float d_s = dm * (float)(output[blk].scales[jj] & 0xF); + float m_s = mm * (float)(output[blk].scales[jj] >> 4); + float deq = (d_s > 1e-15f) ? 
(d_s * (float)q_shaped_all[i] - m_s) : 0.0f; + /* Residual against the adjusted target (DC-corrected view) */ + e_live[i] = adj_block_x[i] - deq; } - /* Step 2: D₆ Hadamard Error Shaping - * For each 6-element group, greedily flip the rounding decision - * that most reduces the D₆-folded vesica error component. - * - * D₆ fold on 6-element groups: antipodal pairs (0,3), (1,4), (2,5) - * vesica[k] = e[k] + e[k+3] (k=0,1,2) — DC-like, propagates - * wave[k] = e[k] - e[k+3] (k=0,1,2) — noise-like, cancels - * - * Weight vesica 4× over wave + penalize DC (sum of all 6 errors) */ - int q_shaped[16]; - memcpy(q_shaped, q_base, 16 * sizeof(int)); - - /* Process groups: [0..5], [6..11], tail [12..15] handled by D₆ metric on available pairs */ - for (int g = 0; g < 2; g++) { - int g_off = g * 6; - if (g_off + 5 >= 16) break; - - /* Multiple greedy passes — each pass finds the single best flip */ - for (int pass = 0; pass < 6; pass++) { - int best_k = -1; - int best_q_alt = 0; - float best_delta = 0.0f; /* improvement = current_metric - alt_metric */ - - /* Compute current group errors */ - float e_cur[6]; - for (int kk = 0; kk < 6; kk++) { - int ii = g_off + kk; - float deq = d * (float)q_shaped[ii] - m; - e_cur[kk] = block_x[16*j+ii] - deq; - } + float v_live[QK_K / 2]; + float vesica_cur = 0.0f, dc_cur = 0.0f; + for (int i = 0; i < QK_K / 2; i++) { + v_live[i] = e_live[i] + e_live[i + QK_K / 2]; + vesica_cur += v_live[i] * v_live[i]; + } + for (int i = 0; i < QK_K; i++) dc_cur += e_live[i]; + float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; - /* Current D₆ metric: vesica energy + DC² */ - float vesica_cur = 0.0f, dc_cur = 0.0f; - for (int p = 0; p < 3; p++) { - float v = e_cur[p] + e_cur[p+3]; - vesica_cur += v * v; - } - for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk]; - float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; - - /* Try flipping each element */ - for (int k = 0; k < 6; k++) { - int idx = g_off + k; - int q_cur = q_shaped[idx]; - - /* Try the 
alternative rounding */ - int q_try; - if (q_cont[idx] - (float)q_cur >= 0) { - q_try = q_cur + 1; - } else { - q_try = q_cur - 1; - } - if (q_try < 0 || q_try > 3) continue; - - /* Compute alt errors (only element k changes) */ - float e_alt[6]; - for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk]; - float deq_try = d * (float)q_try - m; - e_alt[k] = block_x[16*j+idx] - deq_try; - - /* Alt D₆ metric */ - float vesica_alt = 0.0f, dc_alt = 0.0f; - for (int p = 0; p < 3; p++) { - float v = e_alt[p] + e_alt[p+3]; - vesica_alt += v * v; - } - for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk]; - float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt; - - float delta = metric_cur - metric_alt; - if (delta > best_delta) { - best_delta = delta; - best_k = k; - best_q_alt = q_try; - } + /* Deterministic greedy descent: accept only strict improvements */ + for (int pass = 0; pass < QK_K; pass++) { + int best_k = -1; + int best_q_alt = 0; + float best_delta = 0.0f; /* strictly positive threshold */ + + for (int k = 0; k < QK_K; k++) { + int jj = k >> 4; + float d_s = dm * (float)(output[blk].scales[jj] & 0xF); + if (d_s < 1e-15f) continue; + + int q_cur = q_shaped_all[k]; + int q_try = (q_cont_all[k] - (float)q_cur >= 0.0f) + ? q_cur + 1 : q_cur - 1; + if (q_try < 0 || q_try > 3) continue; + + float m_s = mm * (float)(output[blk].scales[jj] >> 4); + float e_new = adj_block_x[k] - (d_s * (float)q_try - m_s); + float de = e_new - e_live[k]; + + int pi = (k < QK_K / 2) ? 
k : k - QK_K / 2; + float v_new = v_live[pi] + de; + + float vesica_alt = vesica_cur - v_live[pi]*v_live[pi] + v_new*v_new; + float dc_alt = dc_cur + de; + float delta = metric_cur - (4.0f * vesica_alt + dc_alt * dc_alt); + + if (delta > best_delta) { + best_delta = delta; + best_k = k; + best_q_alt = q_try; } + } - if (best_k < 0) break; /* no improvement found */ - q_shaped[g_off + best_k] = best_q_alt; /* commit the flip */ + if (best_k < 0) break; /* converged — no further improvement */ + + q_shaped_all[best_k] = best_q_alt; + { + int jj_c = best_k >> 4; + float d_c = dm * (float)(output[blk].scales[jj_c] & 0xF); + float m_c = mm * (float)(output[blk].scales[jj_c] >> 4); + float e_new_c = adj_block_x[best_k] - (d_c * (float)best_q_alt - m_c); + float de_c = e_new_c - e_live[best_k]; + int pi_c = (best_k < QK_K / 2) ? best_k : best_k - QK_K / 2; + float v_new_c = v_live[pi_c] + de_c; + vesica_cur += v_new_c * v_new_c - v_live[pi_c] * v_live[pi_c]; + dc_cur += de_c; + metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur; + v_live[pi_c] = v_new_c; + e_live[best_k]= e_new_c; } } - /* Step 3: Final error comparison — only keep shaped if it improves - * or is within 5% of baseline (vesica shaping trades element MSE - * for better spectral distribution of error) */ + /* Choose base vs shaped by comparing MSE against original weights */ float err_base = 0.0f, err_shaped = 0.0f; - for (int k = 0; k < 16; k++) { - float x = block_x[16*j+k]; - float w = (imat_importance) ? - imat_importance[blk * QK_K + 16*j + k] : 1.0f; - float deq_b = d * (float)q_base[k] - m; - float deq_s = d * (float)q_shaped[k] - m; - err_base += (x - deq_b) * (x - deq_b) * w; - err_shaped += (x - deq_s) * (x - deq_s) * w; + for (int i = 0; i < QK_K; i++) { + int jj = i >> 4; + float d_s = dm * (float)(output[blk].scales[jj] & 0xF); + float m_s = mm * (float)(output[blk].scales[jj] >> 4); + float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f; + float deq_b = (d_s > 1e-15f) ? 
(d_s * (float)q_base_all[i] - m_s) : 0.0f; + float deq_s = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f; + float xv = block_x[i]; /* original weight for error report */ + err_base += (xv - deq_b) * (xv - deq_b) * w; + err_shaped += (xv - deq_s) * (xv - deq_s) * w; + } + { + int use_shaped = (err_shaped <= err_base); + for (int i = 0; i < QK_K; i++) + L[i] = (uint8_t)(use_shaped ? q_shaped_all[i] : q_base_all[i]); } + } + + /* ── Cross-weight error diffusion — intra-sub-block Floyd-Steinberg ── + * + * Implements cross-weight error diffusion within each 16-weight sub-block. + * After the greedy descent has committed quantisation codes, the residual + * of each weight is partially propagated forward to the next position in + * the same sub-block (7/16 of the error), re-quantising if the diffused + * target falls in a different bin. + * + * This is the "cross-weight" dimension of error diffusion: + * neighbouring weights share and partially absorb each other's rounding + * error, shaping the within-block spectrum away from the DC component + * that Phase 3.9 already propagates between blocks. + * + * Staying within sub-blocks avoids scale-mismatch artefacts that would + * arise from diffusing across the dm * Ls[j] boundary between sub-blocks. + * + * The diffused codes are accepted only when they reduce the weighted MSE + * against the ORIGINAL weight (not the adjusted target), so the diffusion + * cannot increase the total reconstruction error. 
+ */ + { + int fs_j, fs_k; + for (fs_j = 0; fs_j < N_SUB; fs_j++) { + int base = fs_j * 16; + float d_s = dm * (float)(output[blk].scales[fs_j] & 0xF); + float m_s = mm * (float)(output[blk].scales[fs_j] >> 4); + if (d_s < 1e-15f) continue; + + float carry = 0.0f; /* FS carry from position k-1 */ + + for (fs_k = 0; fs_k < 16; fs_k++) { + int idx = base + fs_k; + float x_orig = block_x[idx]; + float x_adj = adj_block_x[idx] + carry; /* adjusted + diffused */ + + /* Propose new code from diffused target */ + int q_fs = gguf_nearest_int((x_adj + m_s) / d_s); + if (q_fs < 0) q_fs = 0; if (q_fs > 3) q_fs = 3; + + if (q_fs != (int)L[idx]) { + /* Accept only when MSE against original weight improves */ + float w_imp = (imat_importance) + ? imat_importance[blk * QK_K + idx] : 1.0f; + float deq_old = d_s * (float)L[idx] - m_s; + float deq_new = d_s * (float)q_fs - m_s; + float e_old = (x_orig - deq_old) * (x_orig - deq_old) * w_imp; + float e_new = (x_orig - deq_new) * (x_orig - deq_new) * w_imp; + if (e_new < e_old) + L[idx] = (uint8_t)q_fs; + } - int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base; - for (int k = 0; k < 16; k++) - L[16 * j + k] = (uint8_t)q_final[k]; + /* Propagate 7/16 of the residual (adj target vs committed code) */ + { + float deq_final = d_s * (float)L[idx] - m_s; + float residual = (adj_block_x[idx] - deq_final); + carry = (fs_k < 15) ? 
residual * (7.0f / 16.0f) : 0.0f; + } + } + } } for (int j = 0; j < QK_K; j += 128) { @@ -3315,11 +3296,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, total_err += berr; } - /* Free thread-local sub-block graphs */ for (int _ti = 0; _ti < _n_omp_threads; _ti++) hpc_destroy(_tl_graphs[_ti]); free(_tl_graphs); + free(block_dc_bias); free(seeds); free(candidate_errors); free(candidate_d); @@ -3332,14 +3313,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements, if (verbose) { float rmse = sqrtf(total_err / (float)n_elements); - /* Compute weight σ for fidelity classification */ double w_sum2 = 0.0; for (int64_t i = 0; i < n_elements; i++) w_sum2 += (double)weights[i] * (double)weights[i]; - float w_sigma = (float)sqrt(w_sum2 / (double)n_elements); + w_sigma = (float)sqrt(w_sum2 / (double)n_elements); float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f; - /* Fidelity classification */ const char *fidelity_class; const char *fidelity_icon; if (rmse <= 1.0e-04f) { @@ -3493,8 +3472,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, if (should_quantize(ti, gguf_names[i])) { if (is_attention_tensor(gguf_names[i])) { - /* Promote attention Q/K/V/O to Q4_0 for higher precision. - * Attention scores are most sensitive to quantization noise. 
*/ tensor_types[i] = GGML_TYPE_Q4_0; int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0; tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0); @@ -3506,18 +3483,15 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements); } } else if (ti->n_dims >= 2) { - /* 2D non-quantized tensors (embeddings, output) → F16 */ tensor_types[i] = GGML_TYPE_F16; tensor_sizes[i] = ti->n_elements * sizeof(uint16_t); } else { - /* 1D tensors (norms, biases) → F32 */ tensor_types[i] = GGML_TYPE_F32; tensor_sizes[i] = ti->n_elements * sizeof(float); } tensor_offsets[i] = data_offset; - /* Align each tensor to 32 bytes */ data_offset += tensor_sizes[i]; data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) & ~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1); @@ -3592,7 +3566,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, int src = tensor_src_idx[i]; const STTensorInfo *ti = st_multi_tensor_info(mf, src); uint64_t dims[ST_MAX_DIMS]; - /* GGUF uses reversed dimension order from SafeTensors/PyTorch */ int nd = ti->n_dims; for (int d = 0; d < nd; d++) { dims[d] = (uint64_t)ti->shape[nd - 1 - d]; @@ -3622,7 +3595,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, print_progress_bar(i, total_tensors, gguf_names[i], quant_start); if (tensor_types[i] == GGML_TYPE_Q2_K) { - /* ── HPC-Optimized Q2_K Quantization ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", @@ -3633,7 +3605,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, int64_t n_elements = ti->n_elements; float tensor_error = 0.0f; - /* Pad to QK_K boundary */ int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K; if (padded > n_elements) { f32_data = realloc(f32_data, padded * sizeof(float)); @@ -3645,7 +3616,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, int64_t n_blocks 
= n_elements / QK_K; BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K)); - /* Look up imatrix importance for this tensor */ const float *imp = NULL; if (imatrix) { const IMatrixEntry *ime = imatrix_find_any(imatrix, @@ -3666,13 +3636,11 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, float rmse = sqrtf(tensor_error / (float)ti->n_elements); - /* Compute weight σ for fidelity gate */ double wss = 0.0; for (int64_t j = 0; j < ti->n_elements; j++) wss += (double)f32_data[j] * (double)f32_data[j]; float w_sig = (float)sqrt(wss / (double)ti->n_elements); - /* Fidelity gate: classify RMSE vs 1e-04 target */ const char *fid; if (rmse <= 1.0e-04f) fid = "★★★★ ULTRA"; else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH"; @@ -3695,7 +3663,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, free(quant_data); free(f32_data); } else if (tensor_types[i] == GGML_TYPE_Q4_0) { - /* ── HPC-Optimized Q4_0 Quantization (attention tensors) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n", @@ -3705,7 +3672,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, int64_t n_elements = ti->n_elements; - /* Pad to QK4_0 boundary */ int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0; if (padded > n_elements) { f32_data = realloc(f32_data, padded * sizeof(float)); @@ -3718,7 +3684,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0)); float tensor_error = 0.0f; - /* Look up imatrix importance for this tensor */ const float *imp = NULL; if (imatrix) { const IMatrixEntry *ime = imatrix_find_any(imatrix, @@ -3739,7 +3704,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, float rmse = sqrtf(tensor_error / (float)ti->n_elements); - /* Compute weight σ for fidelity gate */ double wss4 = 0.0; for (int64_t j = 0; j < ti->n_elements; 
j++) wss4 += (double)f32_data[j] * (double)f32_data[j]; @@ -3767,7 +3731,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, free(q4_data); free(f32_data); } else if (tensor_types[i] == GGML_TYPE_F16) { - /* ── Store as F16 (embeddings, output, 2D non-quantized) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", @@ -3775,7 +3738,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, continue; } - /* Convert F32 → F16 */ uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t)); for (int64_t j = 0; j < ti->n_elements; j++) f16_data[j] = gguf_fp32_to_fp16(f32_data[j]); @@ -3793,7 +3755,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, free(f16_data); free(f32_data); } else { - /* ── Keep as F32 (1D: norms, biases) ── */ float *f32_data = st_multi_tensor_to_f32(mf, src); if (!f32_data) { fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n", @@ -3814,7 +3775,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, free(f32_data); } - /* Pad to alignment */ gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT); } @@ -3823,8 +3783,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, long final_size = ftell(fp); fclose(fp); - /* ── Final summary with Shor fidelity metrics ── */ - /* Compute original model size (all as F32) */ int64_t original_f32_size = 0; for (int i = 0; i < total_tensors; i++) { const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]); @@ -3840,7 +3798,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf, float mean_mse_per_tensor = (quant_count > 0) ? 
total_error_sum / (float)quant_count : 0.0f; - /* Fidelity classification */ const char *overall_fid, *overall_icon; if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)"; overall_icon = "★★★★"; } else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)"; overall_icon = "★★★☆"; } @@ -3931,12 +3888,12 @@ void hexstate_init(void) /* Quantize a single tensor's F32 data to Q2_K using HPC optimization. * * Parameters: - * weights: input F32 data (must be padded to multiple of 256) - * n_elements: number of elements (must be multiple of 256) - * output: output buffer (must be n_elements/256 * 84 bytes) - * out_error: pointer to receive total MSE (can be NULL) - * opt_mode: 0=HPC, 1=MSE, 2=Hybrid (recommended) - * verbose: 1 for per-block diagnostics + * weights: input F32 data (must be padded to multiple of 256) + * n_elements: number of elements (must be multiple of 256) + * output: output buffer (must be n_elements/256 * 84 bytes) + * out_error: pointer to receive total MSE (can be NULL) + * opt_mode: 0=HPC, 1=MSE, 2=Hybrid (recommended) + * verbose: 1 for per-block diagnostics */ void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements, void *output, float *out_error, @@ -3967,12 +3924,12 @@ int hexstate_q2k_block_elements(void) { return QK_K; } /* HPC-optimized Q4_0 quantization for attention tensors. * Called from Python requantizer via ctypes. 
- * weights: input F32 weights - * n_elements: number of elements (must be multiple of 32) - * output: output buffer (must be n_elements/32 * 18 bytes) - * out_error: pointer to receive total MSE (can be NULL) - * imat_importance: optional per-element importance weights - * verbose: 1 for per-block diagnostics + * weights: input F32 weights + * n_elements: number of elements (must be multiple of 32) + * output: output buffer (must be n_elements/32 * 18 bytes) + * out_error: pointer to receive total MSE (can be NULL) + * imat_importance: optional per-element importance weights + * verbose: 1 for per-block diagnostics */ void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements, void *output, float *out_error, @@ -4220,4 +4177,4 @@ int main(int argc, char **argv) st_multi_close(mf); return result; } -#endif /* HEXSTATE_LIBRARY */ +#endif /* HEXSTATE_LIBRARY */ \ No newline at end of file