diff --git "a/hexstate_quantize.c" "b/hexstate_quantize.c"
--- "a/hexstate_quantize.c"
+++ "b/hexstate_quantize.c"
@@ -1,5 +1,5 @@
/* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_quantize.c — HExState GGUF Quantizer
+ * hexstate_quantize.c — HexState GGUF Quantizer
*
* ╔═══════════════════════════════════════════════════════════════╗
* ║ HPC-Optimized GGUF Quantization Engine ║
@@ -15,32 +15,32 @@
* This tool adapts the HExState HPC Ouroboros factoring engine for
* LLM weight quantization. The core mathematical machinery is reused:
*
- * Factoring Domain → Quantization Domain
- * ─────────────────────────────────────────────────
- * HPCGraph + CZ edges → Block sensitivity graph
- * Complex Amplitude BP → Importance propagation
- * MCMC period sampler → Optimal scale search
- * try_period() validation → Error bound checking
- * LLL lattice reduction → (future) Adaptive bit allocation
+ * Factoring Domain → Quantization Domain
+ * ─────────────────────────────────────────────────
+ * HPCGraph + CZ edges → Block sensitivity graph
+ * Complex Amplitude BP → Importance propagation
+ * MCMC period sampler → Optimal scale search
+ * try_period() validation → Error bound checking
+ * LLL lattice reduction → (future) Adaptive bit allocation
*
* Additional techniques ported from llm-compressor:
- * MSE grid search → Optimal min/max range shrinking
- * Importance matrix (imatrix) → Per-channel error weighting
+ * MSE grid search → Optimal min/max range shrinking
+ * Importance matrix (imatrix) → Per-channel error weighting
*
* Build:
- * make -f Makefile.quantize
+ * make -f Makefile.quantize
*
* Usage:
- * ./hexstate_quantize [options]
+ * ./hexstate_quantize [options]
*
* Input can be:
- * - A single .safetensors file
- * - A model directory containing sharded .safetensors files
+ * - A single .safetensors file
+ * - A model directory containing sharded .safetensors files
*
* Options:
- * --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid)
- * --imatrix Importance matrix for weighted quantization
- * --verbose Per-block diagnostics
+ * --optimizer hpc|mse|hybrid Scale optimization strategy (default: hybrid)
+ * --imatrix Importance matrix for weighted quantization
+ * --verbose Per-block diagnostics
* ═══════════════════════════════════════════════════════════════════════════ */
#include
@@ -555,8 +555,8 @@ static void map_tensor_name(const char *hf_name, char *gguf_name, int buflen)
* SHOULD THIS TENSOR BE QUANTIZED?
*
* Decision rules:
- * - Quantize: weight matrices (2D, large)
- * - Keep F32: norms, biases, embeddings, 1D tensors
+ * - Quantize: weight matrices (2D, large)
+ * - Keep F32: norms, biases, embeddings, 1D tensors
* ═══════════════════════════════════════════════════════════════════════════ */
static int should_quantize(const STTensorInfo *ti, const char *gguf_name)
@@ -620,12 +620,12 @@ static int is_attention_tensor(const char *gguf_name)
* For Q2_K: 256-weight superblocks.
*
* The 6 values per site correspond to 6 candidate scale factors:
- * v=0: scale * 0.85 (aggressive, high compression)
- * v=1: scale * 0.90
- * v=2: scale * 0.95
- * v=3: scale * 1.00 (standard)
- * v=4: scale * 1.05
- * v=5: scale * 1.10 (conservative, less compression error)
+ * v=0: scale * 0.85 (aggressive, high compression)
+ * v=1: scale * 0.90
+ * v=2: scale * 0.95
+ * v=3: scale * 1.00 (standard)
+ * v=4: scale * 1.05
+ * v=5: scale * 1.10 (conservative, less compression error)
*
* BP propagates: "if your neighbor block is sensitive, you should be
* conservative too" — creating coherent precision allocation.
@@ -637,12 +637,12 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
};
/* ── Multi-quhit expanded scale table ──
- * Search grid: 10×10 = 100 (d, dmin) candidates
- * Quhit encoding: bin 10 → 6 for D=6 quhits (BP operates on 6-state marginals)
- * Beam search: operates on all 100 candidates directly */
+ * Search grid: 24×24 = 576 (d, dmin) candidates
+ * Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
+ * Beam search: operates on all 576 candidates directly */
#define QUHITS_PER_BLOCK 2
-#define N_CAND_D 16 /* d multiplier candidates (was 10) */
-#define N_CAND_M 16 /* dmin multiplier candidates (was 10) */
+#define N_CAND_D 24 /* d multiplier candidates (expanded) */
+#define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
@@ -650,7 +650,7 @@ static int scale_table_initialized = 0;
static void init_scale_table(void) {
if (scale_table_initialized) return;
- /* 100 candidates: uniform spacing centered on 1.0 */
+ /* TOTAL_SCALE_CANDIDATES entries, uniformly spaced over [0.50, 1.50] (centered on 1.0) */
for (int i = 0; i < TOTAL_SCALE_CANDIDATES; i++) {
SCALE_TABLE[i] = 0.50f + (float)i * (1.00f / (float)(TOTAL_SCALE_CANDIDATES - 1));
}
@@ -695,20 +695,15 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
/* ═══════════════════════════════════════════════════════════════════════════
* FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
*
- * powf() costs ~50-100 cycles. For norm=2.4: x^2.4 = x^2 × x^0.4
- * where x^0.4 = (x^2)^0.2 = (x^2)^(1/5). Use cbrtf approximation:
- * x^0.4 ≈ sqrtf(cbrtf(x^2 × x^2)) but simpler: x^2 × sqrtf(sqrtf(x))
- * is close enough for error norm purposes (~1% relative error).
+ * powf() costs ~50-100 cycles. Use log2f+exp2f (~25 cycles) for the
+ * exact x^2.4 = x^2 × 2^(0.4·log2(x)) computation instead.
* ═══════════════════════════════════════════════════════════════════════════ */
static inline float fast_pow_2_4(float x)
{
- /* x^2.4 = x^2 × x^0.4. For x^0.4: use x^(2/5) = sqrt(x^(4/5))
- * x^(4/5) = (x^4)^(1/5). Approximation via sqrtf chain:
- * x^0.4 ≈ sqrtf(sqrtf(x)) × x^(-0.1) — too complex.
- * Simpler: x^2.4 = (x^12)^(1/5) = fifth_root(x^12)
- * Best: just use x*x * sqrtf(cbrtf(x*x)) since cbrtf is fast (~15 cycles) */
+ /* |x|^2.4 = x^2 × 2^(0.4 × log2|x|). fabsf keeps negative inputs finite
+ * (log2f of a negative value is NaN), matching the sign-insensitive form this replaces. */
float x2 = x * x;
- return x2 * sqrtf(cbrtf(x2)); /* x^2 × (x^2)^(1/6) ≈ x^(2+1/3) ≈ x^2.333 */
+ return x2 * exp2f(0.4f * log2f(fabsf(x))); /* x^2 × |x|^0.4 = |x|^2.4; x == 0 yields 0 */
}
/* Compute the Q2_K sub-block reconstruction error for a block at a given
@@ -743,16 +738,16 @@ static float compute_block_error_q2k(const float *weights, int block_size,
}
/* Build multi-quhit HPC sensitivity graph.
- * 2 quhits per block → 36 scale candidates per block.
+ * 2 quhits per block → 576 scale candidates per block.
*
* Graph layout: sites [0..2*n-1] where:
- * site 2*i = coarse quhit for block i
- * site 2*i + 1 = fine quhit for block i
+ * site 2*i = coarse quhit for block i
+ * site 2*i + 1 = fine quhit for block i
*
* Edges:
- * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling
- * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor
- * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */
+ * Intra-block: CZ(2i, 2i+1) — coarse↔fine coupling
+ * Inter-block: CZ(2i, 2(i+1)) — coarse↔coarse neighbor
+ * CZ(2i+1, 2(i+1)+1) — fine↔fine neighbor */
static HPCGraph *build_sensitivity_graph(const float *weights,
int64_t n_elements,
int block_size,
@@ -774,13 +769,13 @@ static HPCGraph *build_sensitivity_graph(const float *weights,
for (int64_t i = 0; i < n_sites; i++)
triality_dft(&graph->locals[i]);
- /* Compute errors for all 36 scale candidates per block,
+ /* Compute errors for all candidates per block,
* then project onto coarse (quhit 0) and fine (quhit 1) marginals */
for (int64_t i = 0; i < graph_blocks; i++) {
int64_t block_idx = i * stride;
const float *block_weights = weights + block_idx * block_size;
- /* Evaluate all 36 candidates */
+ /* Evaluate all candidates */
float errors[TOTAL_SCALE_CANDIDATES];
float min_err = 1e30f;
for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
@@ -873,12 +868,12 @@ static HPCGraph *build_sensitivity_graph(const float *weights,
* For a Q2_K sub-block, progressively shrink the min/max range to find
* the candidate that minimizes weighted reconstruction error.
*
- * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
- * candidate_min = p * min
- * candidate_max = p * max
- * error = ||x - quantize(x, candidate_min, candidate_max)||^norm
- * if error < best: update best
- * else: patience--; if patience == 0: break
+ * for p in [1.0, 1.0 - 1/grid, 1.0 - 2/grid, ...] down to (1 - maxshrink):
+ * candidate_min = p * min
+ * candidate_max = p * max
+ * error = ||x - quantize(x, candidate_min, candidate_max)||^norm
+ * if error < best: update best
+ * else: patience--; if patience == 0: break
*
* This is a direct C port of llm-compressor's _grid_search_mse.
* ═══════════════════════════════════════════════════════════════════════════ */
@@ -977,7 +972,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
float cur_scale = best_scale;
if (cur_scale > 1e-15f) {
float iscale = 1.0f / cur_scale;
- for (int itry = 0; itry < 3; itry++) {
+ for (int itry = 0; itry < 5; itry++) {
float sumlx = 0;
int suml2 = 0;
for (int i = 0; i < n; i++) {
@@ -992,8 +987,9 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
float sum = 0;
for (int i = 0; i < n; i++)
sum += x[i] - cur_scale * L[i];
- cur_min = 0.7f * cur_min + 0.3f * sum / n;
- if (cur_min > 0) cur_min = 0;
+ /* True coordinate-descent optimal: min* = sum/n (no momentum).
+ * Clamp to ≤ 0 since min must be non-positive by convention. */
+ cur_min = fminf(0.0f, sum / n);
if (cur_scale > 1e-15f) iscale = 1.0f / cur_scale;
}
}
@@ -1006,12 +1002,12 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
* HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
*
* Two-phase approach:
- * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
- * This produces per-sub-block (scale, min) with 16-step search.
- * Phase B: HPC BP refines the superblock-level d/dmin rounding.
- * 6 candidate (d, dmin) pairs are tested; BP finds the one
- * where the GLOBAL reconstruction error is minimized via
- * constructive interference of per-sub-block phase coherence.
+ * Phase A: Per-sub-block weighted least-squares (ggml make_qkx2_quants)
+ * This produces per-sub-block (scale, min) with 16-step search.
+ * Phase B: HPC BP refines the superblock-level d/dmin rounding.
+ * 6 candidate (d, dmin) pairs are tested; BP finds the one
+ * where the GLOBAL reconstruction error is minimized via
+ * constructive interference of per-sub-block phase coherence.
* ═══════════════════════════════════════════════════════════════════════════ */
/* Weighted least-squares quantization for a sub-block (ggml make_qkx2_quants).
@@ -1174,22 +1170,22 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x,
* Instead of iterative message-passing (BP), this uses the EXACT sequential
* measurement protocol from Shor's algorithm:
*
- * For each block k (MSB → LSB):
- * 1. Compute feed-forward phase correction from previously measured blocks
- * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
- * 3. Bake C_k into locals: α(d) *= C_k(d)
- * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
- * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
- * 6. Born rule measurement → select optimal scale candidate
- * 7. Collapse site + absorb edge weights into neighbors (back-action)
+ * For each block k (MSB → LSB):
+ * 1. Compute feed-forward phase correction from previously measured blocks
+ * 2. Compute work factor: C_k(d) = Π_j Σ_w local_j(w) × edge(d,w)
+ * 3. Bake C_k into locals: α(d) *= C_k(d)
+ * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
+ * 5. Apply IDFT6 in-place: interference creates peaks at optimal scales
+ * 6. Born rule measurement → select optimal scale candidate
+ * 7. Collapse site + absorb edge weights into neighbors (back-action)
*
* This IS the quantum Fourier transform that creates constructive
* interference at the optimal RMSE configuration, exactly as Shor's
* algorithm creates interference at the correct period.
*
* Domain mapping:
- * Factoring: oracle phase 2π×d×c_k/N → period r
- * Quantize: error Boltzmann amplitudes → optimal RMSE block
+ * Factoring: oracle phase 2π×d×c_k/N → period r
+ * Quantize: error Boltzmann amplitudes → optimal RMSE block
* ═══════════════════════════════════════════════════════════════════════════ */
/* ω₆ roots of unity for CZ phase lookup */
@@ -1280,17 +1276,17 @@ static void shor_collapse_site(HPCGraph *graph, int target_site, int outcome)
* Ported 1:1 from tesseract_factor.c lines 2343-2500.
*
* Measures sites MSB→LSB. For each site k:
- * 1. Compute feed-forward phase correction θ_k from previously measured sites
- * 2. Compute neighbor contribution C_k(d) analytically
- * 3. Bake C_k into locals
- * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
- * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
- * 6. Compute |β(v)|² as measurement probabilities
- * 7. Sample/argmax → outcome
- * 8. Collapse + back-action via shor_collapse_site()
+ * 1. Compute feed-forward phase correction θ_k from previously measured sites
+ * 2. Compute neighbor contribution C_k(d) analytically
+ * 3. Bake C_k into locals
+ * 4. Apply phase correction: α(d) *= e^{-2πi d θ_k}
+ * 5. Apply IDFT6: β(v) = (1/√6) Σ_d α'(d) × e^{2πi dv/6}
+ * 6. Compute |β(v)|² as measurement probabilities
+ * 7. Sample/argmax → outcome
+ * 8. Collapse + back-action via shor_collapse_site()
*
* Returns: marginals are written into marg_out[n_sites][6].
- * measured_out[n_sites] receives the measurement outcomes.
+ * measured_out[n_sites] receives the measurement outcomes.
* ═══════════════════════════════════════════════════════════════════════════ */
static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
double (*marg_out)[6], int *measured_out,
@@ -1446,25 +1442,27 @@ static void shor_measure_graph(HPCGraph *graph, int64_t n_sites,
* HPC-OPTIMIZED Q4_0 QUANTIZATION (for attention tensors)
*
* Same architecture as Q2_K HPC pipeline, but simpler:
- * - One parameter per block (scale d only, no dmin)
- * - Single quhit per block (6 states)
- * - 10 candidate scales → bin to 6 for BP
- * - 12-beam Hensel search for globally optimal configuration
- * - Triality 3-view marginals for robust scoring
+ * - One parameter per block (scale d only, no dmin)
+ * - Single quhit per block (6 states)
+ * - 24 candidate scales → bin to 6 for BP
+ * - 48-beam Hensel search for globally optimal configuration
+ * - Triality 3-view marginals for robust scoring
*
* Q4_0 block: 32 weights, 16 levels (0–15), dequant: w = (q - 8) * d
* ═══════════════════════════════════════════════════════════════════════════ */
-#define Q4_N_CAND 16 /* scale candidates for Q4_0 (was 10) */
-#define Q4_N_BEAMS 24 /* beam width (was 12) */
+#define Q4_N_CAND 24 /* expanded scale candidates for Q4_0 */
+#define Q4_N_BEAMS 48 /* expanded beam width */
-/* Tight neighborhood around WLS optimum: ±10% */
+/* Neighborhood around WLS optimum: 0.85×–1.30×, densest near 1.0 */
static const float Q4_NEIGHBOR_MULTS[Q4_N_CAND] = {
- 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f, 0.975f, 0.985f,
- 0.995f, 1.005f, 1.015f, 1.025f, 1.035f, 1.050f, 1.070f, 1.100f
+ 0.850f, 0.880f, 0.900f, 0.915f, 0.930f, 0.945f, 0.955f, 0.965f,
+ 0.975f, 0.985f, 0.995f, 1.000f, 1.005f, 1.015f, 1.025f, 1.035f,
+ 1.050f, 1.070f, 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
};
static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
- 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
@@ -1474,6 +1472,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
int64_t n_blocks = n_elements / QK4_0;
float total_err = 0.0f;
+ /* ── Tensor RMS (w_sigma). NOTE(review): SA below seeds sa_temp from metric_cur — confirm w_sigma is actually consumed ── */
+ double t_sum_sq = 0.0;
+ for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
+ float w_sigma = sqrtf(t_sum_sq / n_elements);
+
/* ── Phase 1: Greedy seed — compute scale per block ── */
float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
@@ -1501,7 +1504,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
/* ── Step 2a: WLS solve to find optimal d* ── */
float wls_d = greedy_d[blk];
- for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
+ uint16_t prev_wls_d16 = 0;
+ for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
if (wls_d < 1e-15f) break;
float inv_d = 1.0f / wls_d;
float num = 0.0f, den = 0.0f;
@@ -1519,6 +1523,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
wls_d = gguf_fp16_to_fp32(gguf_fp32_to_fp16(d_new));
}
+ uint16_t cur_wls_d16 = gguf_fp32_to_fp16(wls_d);
+ if (cur_wls_d16 == prev_wls_d16) break; /* converged in FP16 */
+ prev_wls_d16 = cur_wls_d16;
}
/* ── Step 2b: Generate candidates centered on WLS optimum ── */
@@ -1529,36 +1536,30 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
cand_d16[blk][ci] = d16;
float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
- float err = 0.0f;
-
- for (int j = 0; j < QK4_0; j += 6) {
- int g_len = (j + 6 <= QK4_0) ? 6 : (QK4_0 - j);
- int half_g = g_len / 2;
- float e_cur[6], w_cur[6];
-
- for (int kk = 0; kk < g_len; kk++) {
- int idx = j + kk;
- float x = bw[idx];
- int q = (int)(x * id + 8.5f);
- if (q < 0) q = 0; if (q > 15) q = 15;
- float deq = ((float)q - 8.0f) * actual_d;
- e_cur[kk] = x - deq;
- w_cur[kk] = (imat_importance) ? imat_importance[blk * QK4_0 + idx] : 1.0f;
- }
-
- /* Decompose into vesica (DC) and wave (AC) components */
- float vesica_err = 0.0f, wave_err = 0.0f;
- for (int p = 0; p < half_g; p++) {
- float v = e_cur[p] + e_cur[p + half_g];
- float w_wave = e_cur[p] - e_cur[p + half_g];
- float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
- vesica_err += v * v * w_avg;
- wave_err += w_wave * w_wave * w_avg;
- }
- /* Triality weighting: penalize vesica 4×, wave 1×.
- * Factor of 0.5 keeps scale consistent with standard MSE. */
- err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
+
+ /* ── Single-unit D₆ error over all QK4_0 (32) elements ──
+ * Antipodal pairing: (j, j + QK4_0/2) for j in [0, QK4_0/2).
+ * Treating the whole block as one unit eliminates boundary
+ * artefacts from the old 6-element chunks and correctly captures
+ * long-range error correlations within the block. */
+ float e_all[QK4_0], w_all[QK4_0];
+ for (int j = 0; j < QK4_0; j++) {
+ float x = bw[j];
+ int q = (int)(x * id + 8.5f);
+ if (q < 0) q = 0; if (q > 15) q = 15;
+ float deq = ((float)q - 8.0f) * actual_d;
+ e_all[j] = x - deq;
+ w_all[j] = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
}
+ float vesica_err = 0.0f, wave_err = 0.0f;
+ for (int j = 0; j < QK4_0 / 2; j++) {
+ float v = e_all[j] + e_all[j + QK4_0 / 2];
+ float w_wave = e_all[j] - e_all[j + QK4_0 / 2];
+ float w_avg = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f;
+ vesica_err += v * v * w_avg;
+ wave_err += w_wave * w_wave * w_avg;
+ }
+ float err = 0.5f * (4.0f * vesica_err + wave_err);
cand_errors[blk][ci] = err;
}
}
@@ -1566,7 +1567,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
/* ── Phase 3: HPC graph — single quhit per block ── */
int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
for (int64_t i = 0; i < n_blocks; i++)
- best_candidate[i] = 10; /* Q4_NEIGHBOR_MULTS[10] = 1.00 */
+ best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
if (n_blocks >= 2) {
float temperature = 0.5f;
@@ -1752,7 +1753,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
group_cidx = history[curr_hist].cand_idx;
curr_hist = history[curr_hist].parent_idx;
} else {
- group_cidx = 10;
+ group_cidx = 11;
}
if (stride <= 1) {
@@ -1797,17 +1798,9 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
* The beam search found the MAP candidate sequence. But the
* triality marginals encode quantum phase-coherent structure
* that a greedy beam can miss.
- *
- * Like tesseract_factor's MCMC period recovery (lines 1920-1964):
- * 1. Take N independent Born samples from triality marginals
- * 2. Each sample → full candidate assignment across all blocks
- * 3. Evaluate actual RMSE for each assignment
- * 4. Keep assignment with lowest total RMSE
- *
- * Reuses the EXISTING converged Möbius sheet — zero new BP.
* ══════════════════════════════════════════════════════════════ */
{
- #define Q4_BORN_SHOTS 64
+ #define Q4_BORN_SHOTS 128
/* Compute beam-search baseline RMSE for comparison */
float beam_total_err = 0.0f;
@@ -1847,7 +1840,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
/* Find the best candidate WITHIN this quhit bin */
int64_t blk = gi * stride;
float best_bin_err = 1e30f;
- int best_bin_cand = 10; /* default */
+ int best_bin_cand = 11; /* default */
for (int ci = 0; ci < Q4_N_CAND; ci++) {
if (Q4_CAND_TO_QUHIT[ci] == sampled_qi) {
if (cand_errors[blk][ci] < best_bin_err) {
@@ -1872,6 +1865,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
free(shot_sparse_q4);
}
+ /* Born refinement pass: non-stride blocks were set during beam
+ * traceback and never revisited by Born shots. For each such block
+ * pick the lowest-error candidate within the same quhit bin that
+ * the winning Born shot chose for its stride-representative. */
+ if (stride > 1) {
+ for (int64_t b = 0; b < n_blocks; b++) {
+ if (b % stride == 0) continue;
+ int64_t rep = (b / stride) * stride;
+ int target_bin = Q4_CAND_TO_QUHIT[best_candidate[rep]];
+ float best_b_err = 1e30f;
+ int best_b_cand = best_candidate[rep];
+ for (int ci = 0; ci < Q4_N_CAND; ci++) {
+ if (Q4_CAND_TO_QUHIT[ci] != target_bin) continue;
+ if (cand_errors[b][ci] < best_b_err) {
+ best_b_err = cand_errors[b][ci];
+ best_b_cand = ci;
+ }
+ }
+ best_candidate[b] = best_b_cand;
+ }
+ }
+
free(marg);
hpc_destroy(graph);
}
@@ -1879,22 +1894,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
/* ══════════════════════════════════════════════════════════════════
* PHASE 4: Assemble blocks via least-squares scale extraction
- *
- * The factorer assembles a frequency register from BP marginals,
- * then EXTRACTS the exact period via continued fractions.
- *
- * We do the same: the beam search / Born shots selected a grid
- * candidate (the "assembled frequency"). Now we EXTRACT the exact
- * optimal FP16 scale via weighted least-squares (the "CF step").
- *
- * For Q4_0: d_optimal = Σ(w_j × x_j × q̃_j) / Σ(w_j × q̃_j²)
- * where q̃_j = (q_j - 8) and q_j is quantized at the grid scale.
- *
- * This iterates: quantize at d_init → compute d_optimal → re-quantize
- * → re-compute until convergence. 3 iterations suffice since Q4_0
- * has only 16 levels — the assignment stabilizes immediately.
- *
- * The grid gave us 16 possible scales. This gives us 65,536 (all FP16).
* ══════════════════════════════════════════════════════════════════ */
#pragma omp parallel for schedule(dynamic, 64) reduction(+:total_err)
@@ -1905,13 +1904,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
/* Start from the grid-selected scale (the "assembled frequency") */
float d_current = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
- /* Analog assembly: iterate to full convergence.
- * 5 iterations for stable (d, q-values) coupling. */
+ /* Analog assembly: iterate to full convergence. */
for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
if (d_current < 1e-15f) break;
float id = 1.0f / d_current;
- /* Quantize at current scale */
int qs_tmp[QK4_0];
for (int j = 0; j < QK4_0; j++) {
int q = (int)(bw[j] * id + 8.5f);
@@ -1919,8 +1916,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
qs_tmp[j] = q;
}
- /* Weighted least-squares: d = Σ(w × x × q̃) / Σ(w × q̃²)
- * where q̃ = q - 8 (centered quantized value) */
float num = 0.0f, den = 0.0f;
for (int j = 0; j < QK4_0; j++) {
float q_centered = (float)qs_tmp[j] - 8.0f;
@@ -1932,7 +1927,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
if (den > 1e-15f) {
float d_new = num / den;
- /* Clamp magnitude to prevent runaway (Q4_0 d can be negative) */
float d_seed = gguf_fp16_to_fp32(cand_d16[blk][cidx]);
if (fabsf(d_new) < 4.0f * (fabsf(d_seed) + 1e-10f)) {
uint16_t d16 = gguf_fp32_to_fp16(d_new);
@@ -1941,28 +1935,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
}
}
- /* ── FP16 ULP neighborhood search + sign-flip exploration ──
- * The WLS solve found the continuous-optimal d. But FP16 truncation
- * may shift the optimum. Try ±4 ULP around d in FP16 space, plus
- * the negated scale, and pick the one with minimum reconstruction error. */
+ /* ── FP16 ULP neighborhood search + sign-flip exploration ── */
{
uint16_t base_d16 = gguf_fp32_to_fp16(d_current);
uint16_t best_d16 = base_d16;
float best_ulp_err = 1e30f;
- /* Try ±4 ULP neighborhood + sign flip = up to 17 candidates */
- uint16_t ulp_candidates[17];
+ /* Try ±8 ULP neighborhood + sign flip = up to 34 candidates */
+ uint16_t ulp_candidates[35];
int n_ulp = 0;
- for (int delta = -4; delta <= 4; delta++) {
+ for (int delta = -8; delta <= 8; delta++) {
int cand16 = (int)base_d16 + delta;
- if (cand16 >= 0 && cand16 <= 0x7BFF) /* valid positive FP16 */
+ if (cand16 >= 0 && cand16 <= 0x7BFF)
ulp_candidates[n_ulp++] = (uint16_t)cand16;
}
- /* Sign-flipped d: negate and try ±0 ULP */
{
float neg_d = -d_current;
uint16_t neg_d16 = gguf_fp32_to_fp16(neg_d);
- ulp_candidates[n_ulp++] = neg_d16;
+ for (int delta = -8; delta <= 8; delta++) { /* ±8 ULP around the sign-flipped scale */
+ int cand16 = (int)neg_d16 + delta;
+ if (cand16 >= (neg_d16 & 0x8000) && cand16 <= ((neg_d16 & 0x8000) | 0x7BFF)) /* finite and same sign as neg_d16 (bit 15 = FP16 sign) */
+ ulp_candidates[n_ulp++] = (uint16_t)cand16;
+ }
}
for (int ui = 0; ui < n_ulp; ui++) {
@@ -1984,18 +1978,11 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
d_current = gguf_fp16_to_fp32(best_d16);
}
- /* Store the extracted optimal FP16 scale */
output[blk].d = gguf_fp32_to_fp16(d_current);
float actual_d = d_current;
float id = (fabsf(actual_d) > 1e-15f) ? 1.0f / actual_d : 0.0f;
- /* ── D₆ Hadamard Error Shaping for Q4_0 ──
- * 32 elements per block = 5 full D₆ groups of 6 + 2 tail.
- * Apply the same antipodal fold as Q2_K: minimize vesica energy
- * to push quantization noise into wave (high-frequency) modes
- * that cancel in dot products. */
-
- /* Step 1: Standard nearest-rounding as baseline */
+ /* ── D₆ Hadamard Error Shaping with Simulated Annealing ── */
int q_base[QK4_0], q_shaped[QK4_0];
float q_cont[QK4_0];
for (int j = 0; j < QK4_0; j++) {
@@ -2006,73 +1993,83 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
}
memcpy(q_shaped, q_base, QK4_0 * sizeof(int));
- /* Step 2: D₆ greedy flipping on 5 groups of 6 */
- for (int g = 0; g < 5; g++) {
- int g_off = g * 6;
-
- for (int pass = 0; pass < 6; pass++) {
- int best_k = -1;
- int best_q_alt = 0;
- float best_delta = 0.0f;
-
- /* Current group errors */
- float e_cur[6];
- for (int kk = 0; kk < 6; kk++) {
- float deq = ((float)q_shaped[g_off+kk] - 8.0f) * actual_d;
- e_cur[kk] = bw[g_off+kk] - deq;
- }
-
- /* Current D₆ metric: vesica energy + DC² */
- float vesica_cur = 0.0f, dc_cur = 0.0f;
- for (int p = 0; p < 3; p++) {
- float v = e_cur[p] + e_cur[p+3];
- vesica_cur += v * v;
- }
- for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
- float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
-
- /* Try flipping each element */
- for (int k = 0; k < 6; k++) {
- int idx = g_off + k;
- int q_cur = q_shaped[idx];
+ {
+ float e_live[QK4_0];
+ for (int j = 0; j < QK4_0; j++) {
+ float deq = ((float)q_shaped[j] - 8.0f) * actual_d;
+ e_live[j] = bw[j] - deq;
+ }
- int q_try;
- if (q_cont[idx] - (float)q_cur >= 0) {
- q_try = q_cur + 1;
- } else {
- q_try = q_cur - 1;
- }
+ float v_live[QK4_0 / 2];
+ float vesica_cur = 0.0f, dc_cur = 0.0f;
+ for (int j = 0; j < QK4_0 / 2; j++) {
+ v_live[j] = e_live[j] + e_live[j + QK4_0 / 2];
+ vesica_cur += v_live[j] * v_live[j];
+ }
+ for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
+ float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+
+ /* Simulated Annealing parameters */
+ float sa_temp = metric_cur * 0.05f;
+ float sa_decay = 0.90f;
+
+ for (int pass = 0; pass < QK4_0; pass++) {
+ int best_k = -1;
+ int best_q_alt = 0;
+ float best_delta = -1e30f;
+
+ for (int k = 0; k < QK4_0; k++) {
+ int q_cur = q_shaped[k];
+ int q_try = (q_cont[k] - (float)q_cur >= 0.0f)
+ ? q_cur + 1 : q_cur - 1;
if (q_try < 0 || q_try > 15) continue;
- /* Alt errors */
- float e_alt[6];
- for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
float deq_try = ((float)q_try - 8.0f) * actual_d;
- e_alt[k] = bw[idx] - deq_try;
+ float e_new = bw[k] - deq_try;
+ float de = e_new - e_live[k];
- /* Alt D₆ metric */
- float vesica_alt = 0.0f, dc_alt = 0.0f;
- for (int p = 0; p < 3; p++) {
- float v = e_alt[p] + e_alt[p+3];
- vesica_alt += v * v;
- }
- for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
+ int pi = (k < QK4_0 / 2) ? k : k - QK4_0 / 2;
+ float v_old = v_live[pi];
+ float v_new = v_old + de;
+
+ float vesica_alt = vesica_cur - v_old * v_old + v_new * v_new;
+ float dc_alt = dc_cur + de;
float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;
float delta = metric_cur - metric_alt;
if (delta > best_delta) {
best_delta = delta;
- best_k = k;
+ best_k = k;
best_q_alt = q_try;
}
}
if (best_k < 0) break;
- q_shaped[g_off + best_k] = best_q_alt;
+
+ /* SA acceptance: improving flips always; worsening ones with probability exp(delta/T).
+ * The uniform draw hashes (blk, pass) instead of calling rand(): this code runs under
+ * an OpenMP parallel for, where rand()'s shared hidden state is unsafe and irreproducible. */
+ uint32_t sa_bits = ((uint32_t)blk * 2654435761u) ^ (((uint32_t)pass + 1u) * 0x9E3779B9u);
+ sa_bits ^= sa_bits >> 15; sa_bits *= 0x2C1B3C6Du; sa_bits ^= sa_bits >> 12;
+ float sa_u = (float)(sa_bits >> 8) * (1.0f / 16777216.0f); /* uniform in [0, 1) */
+ if (best_delta > 0.0f || (sa_temp > 1e-7f && expf(best_delta / sa_temp) > sa_u)) {
+ q_shaped[best_k] = best_q_alt;
+ float e_new_commit = bw[best_k] - ((float)best_q_alt - 8.0f) * actual_d;
+ float de_commit = e_new_commit - e_live[best_k];
+ int pi_commit = (best_k < QK4_0 / 2) ? best_k : best_k - QK4_0 / 2; /* antipodal pair index */
+ float v_old_commit = v_live[pi_commit], v_new_commit = v_old_commit + de_commit;
+ vesica_cur += v_new_commit * v_new_commit - v_old_commit * v_old_commit;
+ dc_cur += de_commit;
+ metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+ v_live[pi_commit] = v_new_commit;
+ e_live[best_k] = e_new_commit;
+ } else {
+ if (sa_temp < 1e-7f) break;
+ }
+ sa_temp *= sa_decay;
}
}
- /* Step 3: Error comparison — keep shaped only if MSE doesn't worsen >5% */
float err_base = 0.0f, err_shaped = 0.0f;
for (int j = 0; j < QK4_0; j++) {
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
@@ -2081,9 +2078,8 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w;
err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w;
}
- int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base;
+ int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
- /* Pack nibbles and compute error */
for (int j = 0; j < QK4_0 / 2; j++) {
int q0 = q_final[j];
int q1 = q_final[j + QK4_0/2];
@@ -2114,15 +2110,22 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
init_scale_table();
+ /* ── Outlier Clamping for WLS Seeds ──
+ * Clamp Phase 1 seed inputs to ±3.5 × w_sigma, where w_sigma is the RMS of the
+ * whole tensor, so extreme outliers cannot skew the greedy seed for the grid search. */
+ double t_sum_sq = 0.0;
+ for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
+ float w_sigma = sqrtf(t_sum_sq / n_elements);
+ float clamp_val = w_sigma * 3.5f;
+
/* ══════════════════════════════════════════════════════════════════
* PHASE 1: Greedy quantization — produce seed (d, dmin) per block
* ══════════════════════════════════════════════════════════════════ */
- /* Store Phase A/B results for all blocks */
typedef struct {
- float dm, mm; /* greedy d, dmin (fp32) */
- uint16_t d_fp16, dmin_fp16; /* greedy d, dmin (fp16) */
- uint8_t Ls[16], Lm[16]; /* sub-block scale/min indices */
+ float dm, mm;
+ uint16_t d_fp16, dmin_fp16;
+ uint8_t Ls[16], Lm[16];
float scales[16], mins[16], sw[16];
} BlockSeed;
@@ -2138,15 +2141,21 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
for (int i = 0; i < QK_K; i++) sumx2 += block_x[i] * block_x[i];
float sigma2 = sumx2 / (float)QK_K;
+ /* Phase 1 WLS uses clamped values to generate stable seeds */
+ float sx_clipped[16];
for (int j = 0; j < N_SUB; j++) {
const float *sx = block_x + 16 * j;
seeds[blk].sw[j] = 0;
for (int l = 0; l < 16; l++) {
float imp = (imat_importance) ? imat_importance[blk * QK_K + 16 * j + l] : 1.0f;
- wt[l] = imp * sqrtf(sigma2 + sx[l] * sx[l]);
+ float v = sx[l];
+ if (v > clamp_val) v = clamp_val;
+ if (v < -clamp_val) v = -clamp_val;
+ sx_clipped[l] = v;
+ wt[l] = imp * sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
seeds[blk].sw[j] += wt[l];
}
- seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx, wt,
+ seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
L + 16 * j, &seeds[blk].mins[j], Laux);
}
@@ -2160,36 +2169,30 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
/* ══════════════════════════════════════════════════════════════════
* PHASE 2: WLS-Optimal Candidate Generation
- *
- * Instead of a fixed multiplier grid centered on greedy seeds,
- * we first solve a 3-iteration Weighted Least-Squares to find
- * the true optimal (d*, dmin*) per block, then generate the
- * 16×16 candidate grid centered on THOSE optimal values.
- * This makes the candidate space data-driven, not fabricated.
* ══════════════════════════════════════════════════════════════════ */
- /* Wide neighborhood around WLS optimum: ±20% with asymmetric spacing
- * — finer near 1.0 for precision, wider at edges for exploration.
- * Critical for large-σ weights where the optimal (d,dmin) may be
- * far from the WLS seed. */
+ /* Expanded neighborhood around WLS optimum: -25%/+30% (0.75x to 1.30x) with 24 candidates */
static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
- 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
- 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
+ 0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
+ 0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
+ 1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
+ 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
};
static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
- 0.800f, 0.850f, 0.890f, 0.920f, 0.945f, 0.965f, 0.980f, 0.990f,
- 1.010f, 1.020f, 1.035f, 1.055f, 1.080f, 1.110f, 1.150f, 1.200f
+ 0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
+ 0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
+ 1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
+ 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
};
- /* Map 16 candidates → 6 quhit states for BP encoding */
- static const int CAND_TO_QUHIT[16] = {
- 0, 0, 0, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5
+ /* Map 24 candidates → 6 quhit states for BP encoding */
+ static const int CAND_TO_QUHIT[24] = {
+ 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
};
- /* candidate_errors[blk][256] — weighted MSE per candidate */
float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
- /* Per-candidate Ls/Lm — must recompute for each (d, dmin) */
uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;
@@ -2208,18 +2211,23 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
for (int64_t blk = 0; blk < n_blocks; blk++) {
const float *block_x = weights + blk * QK_K;
- /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ──
- * Seed from Phase 1 greedy, iterate 3× to converge.
- * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
- * This is a 2-variable WLS: minimize Σ w×(x - d×a + dmin×b)² */
+ /* ── Step 2a: WLS solve to find optimal (d*, dmin*) ── */
float wls_dm = seeds[blk].dm;
float wls_mm = seeds[blk].mm;
uint8_t wls_Ls[16], wls_Lm[16];
memcpy(wls_Ls, seeds[blk].Ls, 16);
memcpy(wls_Lm, seeds[blk].Lm, 16);
+ /* Generate hard-clamped (to ±clamp_val) buffer for WLS internal stability */
+ float clipped_block_x[QK_K];
+ for(int i=0; i<QK_K; i++) {
+ float v = block_x[i];
+ if (v > clamp_val) v = clamp_val;
+ if (v < -clamp_val) v = -clamp_val;
+ clipped_block_x[i] = v;
+ }
+
for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
- /* Quantize all elements at current (wls_dm, wls_mm) */
uint8_t L_wls[QK_K];
for (int j = 0; j < N_SUB; j++) {
float d_sub = wls_dm * (float)wls_Ls[j];
@@ -2229,19 +2237,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
continue;
}
for (int k = 0; k < 16; k++) {
- int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
+ int q = gguf_nearest_int((clipped_block_x[16*j+k] + m_sub) / d_sub);
if (q < 0) q = 0; if (q > 3) q = 3;
L_wls[16*j+k] = (uint8_t)q;
}
}
- /* Accumulate 2×2 normal equations */
double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
for (int j = 0; j < N_SUB; j++) {
float ls_f = (float)wls_Ls[j];
float lm_f = (float)wls_Lm[j];
for (int k = 0; k < 16; k++) {
- float x = block_x[16*j+k];
+ float x = clipped_block_x[16*j+k];
float w = (imat_importance) ?
imat_importance[blk * QK_K + 16*j+k] : 1.0f;
float a = ls_f * (float)L_wls[16*j+k];
@@ -2254,19 +2261,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
}
- /* Solve via Cramer's rule */
double det = Saa * Sbb - Sab * Sab;
if (fabs(det) > 1e-30) {
double d_new = (Sbb * Sxa - Sab * Sxb) / det;
double dm_new = (Sab * Sxa - Saa * Sxb) / det;
- /* Clamp: positive and within 4× of seed (prevent runaway) */
if (d_new > 0.0 && d_new < 4.0 * (seeds[blk].dm + 1e-10))
wls_dm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_new));
if (dm_new > 0.0 && dm_new < 4.0 * (seeds[blk].mm + 1e-10))
wls_mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
}
- /* Re-derive Ls/Lm for updated (d*, dmin*) */
for (int j = 0; j < N_SUB; j++) {
if (wls_dm > 1e-15f) {
int ls = gguf_nearest_int(seeds[blk].scales[j] / wls_dm);
@@ -2281,9 +2285,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
}
- /* ── Step 2b: Generate 16×16 candidates centered on WLS optimum ──
- * Grid is now centered on (wls_dm, wls_mm) not (greedy_dm, greedy_mm).
- * Tighter spacing because we're already near the true minimum. */
+ /* ── Step 2b: Generate Candidates ── */
for (int di = 0; di < N_CAND_D; di++) {
float trial_dm = wls_dm * NEIGHBOR_MULTS_D[di];
uint16_t trial_d16 = gguf_fp32_to_fp16(trial_dm);
@@ -2298,87 +2300,58 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
candidate_d[blk][cidx] = trial_d16;
candidate_dmin[blk][cidx] = trial_dmin16;
- /* Recompute Ls/Lm for THIS candidate dm/mm */
uint8_t trial_Ls[16], trial_Lm[16];
for (int j = 0; j < N_SUB; j++) {
if (actual_dm > 1e-15f) {
int ls = gguf_nearest_int(seeds[blk].scales[j] / actual_dm);
if (ls < 0) ls = 0; if (ls > 15) ls = 15;
trial_Ls[j] = (uint8_t)ls;
- } else {
- trial_Ls[j] = 0;
- }
+ } else { trial_Ls[j] = 0; }
if (actual_mm > 1e-15f) {
int lm = gguf_nearest_int(seeds[blk].mins[j] / actual_mm);
if (lm < 0) lm = 0; if (lm > 15) lm = 15;
trial_Lm[j] = (uint8_t)lm;
- } else {
- trial_Lm[j] = 0;
- }
+ } else { trial_Lm[j] = 0; }
}
memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);
- /* Fully re-quantize and measure weighted MSE */
- float err = 0.0f;
- for (int j = 0; j < N_SUB; j++) {
- float d = actual_dm * (float)trial_Ls[j];
- float m = actual_mm * (float)trial_Lm[j];
+ /* Error evaluation MUST use the non-clipped original weights */
+ float e_all[QK_K], w_all[QK_K];
+ for (int i = 0; i < QK_K; i++) {
+ int jj = i >> 4;
+ float d = actual_dm * (float)trial_Ls[jj];
+ float m = actual_mm * (float)trial_Lm[jj];
+ float x = block_x[i];
+ w_all[i] = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
if (d < 1e-15f) {
- for (int k = 0; k < 16; k++) {
- float x = block_x[16 * j + k];
- float w = (imat_importance) ?
- imat_importance[blk * QK_K + 16 * j + k] : 1.0f;
- err += x * x * w;
- }
- continue;
- }
- for (int k = 0; k < 16; k += 6) {
- int g_len = (k + 6 <= 16) ? 6 : (16 - k);
- int half_g = g_len / 2;
- float e_cur[6], w_cur[6];
-
- for (int kk = 0; kk < g_len; kk++) {
- int idx = 16 * j + k + kk;
- float x = block_x[idx];
- int q = gguf_nearest_int((x + m) / d);
- if (q < 0) q = 0; if (q > 3) q = 3;
- float deq = d * (float)q - m;
- e_cur[kk] = x - deq;
- w_cur[kk] = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
- }
-
- /* Decompose into vesica and wave */
- float vesica_err = 0.0f, wave_err = 0.0f;
- for (int p = 0; p < half_g; p++) {
- float v = e_cur[p] + e_cur[p + half_g];
- float w_wave = e_cur[p] - e_cur[p + half_g];
- float w_avg = (w_cur[p] + w_cur[p + half_g]) * 0.5f;
- vesica_err += v * v * w_avg;
- wave_err += w_wave * w_wave * w_avg;
- }
- /* Triality weighting: penalize vesica 4×, wave 1× */
- err += 0.5f * (4.0f * vesica_err + 1.0f * wave_err);
+ e_all[i] = x;
+ } else {
+ int q = gguf_nearest_int((x + m) / d);
+ if (q < 0) q = 0; if (q > 3) q = 3;
+ e_all[i] = x - (d * (float)q - m);
}
}
- candidate_errors[blk][cidx] = err;
+ float vesica_err = 0.0f, wave_err = 0.0f;
+ for (int i = 0; i < QK_K / 2; i++) {
+ float v = e_all[i] + e_all[i + QK_K / 2];
+ float w_wave = e_all[i] - e_all[i + QK_K / 2];
+ float w_avg = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f;
+ vesica_err += v * v * w_avg;
+ wave_err += w_wave * w_wave * w_avg;
+ }
+ candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err);
}
}
}
/* ══════════════════════════════════════════════════════════════════
* PHASE 3: HPC Graph — Shor's Griffiths-Niu Measurement
- *
- * Build a multi-quhit graph where each block has 2 quhits
- * encoding the 36 candidate errors. Shor's sequential measurement
- * (IDFT6 + feed-forward + collapse/back-action) extracts exact
- * marginals for optimal (d, dmin) per block — replaces BP.
* ══════════════════════════════════════════════════════════════════ */
- /* Default: use greedy candidate (index 5*10+5 = 55, mult 1.00×1.00) */
int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
for (int64_t i = 0; i < n_blocks; i++)
- best_candidate[i] = 10 * N_CAND_M + 10; /* NEIGHBOR_MULTS_D[10]=1.00, _M[10]=1.00 */
+ best_candidate[i] = 11 * N_CAND_M + 11; /* index 11 = 1.0 multiplier */
if (opt_mode != OPT_MSE && n_blocks >= 2) {
int64_t graph_blocks = (n_blocks > 2000) ? 2000 : n_blocks;
@@ -2391,14 +2364,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
for (int64_t i = 0; i < n_sites; i++)
triality_dft(&graph->locals[i]);
- /* Encode each stride group's AGGREGATED candidate errors as dual-quhit
- * amplitudes. For stride > 1, average errors across ALL blocks in
- * the group — not just the first block. This is critical for large
- * tensors where stride=97 means 96/97 blocks were being ignored. */
-
- /* Compute adaptive temperature from median error spread.
- * This ensures the Boltzmann encoding produces meaningful distributions
- * regardless of weight magnitude (σ=0.0003 vs σ=0.024). */
{
double err_accum = 0.0;
int err_count = 0;
@@ -2413,18 +2378,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
if (err_count > 0) {
float median_err = (float)(err_accum / err_count);
- /* Temperature = 10% of median max error — sharp enough to
- * discriminate, soft enough for Shor interference */
temperature = median_err * 0.1f;
if (temperature < 1e-10f) temperature = 1e-10f;
}
}
for (int64_t i = 0; i < graph_blocks; i++) {
- /* Aggregate errors across entire stride group */
float agg_errors[TOTAL_SCALE_CANDIDATES];
- for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
- agg_errors[c] = 0.0f;
+ for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) agg_errors[c] = 0.0f;
int64_t blk_start = i * stride;
int64_t blk_end = blk_start + stride;
@@ -2435,7 +2396,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
agg_errors[c] += candidate_errors[b][c];
}
- /* Average across group */
if (group_size > 1) {
float inv_gs = 1.0f / (float)group_size;
for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
@@ -2447,7 +2407,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
if (agg_errors[c] < min_err)
min_err = agg_errors[c];
- /* Quhit 0 (coarse = d dimension): marginalize over dmin */
double coarse_re[6];
double coarse_norm = 0.0;
for (int qi = 0; qi < 6; qi++) coarse_re[qi] = 0.0;
@@ -2459,14 +2418,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
(2.0 * (double)temperature));
}
}
- for (int qi = 0; qi < 6; qi++)
- coarse_norm += coarse_re[qi] * coarse_re[qi];
+ for (int qi = 0; qi < 6; qi++) coarse_norm += coarse_re[qi] * coarse_re[qi];
if (coarse_norm > 1e-30) {
double inv = 1.0 / sqrt(coarse_norm);
for (int v = 0; v < 6; v++) coarse_re[v] *= inv;
}
- /* Quhit 1 (fine = dmin dimension): marginalize over d */
double fine_re[6];
double fine_norm = 0.0;
for (int qi = 0; qi < 6; qi++) fine_re[qi] = 0.0;
@@ -2478,14 +2435,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
(2.0 * (double)temperature));
}
}
- for (int qi = 0; qi < 6; qi++)
- fine_norm += fine_re[qi] * fine_re[qi];
+ for (int qi = 0; qi < 6; qi++) fine_norm += fine_re[qi] * fine_re[qi];
if (fine_norm > 1e-30) {
double inv = 1.0 / sqrt(fine_norm);
for (int v = 0; v < 6; v++) fine_re[v] *= inv;
}
- /* Write quhits */
int64_t s0 = 2 * i, s1 = 2 * i + 1;
for (int v = 0; v < 6; v++) {
graph->locals[s0].edge_re[v] = coarse_re[v];
@@ -2503,31 +2458,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
triality_update_mask(&graph->locals[s1]);
}
- /* Build edges */
for (int64_t i = 0; i < graph_blocks; i++) {
- hpc_cz(graph, 2 * i, 2 * i + 1); /* intra-block: d ↔ dmin */
+ hpc_cz(graph, 2 * i, 2 * i + 1);
if (i + 1 < graph_blocks) {
- hpc_cz(graph, 2 * i, 2 * (i + 1)); /* d ↔ d neighbor */
- hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1); /* dmin ↔ dmin */
+ hpc_cz(graph, 2 * i, 2 * (i + 1));
+ hpc_cz(graph, 2 * i + 1, 2 * (i + 1) + 1);
}
}
- /* ── Shor's Griffiths-Niu Sequential Measurement (dual quhit) ──
- * Replaces BP with exact marginals via IDFT6 + feed-forward +
- * collapse/back-action (ported 1:1 from tesseract_factor.c).
- *
- * The dual-quhit graph has 2×graph_blocks sites:
- * Even sites (s0 = 2*i): coarse (d dimension)
- * Odd sites (s1 = 2*i+1): fine (dmin dimension)
- *
- * Single-pass sequential measurement produces exact marginals
- * for both dimensions simultaneously through the CZ correlations. */
double (*shor_marg)[6] = (double (*)[6])calloc(n_sites, sizeof(double[6]));
int *shor_measured = (int *)calloc(n_sites, sizeof(int));
shor_measure_graph(graph, n_sites, shor_marg, shor_measured, 1);
- /* Extract coarse (d) and fine (dmin) marginals from Shor output */
double (*coarse_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
double (*fine_marg)[6] = (double (*)[6])calloc(graph_blocks, sizeof(double[6]));
@@ -2541,277 +2484,233 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
free(shor_marg);
free(shor_measured);
- /* ══ Hensel-Inspired Beam Search Constraint Propagation ══
- * Like tesseract_factor's Hensel lift: process blocks sequentially,
- * maintain K best configurations, prune by accumulated error.
+ /* ══════════════════════════════════════════════════════════════
+ * PHASE 3 — DETERMINISTIC VITERBI DP
*
- * The constraint: blocks are selected JOINTLY. */
-
- #define N_BEAMS 24 /* K beams — widened for 31B (was 12) */
-
- typedef struct {
- double acc_error;
- int history_idx; /* index into the backpointer array */
- } QuantBeam;
-
- typedef struct {
- int cand_idx;
- int parent_idx;
- } BeamHistory;
-
- QuantBeam beams[N_BEAMS];
- int active_beams = 1;
-
- /* Pre-allocate history to avoid O(N^2) memory copies */
- BeamHistory *history = (BeamHistory *)malloc(n_blocks * N_BEAMS * sizeof(BeamHistory));
-
- for (int b = 0; b < N_BEAMS; b++) {
- beams[b].acc_error = 0.0;
- beams[b].history_idx = -1;
- }
+ * Replaces the probabilistic beam-search + Born-rule Monte-Carlo
+ * shots with an exact, fully-deterministic DP over the 36-state
+ * Shor quhit space (6 coarse bins × 6 fine bins).
+ *
+ * For each graph block i and combined state s = qi_d*6 + qi_m:
+ *
+ * bin_best_err[i][s] = min candidate error in that (d,m)-bin
+ * aggregated over the stride group
+ * bin_log_prior[i][s] = log P_coarse(qi_d) + log P_fine(qi_m)
+ * from Shor marginals → HPC prior bonus
+ *
+ * Local Viterbi cost (lower = better):
+ * vcost[i][s] = bin_best_err[i][s]
+ * − VITERBI_BETA × scale_err × bin_log_prior[i][s]
+ *
+ * Transition cost (cross-block smoothness prior):
+ * trans(s′→s) = VITERBI_ALPHA × scale_err
+ * × (|qi_d − qi_d′| + |qi_m − qi_m′|)
+ *
+ * DP recurrence:
+ * dp[0][s] = vcost[0][s]
+ * dp[i][s] = vcost[i][s] + min_{s′}(dp[i-1][s′] + trans(s′→s))
+ *
+ * Traceback yields the globally optimal sequence of bin choices,
+ * which is then mapped to per-block best_candidate[] indices.
+ * A 5%-threshold greedy override rescues blocks where the local
+ * MSE-optimal candidate is meaningfully better than the bin winner.
+ * ══════════════════════════════════════════════════════════════ */
- /* Process blocks sequentially with beam search */
- for (int64_t i = 0; i < graph_blocks; i++) {
- double c_total = 0.0, f_total = 0.0;
- for (int v = 0; v < 6; v++) {
- c_total += coarse_marg[i][v];
- f_total += fine_marg[i][v];
- }
+ #define VIT_N_STATES 36 /* 6 coarse × 6 fine quhit bins */
+ #define VITERBI_BETA 0.25f /* log-prior bonus weight */
+ #define VITERBI_ALPHA 0.08f /* cross-block smoothness penalty weight */
- /* Candidate scores for this block: triality prob × (1/normalized_error) */
- double cand_score[TOTAL_SCALE_CANDIDATES];
- int64_t blk = i * stride;
- int d_bin_count[6] = {0}, m_bin_count[6] = {0};
- for (int k = 0; k < N_CAND_D; k++) d_bin_count[CAND_TO_QUHIT[k]]++;
- for (int k = 0; k < N_CAND_M; k++) m_bin_count[CAND_TO_QUHIT[k]]++;
- /* Per-block error normalization: divide by block mean error
- * so small-weight blocks don't dominate beam selection */
- float blk_mean_err = 0.0f;
- for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++)
- blk_mean_err += candidate_errors[blk][c];
- blk_mean_err /= (float)TOTAL_SCALE_CANDIDATES;
- if (blk_mean_err < 1e-30f) blk_mean_err = 1e-30f;
- for (int di = 0; di < N_CAND_D; di++) {
- int qi_d = CAND_TO_QUHIT[di];
- double p_d = (c_total > 1e-30) ? coarse_marg[i][qi_d] / c_total : 1.0/6.0;
- p_d /= (double)d_bin_count[qi_d];
- for (int mi = 0; mi < N_CAND_M; mi++) {
- int qi_m = CAND_TO_QUHIT[mi];
- double p_m = (f_total > 1e-30) ? fine_marg[i][qi_m] / f_total : 1.0/6.0;
- p_m /= (double)m_bin_count[qi_m];
- int cidx = di * N_CAND_M + mi;
- cand_score[cidx] = p_d * p_m / (candidate_errors[blk][cidx] / blk_mean_err + 1e-15);
+ {
+ int64_t vit_gi, vit_b;
+ int vit_s, vit_sp;
+
+ /* Per-graph-block per-state workspace */
+ float (*vit_bin_err )[VIT_N_STATES] =
+ (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+ int (*vit_bin_cand)[VIT_N_STATES] =
+ (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int [VIT_N_STATES]));
+ float (*vit_log_pri )[VIT_N_STATES] =
+ (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+ float (*vit_dp )[VIT_N_STATES] =
+ (float (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(float[VIT_N_STATES]));
+ int (*vit_back )[VIT_N_STATES] =
+ (int (*)[VIT_N_STATES])malloc(graph_blocks * sizeof(int [VIT_N_STATES]));
+
+ /* ── Step A: build per-block per-bin statistics ── */
+ for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+ double c_tot = 0.0, f_tot = 0.0;
+
+ for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+ vit_bin_err [vit_gi][vit_s] = 1e30f;
+ vit_bin_cand[vit_gi][vit_s] = -1;
}
- }
- /* Extend beams × 36 candidates, keep top K */
- typedef struct { double score; int beam_idx; int cand_idx; } BeamExt;
- BeamExt extensions[N_BEAMS * TOTAL_SCALE_CANDIDATES];
- int n_ext = 0;
+ /* Best candidate per (qi_d, qi_m) bin over stride group */
+ for (vit_b = vit_gi * stride;
+ vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
+ vit_b++) {
+ int vit_c;
+ for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+ int qi_d = CAND_TO_QUHIT[vit_c / N_CAND_M];
+ int qi_m = CAND_TO_QUHIT[vit_c % N_CAND_M];
+ vit_s = qi_d * 6 + qi_m;
+ float e = candidate_errors[vit_b][vit_c];
+ if (e < vit_bin_err[vit_gi][vit_s]) {
+ vit_bin_err[vit_gi][vit_s] = e;
+ /* Canonical candidate = stride-rep block's best */
+ if (vit_b == vit_gi * stride)
+ vit_bin_cand[vit_gi][vit_s] = vit_c;
+ }
+ }
+ }
- for (int b = 0; b < active_beams; b++) {
- for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
- /* Score = -(accumulated_error + this_block_error) × triality_prob */
- double ext_err = beams[b].acc_error + candidate_errors[blk][c];
- double ext_score = cand_score[c] / (ext_err + 1e-15);
- extensions[n_ext].score = ext_score;
- extensions[n_ext].beam_idx = b;
- extensions[n_ext].cand_idx = c;
- n_ext++;
+ /* HPC log-prior from Shor marginals */
+ for (int v = 0; v < 6; v++) {
+ c_tot += coarse_marg[vit_gi][v];
+ f_tot += fine_marg [vit_gi][v];
+ }
+ for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+ int qi_d = vit_s / 6, qi_m = vit_s % 6;
+ double pc = (c_tot > 1e-30)
+ ? coarse_marg[vit_gi][qi_d] / c_tot : 1.0/6.0;
+ double pf = (f_tot > 1e-30)
+ ? fine_marg [vit_gi][qi_m] / f_tot : 1.0/6.0;
+ vit_log_pri[vit_gi][vit_s] =
+ (float)(log(pc + 1e-30) + log(pf + 1e-30));
}
}
- /* Top-K selection */
- int top_k = (n_ext < N_BEAMS) ? n_ext : N_BEAMS;
- int top_indices[N_BEAMS];
- for (int k = 0; k < top_k; k++) {
- int best = -1;
- double best_s = -1e30;
- for (int e = 0; e < n_ext; e++) {
- if (extensions[e].score > best_s) {
- best_s = extensions[e].score;
- best = e;
+ /* ── Step B: scale_err normaliser for transition cost ── */
+ float vit_scale_err = 0.0f;
+ int vit_scale_cnt = 0;
+ for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+ for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+ if (vit_bin_err[vit_gi][vit_s] < 1e29f) {
+ vit_scale_err += vit_bin_err[vit_gi][vit_s];
+ vit_scale_cnt++;
}
}
- top_indices[k] = best;
- extensions[best].score = -2e30; /* poison */
}
-
- /* Build new beams from top-K extensions using backpointers */
- QuantBeam new_beams[N_BEAMS];
- for (int k = 0; k < top_k; k++) {
- int ext_idx = top_indices[k];
- int src_beam = extensions[ext_idx].beam_idx;
- int cand = extensions[ext_idx].cand_idx;
-
- int hist_idx = i * N_BEAMS + k;
- history[hist_idx].cand_idx = cand;
- history[hist_idx].parent_idx = beams[src_beam].history_idx;
-
- new_beams[k].history_idx = hist_idx;
- new_beams[k].acc_error = beams[src_beam].acc_error
- + candidate_errors[blk][cand];
+ vit_scale_err = (vit_scale_cnt > 0)
+ ? vit_scale_err / (float)vit_scale_cnt : 1e-10f;
+ if (vit_scale_err < 1e-20f) vit_scale_err = 1e-20f;
+
+ /* ── Step C: Forward Viterbi pass ── */
+
+ /* Block 0 — no predecessor */
+ for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+ float local = (vit_bin_err[0][vit_s] < 1e29f)
+ ? vit_bin_err[0][vit_s]
+ - VITERBI_BETA * vit_scale_err * vit_log_pri[0][vit_s]
+ : 1e30f;
+ vit_dp [0][vit_s] = local;
+ vit_back[0][vit_s] = -1;
}
- for (int k = 0; k < top_k; k++)
- beams[k] = new_beams[k];
- active_beams = top_k;
- }
-
- /* Trace back the best beam's selections.
- * The beam search selects one candidate per GRAPH NODE (stride group).
- * For stride > 1, each block within the stride group independently
- * picks its own best candidate — using the beam's coarse/fine quhit
- * bins as a constraint, but evaluating its own candidate_errors.
- * This eliminates stride-aliasing: previously 96/97 blocks were
- * forced to use a candidate chosen for 1 representative block. */
- int curr_hist = beams[0].history_idx;
- for (int64_t i = graph_blocks - 1; i >= 0; i--) {
- int group_cidx;
- if (curr_hist >= 0) {
- group_cidx = history[curr_hist].cand_idx;
- curr_hist = history[curr_hist].parent_idx;
- } else {
- group_cidx = 10 * N_CAND_M + 10;
- }
-
- if (stride <= 1) {
- /* No stride group — direct assignment */
- best_candidate[i] = group_cidx;
- } else {
- /* Per-block local optimization within the stride group.
- * The beam-selected candidate determines the target quhit
- * bins (d_bin, dmin_bin). Each block picks its own best
- * candidate that falls in compatible bins, or falls back
- * to the globally best candidate for that block. */
- int group_di = group_cidx / N_CAND_M;
- int group_mi = group_cidx % N_CAND_M;
- int target_d_bin = CAND_TO_QUHIT[group_di];
- int target_m_bin = CAND_TO_QUHIT[group_mi];
-
- for (int64_t b = i * stride; b < (i+1) * stride && b < n_blocks; b++) {
- /* Find best candidate in same quhit bins */
- float best_err = 1e30f;
- int best_c = group_cidx;
-
- for (int di = 0; di < N_CAND_D; di++) {
- if (CAND_TO_QUHIT[di] != target_d_bin) continue;
- for (int mi = 0; mi < N_CAND_M; mi++) {
- if (CAND_TO_QUHIT[mi] != target_m_bin) continue;
- int cidx = di * N_CAND_M + mi;
- if (candidate_errors[b][cidx] < best_err) {
- best_err = candidate_errors[b][cidx];
- best_c = cidx;
- }
- }
+ /* Blocks 1..graph_blocks-1 */
+ for (vit_gi = 1; vit_gi < graph_blocks; vit_gi++) {
+ for (vit_s = 0; vit_s < VIT_N_STATES; vit_s++) {
+ float local;
+ float best_pred = 1e30f;
+ int best_sp = 0;
+ int qi_d = vit_s / 6;
+ int qi_m = vit_s % 6;
+
+ if (vit_bin_err[vit_gi][vit_s] > 1e29f) {
+ vit_dp [vit_gi][vit_s] = 1e30f;
+ vit_back[vit_gi][vit_s] = 0;
+ continue;
}
-
- /* Also check if the block's overall best is significantly
- * better — if so, use it (greedy override) */
- float global_best = 1e30f;
- int global_best_c = group_cidx;
- for (int c = 0; c < TOTAL_SCALE_CANDIDATES; c++) {
- if (candidate_errors[b][c] < global_best) {
- global_best = candidate_errors[b][c];
- global_best_c = c;
+ local = vit_bin_err[vit_gi][vit_s]
+ - VITERBI_BETA * vit_scale_err * vit_log_pri[vit_gi][vit_s];
+
+ /* Min-cost predecessor with Manhattan transition penalty */
+ for (vit_sp = 0; vit_sp < VIT_N_STATES; vit_sp++) {
+ float prev = vit_dp[vit_gi - 1][vit_sp];
+ if (prev > 1e29f) continue;
+ int td = abs(qi_d - (vit_sp / 6));
+ int tm = abs(qi_m - (vit_sp % 6));
+ float trans = VITERBI_ALPHA * vit_scale_err * (float)(td + tm);
+ float total = prev + trans;
+ if (total < best_pred) {
+ best_pred = total;
+ best_sp = vit_sp;
}
}
-
- /* Use bin-constrained choice unless the global best
- * is >5% better — preserves Shor coherence while
- * allowing escape from bad bin assignments */
- if (global_best < best_err * 0.95f)
- best_candidate[b] = global_best_c;
- else
- best_candidate[b] = best_c;
+ vit_dp [vit_gi][vit_s] = (best_pred < 1e29f)
+ ? best_pred + local : 1e30f;
+ vit_back[vit_gi][vit_s] = best_sp;
}
}
- }
-
- free(history);
- /* ══════════════════════════════════════════════════════════════
- * Phase 3.5: Born-Rule Multi-Shot Scale Refinement (Q2_K)
- *
- * 2D Born sampling: sample coarse quhit (d dimension) and
- * fine quhit (dmin dimension) jointly from triality marginals.
- * Each shot produces a (d_idx, dmin_idx) pair per block.
- * ══════════════════════════════════════════════════════════════ */
- {
- #define Q2K_BORN_SHOTS 64
-
- float beam_total_err = 0.0f;
- for (int64_t bi = 0; bi < n_blocks; bi++)
- beam_total_err += candidate_errors[bi][best_candidate[bi]];
-
- unsigned int born_rng_q2 = 271828;
- /* Compute tail error once (blocks beyond graph coverage) */
- float tail_err = 0.0f;
- for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
- tail_err += candidate_errors[bi][best_candidate[bi]];
-
- /* Sparse shot buffer: only track stride-sampled blocks */
- int *shot_sparse = (int *)malloc(graph_blocks * sizeof(int));
-
- for (int shot = 0; shot < Q2K_BORN_SHOTS; shot++) {
- float shot_err = tail_err;
-
- for (int64_t gi = 0; gi < graph_blocks; gi++) {
- /* Born sample coarse (d) quhit */
- double c_total = 0.0;
- for (int v = 0; v < 6; v++) c_total += coarse_marg[gi][v];
- born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
- double rnd_c = (double)(born_rng_q2 >> 8) / 16777216.0;
- double target_c = rnd_c * c_total;
- double cum_c = 0.0;
- int qi_d = 5;
- for (int v = 0; v < 6; v++) {
- cum_c += coarse_marg[gi][v];
- if (cum_c > target_c) { qi_d = v; break; }
- }
-
- /* Born sample fine (dmin) quhit */
- double f_total = 0.0;
- for (int v = 0; v < 6; v++) f_total += fine_marg[gi][v];
- born_rng_q2 = born_rng_q2 * 1664525u + 1013904223u;
- double rnd_f = (double)(born_rng_q2 >> 8) / 16777216.0;
- double target_f = rnd_f * f_total;
- double cum_f = 0.0;
- int qi_m = 5;
- for (int v = 0; v < 6; v++) {
- cum_f += fine_marg[gi][v];
- if (cum_f > target_f) { qi_m = v; break; }
+ /* ── Step D: Traceback ── */
+ int *vit_path = (int *)malloc(graph_blocks * sizeof(int));
+ {
+ int best_s = 0;
+ float best_f = vit_dp[graph_blocks - 1][0];
+ for (vit_s = 1; vit_s < VIT_N_STATES; vit_s++) {
+ if (vit_dp[graph_blocks - 1][vit_s] < best_f) {
+ best_f = vit_dp[graph_blocks - 1][vit_s];
+ best_s = vit_s;
}
+ }
+ vit_path[graph_blocks - 1] = best_s;
+ for (vit_gi = graph_blocks - 2; vit_gi >= 0; vit_gi--)
+ vit_path[vit_gi] = vit_back[vit_gi + 1][vit_path[vit_gi + 1]];
+ }
- /* Find best candidate within the sampled (d_bin, m_bin) */
- int64_t blk = gi * stride;
- float best_bin_err = 1e30f;
- int best_bin_cand = 10 * N_CAND_M + 10;
- for (int di = 0; di < N_CAND_D; di++) {
- if (CAND_TO_QUHIT[di] != qi_d) continue;
- for (int mi = 0; mi < N_CAND_M; mi++) {
- if (CAND_TO_QUHIT[mi] != qi_m) continue;
- int cidx = di * N_CAND_M + mi;
- if (candidate_errors[blk][cidx] < best_bin_err) {
- best_bin_err = candidate_errors[blk][cidx];
- best_bin_cand = cidx;
- }
+ /* ── Step E: Map Viterbi path → best_candidate[] ── */
+ for (vit_gi = 0; vit_gi < graph_blocks; vit_gi++) {
+ vit_s = vit_path[vit_gi];
+ int qi_d = vit_s / 6;
+ int qi_m = vit_s % 6;
+ int64_t blk_rep = vit_gi * stride;
+
+ /* Stride-representative block: use precomputed bin winner */
+ if (vit_bin_cand[vit_gi][vit_s] >= 0)
+ best_candidate[blk_rep] = vit_bin_cand[vit_gi][vit_s];
+
+ /* Non-representative blocks in the stride group */
+ for (vit_b = blk_rep + 1;
+ vit_b < (vit_gi + 1) * stride && vit_b < n_blocks;
+ vit_b++) {
+ int vit_c;
+ float best_e = 1e30f;
+ int best_c = best_candidate[blk_rep];
+ for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+ if (CAND_TO_QUHIT[vit_c / N_CAND_M] != qi_d) continue;
+ if (CAND_TO_QUHIT[vit_c % N_CAND_M] != qi_m) continue;
+ if (candidate_errors[vit_b][vit_c] < best_e) {
+ best_e = candidate_errors[vit_b][vit_c];
+ best_c = vit_c;
}
}
-
- shot_sparse[gi] = best_bin_cand;
- shot_err += candidate_errors[blk][best_bin_cand];
+ best_candidate[vit_b] = best_c;
}
+ }
- if (shot_err < beam_total_err) {
- /* Only now apply the sparse updates to best_candidate */
- for (int64_t gi = 0; gi < graph_blocks; gi++)
- best_candidate[gi * stride] = shot_sparse[gi];
- beam_total_err = shot_err;
+ /* ── Step F: 5 % greedy override (pure MSE safety net) ── */
+ for (vit_b = 0; vit_b < n_blocks; vit_b++) {
+ int vit_c;
+ float cur_err = candidate_errors[vit_b][best_candidate[vit_b]];
+ float g_best = cur_err;
+ int g_cand = best_candidate[vit_b];
+ for (vit_c = 0; vit_c < TOTAL_SCALE_CANDIDATES; vit_c++) {
+ if (candidate_errors[vit_b][vit_c] < g_best) {
+ g_best = candidate_errors[vit_b][vit_c];
+ g_cand = vit_c;
+ }
}
+ if (g_best < cur_err * 0.95f)
+ best_candidate[vit_b] = g_cand;
}
- free(shot_sparse);
+ free(vit_path);
+ free(vit_dp);
+ free(vit_back);
+ free(vit_bin_err);
+ free(vit_bin_cand);
+ free(vit_log_pri);
}
free(coarse_marg);
@@ -2819,7 +2718,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
hpc_destroy(graph);
}
} else {
- /* OPT_MSE or single block: pick candidate with lowest raw error */
for (int64_t blk = 0; blk < n_blocks; blk++) {
float best_err = candidate_errors[blk][0];
int best_idx = 0;
@@ -2834,27 +2732,80 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
/* ══════════════════════════════════════════════════════════════════
- * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
+ * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
+ *
+ * Transforms the tensor from a collection of isolated 256-element
+ * Q2_K superblocks into a single, continuous error-cancelling waveform.
+ *
+ * After Phase 3 has selected the optimal (d, dmin) candidate for every
+ * block, this sequential pass computes the net DC residual left by each
+ * block using a cheap round-nearest forward quantization, then feeds the
+ * negated, exponentially-decayed residual as a correction bias into the
+ * WLS solver of the immediately following block.
+ *
+ * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
+ *
+ * dc_bias[N+1] = −DC_DECAY × R_N / QK_K (per-element offset)
*
- * Like Q4_0's CF analog: the beam search / Born shots selected a
- * grid candidate (d_grid, dmin_grid). Now we EXTRACT the exact
- * optimal FP16 (d, dmin) via weighted least-squares, holding the
- * sub-block Ls/Lm and quantized levels fixed.
+ * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
+ * quantizer toward codes whose reconstruction deq ≈ x′, so that
*
- * Q2_K model: x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
+ * Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
*
- * Full analog assembly: at each iteration, EXHAUSTIVELY search
- * all 16×16 = 256 possible (Ls[j], Lm[j]) pairs per sub-block
- * to find the assignment that minimizes weighted reconstruction
- * error. Then WLS-solve for the global (d, dmin). Repeat 5×.
+ * The accumulated cross-block DC collapses geometrically:
*
- * This guarantees every parameter is at its conditional optimum —
- * the perfect bit analog at 2-bit resolution.
+ * R₀, −DC_DECAY·R₀, +DC_DECAY²·R₀, … → 0
+ *
+ * The result is written into block_dc_bias[n_blocks]. Phase 4 reads
+ * this array (safe: written sequentially before the parallel loop).
+ * ══════════════════════════════════════════════════════════════════ */
+
+ #define DC_DECAY 0.85f /* Boundary-condition leak factor (0 = isolated, 1 = full) */
+
+ float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
+
+ if (block_dc_bias) { /* on allocation failure Phase 4 falls back to a zero bias */
+     float rolling_dc = 0.0f; /* DC residual of the previous block; 0 before block 0 */
+
+     for (int64_t blk = 0; blk < n_blocks; blk++) {
+         const float *bx = weights + blk * QK_K;
+         int cidx = best_candidate[blk];
+         float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
+         float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+         /* Bias for THIS block's targets: the NEGATED, decayed residual of the previous block.
+          * Targets are x′ = x − dc_bias, so the sign must oppose rolling_dc for errors to cancel. */
+         float dc_bias = -(DC_DECAY * rolling_dc) / (float)QK_K;
+         block_dc_bias[blk] = dc_bias;
+
+         /* Quick round-nearest quant to estimate DC residual for NEXT block.
+          * We quantize the adjusted target x′ = x − dc_bias, then measure
+          * the residual of the ORIGINAL weight against the chosen code. */
+         float dc_res = 0.0f;
+         int j, k;
+         for (j = 0; j < N_SUB; j++) {
+             float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
+             float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
+             for (k = 0; k < 16; k++) {
+                 float x_adj = bx[16*j + k] - dc_bias;
+                 int q = 0; /* code 0 is kept when the sub-block scale is effectively zero */
+                 if (d_sub >= 1e-15f) {
+                     q = gguf_nearest_int((x_adj + m_sub) / d_sub);
+                     if (q < 0) q = 0;
+                     if (q > 3) q = 3;
+                 }
+                 float deq = d_sub * (float)q - m_sub;
+                 /* Residual against ORIGINAL weight (not adjusted) */
+                 dc_res += bx[16*j + k] - deq;
+             }
+         }
+         rolling_dc = dc_res; /* only this block's residual is carried forward */
+     }
+ }
+
+ /* ══════════════════════════════════════════════════════════════════
+ * PHASE 4: Assemble blocks via least-squares (d, dmin) extraction
* ══════════════════════════════════════════════════════════════════ */
- /* Pre-allocate one HPCGraph per OMP thread for sub-block Shor measurement.
- * This eliminates ~776K malloc/free cycles from the inner loop.
- * Each thread reuses its graph via hpc_reset_for_subblock(). */
int _n_omp_threads = 1;
#ifdef _OPENMP
_n_omp_threads = omp_get_max_threads();
@@ -2869,32 +2820,36 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
int cidx = best_candidate[blk];
uint8_t Ls_blk[16], Lm_blk[16];
- /* Start from HPC-selected candidate */
+ /* ── Rolling DC boundary condition ──────────────────────────────
+ * dc_adj shifts every WLS target in this block so that the net
+ * quantisation error steers toward cancelling the previous block's
+ * DC residual (written by the sequential Phase 3.9 pre-pass). */
+ float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
+
+ /* Adjusted weight view — WLS and Shor work on this array;
+ * the final error is always reported against the original block_x. */
+ float adj_block_x[QK_K];
+ {
+ int _i;
+ for (_i = 0; _i < QK_K; _i++)
+ adj_block_x[_i] = block_x[_i] - dc_adj;
+ }
+
memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);
float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
- /* ── Analog assembly: iterate to convergence ──
- * 3 iterations: the (Ls,Lm) ↔ (d,dmin) coupling stabilizes
- * after 2-3 passes. Additional iterations produce negligible
- * change in the committed FP16 values.
- * A) Sub-block Shor measurement to find coupled (Ls,Lm) states
- * B) Optimal q-value assignment
- * C) WLS solve for (d, dmin) */
- for (int ls_iter = 0; ls_iter < 3; ls_iter++) {
-
- /* ── Step A: Sub-block Quhit BP (Strategy 1) ──
- * For each sub-block j, evaluate all 256 (Ls, Lm) pairs.
- * Keep the 6 best pairs as quhit states for a 16-node graph.
- * Run BP to jointly select the globally optimal (Ls, Lm). */
+ uint16_t prev_dm16 = 0, prev_mm16 = 0;
+ for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
+
uint8_t state_ls[N_SUB][6];
uint8_t state_lm[N_SUB][6];
float state_err[N_SUB][6];
for (int j = 0; j < N_SUB; j++) {
- const float *sx = block_x + 16 * j;
+ const float *sx = adj_block_x + 16 * j;
for (int v = 0; v < 6; v++) state_err[j][v] = 1e30f;
for (int try_ls = 0; try_ls <= 15; try_ls++) {
@@ -2917,7 +2872,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
sub_err += diff * diff * w;
}
- /* Insert into top 6 */
for (int v = 0; v < 6; v++) {
if (sub_err < state_err[j][v]) {
for (int u = 5; u > v; u--) {
@@ -2935,7 +2889,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
}
- /* Reset thread-local sub-block graph (zero allocations) */
int _tid = 0;
#ifdef _OPENMP
_tid = omp_get_thread_num();
@@ -2946,19 +2899,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
float min_sub_err[N_SUB];
for (int j = 0; j < N_SUB; j++) min_sub_err[j] = state_err[j][0];
- /* Initialize unary potentials from local errors */
for (int j = 0; j < N_SUB; j++) {
triality_dft(&sg->locals[j]);
double amp_re[6];
double amp_norm = 0.0;
for (int v = 0; v < 6; v++) {
- /* Adaptive temperature: scale with local error spread
- * so Shor measurement produces meaningful interference
- * patterns regardless of weight magnitude */
- float err_spread = state_err[j][5] - state_err[j][0];
- float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
- if (sub_temp < 1e-12f) sub_temp = 1e-12f;
- amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
+ float err_spread = state_err[j][5] - state_err[j][0];
+ float sub_temp = (err_spread > 1e-15f) ? err_spread * 0.3f : 0.1f;
+ if (sub_temp < 1e-12f) sub_temp = 1e-12f;
+ amp_re[v] = exp(-(double)(state_err[j][v] - min_sub_err[j]) / (double)sub_temp);
amp_norm += amp_re[v] * amp_re[v];
}
if (amp_norm > 1e-30) {
@@ -2975,12 +2924,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
triality_update_mask(&sg->locals[j]);
}
- /* Add coupling edges between adjacent sub-blocks */
for (int j = 0; j < N_SUB - 1; j++)
hpc_cz(sg, j, j + 1);
- /* ── Shor sequential measurement on sub-block graph ──
- * Stack-allocated arrays: eliminates 2 calloc/free per iteration */
double sub_marg[N_SUB][6];
int sub_measured[N_SUB];
memset(sub_marg, 0, sizeof(sub_marg));
@@ -2988,7 +2934,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
shor_measure_graph(sg, N_SUB, sub_marg, sub_measured, 1);
- /* Extract optimal Ls/Lm from Shor marginals */
for (int j = 0; j < N_SUB; j++) {
double best_prob = -1.0;
int best_v = 0;
@@ -3003,7 +2948,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
}
}
- /* ── Step B: Quantize q-values with optimal Ls/Lm ── */
uint8_t L[QK_K];
for (int j = 0; j < N_SUB; j++) {
float d_sub = dm * (float)Ls_blk[j];
@@ -3013,22 +2957,18 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
continue;
}
for (int k = 0; k < 16; k++) {
- int q = gguf_nearest_int((block_x[16*j+k] + m_sub) / d_sub);
+ int q = gguf_nearest_int((adj_block_x[16*j+k] + m_sub) / d_sub);
if (q < 0) q = 0; if (q > 3) q = 3;
L[16*j+k] = (uint8_t)q;
}
}
- /* ── Step C: WLS solve for (d, dmin) ──
- * x[j,k] ≈ d × Ls[j] × q[j,k] - dmin × Lm[j]
- * Let a = Ls[j]×q[j,k], b = Lm[j]
- * Normal equations via Cramer's rule */
double Saa = 0, Sab = 0, Sbb = 0, Sxa = 0, Sxb = 0;
for (int j = 0; j < N_SUB; j++) {
float ls_f = (float)Ls_blk[j];
float lm_f = (float)Lm_blk[j];
for (int k = 0; k < 16; k++) {
- float x = block_x[16*j+k];
+ float x = adj_block_x[16*j+k];
float w = (imat_importance) ?
imat_importance[blk * QK_K + 16*j+k] : 1.0f;
float a = ls_f * (float)L[16*j+k];
@@ -3045,7 +2985,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
if (fabs(det) > 1e-30) {
double d_new = (Sbb * Sxa - Sab * Sxb) / det;
double dm_new = (Sab * Sxa - Saa * Sxb) / det;
- /* Clamp: positive and within 4× of candidate seed */
float d_seed = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
float m_seed = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
if (d_new > 0.0 && d_new < 4.0 * (d_seed + 1e-10))
@@ -3053,28 +2992,27 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
if (dm_new > 0.0 && dm_new < 4.0 * (m_seed + 1e-10))
mm = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)dm_new));
}
- if (isnan(dm) || isnan(mm)) {
- printf("NaN detected before ULP: dm=%f mm=%f det=%f\n", dm, mm, det);
- exit(1);
- }
+
+ uint16_t cur_dm16 = gguf_fp32_to_fp16(dm);
+ uint16_t cur_mm16 = gguf_fp32_to_fp16(mm);
+ if (cur_dm16 == prev_dm16 && cur_mm16 == prev_mm16) break;
+ prev_dm16 = cur_dm16;
+ prev_mm16 = cur_mm16;
}
- /* ── FP16 ULP neighborhood search for (d, dmin) ──
- * The WLS solve found continuous-optimal (d, dmin). But FP16
- * truncation may shift the optimum. Try ±4 ULP around both
- * d and dmin, pick the pair with minimum reconstruction error. */
+ /* ── FP16 ULP neighborhood search for (d, dmin) — Expanded to ±8 ── */
{
uint16_t base_d16 = gguf_fp32_to_fp16(dm);
uint16_t base_m16 = gguf_fp32_to_fp16(mm);
uint16_t best_d16 = base_d16, best_m16 = base_m16;
float best_ulp_err = 1e30f;
- for (int dd = -2; dd <= 2; dd++) {
+ for (int dd = -8; dd <= 8; dd++) {
int cd16 = (int)base_d16 + dd;
if (cd16 < 0 || cd16 > 0x7BFF) continue;
float trial_dm = gguf_fp16_to_fp32((uint16_t)cd16);
- for (int dm_delta = -2; dm_delta <= 2; dm_delta++) {
+ for (int dm_delta = -8; dm_delta <= 8; dm_delta++) {
int cm16 = (int)base_m16 + dm_delta;
if (cm16 < 0 || cm16 > 0x7BFF) continue;
float trial_mm = gguf_fp16_to_fp32((uint16_t)cm16);
@@ -3084,7 +3022,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
float d_sub = trial_dm * (float)Ls_blk[j];
float m_sub = trial_mm * (float)Lm_blk[j];
for (int k = 0; k < 16; k++) {
- float x = block_x[16*j+k];
+ float x = adj_block_x[16*j+k];
float w = (imat_importance) ?
imat_importance[blk * QK_K + 16*j+k] : 1.0f;
int q;
@@ -3109,21 +3047,13 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
mm = gguf_fp16_to_fp32(best_m16);
}
- /* ── Final Ls/Lm re-optimization at committed FP16 (d, dmin) ──
- * The WLS solve may have shifted (d, dmin) after the last Step A.
- * Neighborhood search ±2 around current values (25 pairs vs 256)
- * is sufficient since WLS shifts are typically < 1 Ls/Lm step. */
for (int j = 0; j < N_SUB; j++) {
- const float *sx = block_x + 16 * j;
+ const float *sx = adj_block_x + 16 * j;
float best_sub_err = 1e30f;
uint8_t best_ls = Ls_blk[j], best_lm = Lm_blk[j];
- int ls_lo = (Ls_blk[j] > 2) ? Ls_blk[j] - 2 : 0;
- int ls_hi = (Ls_blk[j] < 13) ? Ls_blk[j] + 2 : 15;
- int lm_lo = (Lm_blk[j] > 2) ? Lm_blk[j] - 2 : 0;
- int lm_hi = (Lm_blk[j] < 13) ? Lm_blk[j] + 2 : 15;
- for (int try_ls = ls_lo; try_ls <= ls_hi; try_ls++) {
+ for (int try_ls = 0; try_ls <= 15; try_ls++) {
float d_sub = dm * (float)try_ls;
- for (int try_lm = lm_lo; try_lm <= lm_hi; try_lm++) {
+ for (int try_lm = 0; try_lm <= 15; try_lm++) {
float m_sub = mm * (float)try_lm;
float sub_err = 0.0f;
for (int k = 0; k < 16; k++) {
@@ -3151,150 +3081,201 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
Lm_blk[j] = best_lm;
}
- /* Store the extracted optimal FP16 (d, dmin) */
output[blk].d = gguf_fp32_to_fp16(dm);
output[blk].dmin = gguf_fp32_to_fp16(mm);
for (int j = 0; j < N_SUB; j++)
output[blk].scales[j] = Ls_blk[j] | (Lm_blk[j] << 4);
- /* ── Final quantization with D₆ Hadamard Error Shaping ──
+ /* ── Final quantization: D₆ Hadamard Greedy Descent (deterministic) ──
*
- * Standard Q2_K rounds each weight independently: q = round((x+m)/d).
- * But within a sub-block, weights share (d, m), so their quantization
- * errors are CORRELATED. Independent rounding is suboptimal.
+ * The original Simulated Annealing acceptance rule is replaced by a
+ * strict greedy descent: only accept a flip if it strictly reduces the
+ * D₆ Hadamard metric (4·‖vesica‖² + DC²). This makes error shaping
+ * fully deterministic and thread-safe (no rand() inside omp parallel),
+ * consistent with the Viterbi philosophy applied in Phase 3.
*
- * The D₆ fold (antipodal Hadamard from the triality quhit) decomposes
- * the error vector into vesica (sum) and wave (difference) components:
- * vesica[k] = (e[k] + e[k+3]) / √2 — DC-like, accumulates in dot products
- * wave[k] = (e[k] - e[k+3]) / √2 — noise-like, cancels in dot products
- *
- * We WANT large wave error and small vesica error. So we greedily
- * flip rounding decisions (floor↔ceil) to minimize vesica energy,
- * even if total element-wise error increases slightly.
- *
- * Process: 16 elements per sub-block, treat as 2 groups of 6 + 4 tail.
- * Apply DFT₆-fold to each group of 6, minimize vesica component.
+ * The metric measures both:
+ * - Vesica Piscis term: correlated error between weights i and i+QK_K/2
+ * (targets the first non-DC harmonic — halfwave symmetry)
+ * - DC term: total signed error across the 256-weight superblock
+ * (captured and propagated to the next block by Phase 3.9)
*/
uint8_t L[QK_K];
- for (int j = 0; j < N_SUB; j++) {
- float d = dm * (float)(output[blk].scales[j] & 0xF);
- if (d < 1e-15f) {
- for (int k = 0; k < 16; k++) L[16 * j + k] = 0;
- continue;
+ {
+ float q_cont_all[QK_K];
+ int q_base_all[QK_K];
+ int q_shaped_all[QK_K];
+
+ for (int i = 0; i < QK_K; i++) {
+ int jj = i >> 4;
+ float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+ float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+ if (d_s < 1e-15f) {
+ q_cont_all[i] = 0.0f;
+ q_base_all[i] = 0;
+ } else {
+ /* Quantize the DC-adjusted target */
+ float qc = (adj_block_x[i] + m_s) / d_s;
+ q_cont_all[i] = qc;
+ int qr = gguf_nearest_int(qc);
+ if (qr < 0) qr = 0; if (qr > 3) qr = 3;
+ q_base_all[i] = qr;
+ }
}
- float m = mm * (float)(output[blk].scales[j] >> 4);
- float id = 1.0f / d;
-
- /* Step 1: Standard nearest-rounding as baseline */
- int q_base[16];
- float q_cont[16]; /* continuous q values before rounding */
- for (int k = 0; k < 16; k++) {
- q_cont[k] = (block_x[16*j+k] + m) * id;
- q_base[k] = gguf_nearest_int(q_cont[k]);
- if (q_base[k] < 0) q_base[k] = 0;
- if (q_base[k] > 3) q_base[k] = 3;
+ memcpy(q_shaped_all, q_base_all, QK_K * sizeof(int));
+
+ float e_live[QK_K];
+ for (int i = 0; i < QK_K; i++) {
+ int jj = i >> 4;
+ float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+ float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+ float deq = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
+ /* Residual against the adjusted target (DC-corrected view) */
+ e_live[i] = adj_block_x[i] - deq;
}
- /* Step 2: D₆ Hadamard Error Shaping
- * For each 6-element group, greedily flip the rounding decision
- * that most reduces the D₆-folded vesica error component.
- *
- * D₆ fold on 6-element groups: antipodal pairs (0,3), (1,4), (2,5)
- * vesica[k] = e[k] + e[k+3] (k=0,1,2) — DC-like, propagates
- * wave[k] = e[k] - e[k+3] (k=0,1,2) — noise-like, cancels
- *
- * Weight vesica 4× over wave + penalize DC (sum of all 6 errors) */
- int q_shaped[16];
- memcpy(q_shaped, q_base, 16 * sizeof(int));
-
- /* Process groups: [0..5], [6..11], tail [12..15] handled by D₆ metric on available pairs */
- for (int g = 0; g < 2; g++) {
- int g_off = g * 6;
- if (g_off + 5 >= 16) break;
-
- /* Multiple greedy passes — each pass finds the single best flip */
- for (int pass = 0; pass < 6; pass++) {
- int best_k = -1;
- int best_q_alt = 0;
- float best_delta = 0.0f; /* improvement = current_metric - alt_metric */
-
- /* Compute current group errors */
- float e_cur[6];
- for (int kk = 0; kk < 6; kk++) {
- int ii = g_off + kk;
- float deq = d * (float)q_shaped[ii] - m;
- e_cur[kk] = block_x[16*j+ii] - deq;
- }
+ float v_live[QK_K / 2];
+ float vesica_cur = 0.0f, dc_cur = 0.0f;
+ for (int i = 0; i < QK_K / 2; i++) {
+ v_live[i] = e_live[i] + e_live[i + QK_K / 2];
+ vesica_cur += v_live[i] * v_live[i];
+ }
+ for (int i = 0; i < QK_K; i++) dc_cur += e_live[i];
+ float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
- /* Current D₆ metric: vesica energy + DC² */
- float vesica_cur = 0.0f, dc_cur = 0.0f;
- for (int p = 0; p < 3; p++) {
- float v = e_cur[p] + e_cur[p+3];
- vesica_cur += v * v;
- }
- for (int kk = 0; kk < 6; kk++) dc_cur += e_cur[kk];
- float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
-
- /* Try flipping each element */
- for (int k = 0; k < 6; k++) {
- int idx = g_off + k;
- int q_cur = q_shaped[idx];
-
- /* Try the alternative rounding */
- int q_try;
- if (q_cont[idx] - (float)q_cur >= 0) {
- q_try = q_cur + 1;
- } else {
- q_try = q_cur - 1;
- }
- if (q_try < 0 || q_try > 3) continue;
-
- /* Compute alt errors (only element k changes) */
- float e_alt[6];
- for (int kk = 0; kk < 6; kk++) e_alt[kk] = e_cur[kk];
- float deq_try = d * (float)q_try - m;
- e_alt[k] = block_x[16*j+idx] - deq_try;
-
- /* Alt D₆ metric */
- float vesica_alt = 0.0f, dc_alt = 0.0f;
- for (int p = 0; p < 3; p++) {
- float v = e_alt[p] + e_alt[p+3];
- vesica_alt += v * v;
- }
- for (int kk = 0; kk < 6; kk++) dc_alt += e_alt[kk];
- float metric_alt = 4.0f * vesica_alt + dc_alt * dc_alt;
-
- float delta = metric_cur - metric_alt;
- if (delta > best_delta) {
- best_delta = delta;
- best_k = k;
- best_q_alt = q_try;
- }
+ /* Deterministic greedy descent: accept only strict improvements */
+ for (int pass = 0; pass < QK_K; pass++) {
+ int best_k = -1;
+ int best_q_alt = 0;
+ float best_delta = 0.0f; /* strictly positive threshold */
+
+ for (int k = 0; k < QK_K; k++) {
+ int jj = k >> 4;
+ float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+ if (d_s < 1e-15f) continue;
+
+ int q_cur = q_shaped_all[k];
+ int q_try = (q_cont_all[k] - (float)q_cur >= 0.0f)
+ ? q_cur + 1 : q_cur - 1;
+ if (q_try < 0 || q_try > 3) continue;
+
+ float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+ float e_new = adj_block_x[k] - (d_s * (float)q_try - m_s);
+ float de = e_new - e_live[k];
+
+ int pi = (k < QK_K / 2) ? k : k - QK_K / 2;
+ float v_new = v_live[pi] + de;
+
+ float vesica_alt = vesica_cur - v_live[pi]*v_live[pi] + v_new*v_new;
+ float dc_alt = dc_cur + de;
+ float delta = metric_cur - (4.0f * vesica_alt + dc_alt * dc_alt);
+
+ if (delta > best_delta) {
+ best_delta = delta;
+ best_k = k;
+ best_q_alt = q_try;
}
+ }
- if (best_k < 0) break; /* no improvement found */
- q_shaped[g_off + best_k] = best_q_alt; /* commit the flip */
+ if (best_k < 0) break; /* converged — no further improvement */
+
+ q_shaped_all[best_k] = best_q_alt;
+ {
+ int jj_c = best_k >> 4;
+ float d_c = dm * (float)(output[blk].scales[jj_c] & 0xF);
+ float m_c = mm * (float)(output[blk].scales[jj_c] >> 4);
+ float e_new_c = adj_block_x[best_k] - (d_c * (float)best_q_alt - m_c);
+ float de_c = e_new_c - e_live[best_k];
+ int pi_c = (best_k < QK_K / 2) ? best_k : best_k - QK_K / 2;
+ float v_new_c = v_live[pi_c] + de_c;
+ vesica_cur += v_new_c * v_new_c - v_live[pi_c] * v_live[pi_c];
+ dc_cur += de_c;
+ metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
+ v_live[pi_c] = v_new_c;
+ e_live[best_k]= e_new_c;
}
}
- /* Step 3: Final error comparison — only keep shaped if it improves
- * or is within 5% of baseline (vesica shaping trades element MSE
- * for better spectral distribution of error) */
+ /* Choose base vs shaped by comparing MSE against original weights */
float err_base = 0.0f, err_shaped = 0.0f;
- for (int k = 0; k < 16; k++) {
- float x = block_x[16*j+k];
- float w = (imat_importance) ?
- imat_importance[blk * QK_K + 16*j + k] : 1.0f;
- float deq_b = d * (float)q_base[k] - m;
- float deq_s = d * (float)q_shaped[k] - m;
- err_base += (x - deq_b) * (x - deq_b) * w;
- err_shaped += (x - deq_s) * (x - deq_s) * w;
+ for (int i = 0; i < QK_K; i++) {
+ int jj = i >> 4;
+ float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
+ float m_s = mm * (float)(output[blk].scales[jj] >> 4);
+ float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
+ float deq_b = (d_s > 1e-15f) ? (d_s * (float)q_base_all[i] - m_s) : 0.0f;
+ float deq_s = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
+ float xv = block_x[i]; /* original weight for error report */
+ err_base += (xv - deq_b) * (xv - deq_b) * w;
+ err_shaped += (xv - deq_s) * (xv - deq_s) * w;
+ }
+ {
+ int use_shaped = (err_shaped <= err_base);
+ for (int i = 0; i < QK_K; i++)
+ L[i] = (uint8_t)(use_shaped ? q_shaped_all[i] : q_base_all[i]);
}
+ }
+
+ /* ── Cross-weight error diffusion — intra-sub-block Floyd-Steinberg ──
+ *
+ * Implements cross-weight error diffusion within each 16-weight sub-block.
+ * After the greedy descent has committed quantisation codes, the residual
+ * of each weight is partially propagated forward to the next position in
+ * the same sub-block (7/16 of the error), re-quantising if the diffused
+ * target falls in a different bin.
+ *
+ * This is the "cross-weight" dimension of the error diffusion:
+ * neighbouring weights share and partially absorb each other's rounding
+ * error, shaping the within-block spectrum away from the DC component
+ * that Phase 3.9 already propagates between blocks.
+ *
+ * Staying within sub-blocks avoids scale-mismatch artefacts that would
+ * arise from diffusing across the dm * Ls[j] boundary between sub-blocks.
+ *
+ * The diffused codes are accepted only when they reduce the weighted MSE
+ * against the ORIGINAL weight (not the adjusted target), so the diffusion
+ * cannot increase the total reconstruction error.
+ */
+ {
+ int fs_j, fs_k;
+ for (fs_j = 0; fs_j < N_SUB; fs_j++) {
+ int base = fs_j * 16;
+ float d_s = dm * (float)(output[blk].scales[fs_j] & 0xF);
+ float m_s = mm * (float)(output[blk].scales[fs_j] >> 4);
+ if (d_s < 1e-15f) continue;
+
+ float carry = 0.0f; /* FS carry from position k-1 */
+
+ for (fs_k = 0; fs_k < 16; fs_k++) {
+ int idx = base + fs_k;
+ float x_orig = block_x[idx];
+ float x_adj = adj_block_x[idx] + carry; /* adjusted + diffused */
+
+ /* Propose new code from diffused target */
+ int q_fs = gguf_nearest_int((x_adj + m_s) / d_s);
+ if (q_fs < 0) q_fs = 0; if (q_fs > 3) q_fs = 3;
+
+ if (q_fs != (int)L[idx]) {
+ /* Accept only when MSE against original weight improves */
+ float w_imp = (imat_importance)
+ ? imat_importance[blk * QK_K + idx] : 1.0f;
+ float deq_old = d_s * (float)L[idx] - m_s;
+ float deq_new = d_s * (float)q_fs - m_s;
+ float e_old = (x_orig - deq_old) * (x_orig - deq_old) * w_imp;
+ float e_new = (x_orig - deq_new) * (x_orig - deq_new) * w_imp;
+ if (e_new < e_old)
+ L[idx] = (uint8_t)q_fs;
+ }
- int *q_final = (err_shaped <= err_base * 1.05f) ? q_shaped : q_base;
- for (int k = 0; k < 16; k++)
- L[16 * j + k] = (uint8_t)q_final[k];
+ /* Propagate 7/16 of the residual (adj target vs committed code) */
+ {
+ float deq_final = d_s * (float)L[idx] - m_s;
+ float residual = (adj_block_x[idx] - deq_final);
+ carry = (fs_k < 15) ? residual * (7.0f / 16.0f) : 0.0f;
+ }
+ }
+ }
}
for (int j = 0; j < QK_K; j += 128) {
@@ -3315,11 +3296,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
total_err += berr;
}
- /* Free thread-local sub-block graphs */
for (int _ti = 0; _ti < _n_omp_threads; _ti++)
hpc_destroy(_tl_graphs[_ti]);
free(_tl_graphs);
+ free(block_dc_bias);
free(seeds);
free(candidate_errors);
free(candidate_d);
@@ -3332,14 +3313,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
if (verbose) {
float rmse = sqrtf(total_err / (float)n_elements);
- /* Compute weight σ for fidelity classification */
double w_sum2 = 0.0;
for (int64_t i = 0; i < n_elements; i++)
w_sum2 += (double)weights[i] * (double)weights[i];
- float w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
+ w_sigma = (float)sqrt(w_sum2 / (double)n_elements);
float rmse_over_sigma = (w_sigma > 1e-15f) ? rmse / w_sigma : 0.0f;
- /* Fidelity classification */
const char *fidelity_class;
const char *fidelity_icon;
if (rmse <= 1.0e-04f) {
@@ -3493,8 +3472,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
if (should_quantize(ti, gguf_names[i])) {
if (is_attention_tensor(gguf_names[i])) {
- /* Promote attention Q/K/V/O to Q4_0 for higher precision.
- * Attention scores are most sensitive to quantization noise. */
tensor_types[i] = GGML_TYPE_Q4_0;
int64_t n_blocks_q4 = (ti->n_elements + QK4_0 - 1) / QK4_0;
tensor_sizes[i] = n_blocks_q4 * sizeof(BlockQ4_0);
@@ -3506,18 +3483,15 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
tensor_sizes[i] = ggml_type_size(quant_type, ti->n_elements);
}
} else if (ti->n_dims >= 2) {
- /* 2D non-quantized tensors (embeddings, output) → F16 */
tensor_types[i] = GGML_TYPE_F16;
tensor_sizes[i] = ti->n_elements * sizeof(uint16_t);
} else {
- /* 1D tensors (norms, biases) → F32 */
tensor_types[i] = GGML_TYPE_F32;
tensor_sizes[i] = ti->n_elements * sizeof(float);
}
tensor_offsets[i] = data_offset;
- /* Align each tensor to 32 bytes */
data_offset += tensor_sizes[i];
data_offset = (data_offset + GGUF_DEFAULT_ALIGNMENT - 1) &
~(uint64_t)(GGUF_DEFAULT_ALIGNMENT - 1);
@@ -3592,7 +3566,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
int src = tensor_src_idx[i];
const STTensorInfo *ti = st_multi_tensor_info(mf, src);
uint64_t dims[ST_MAX_DIMS];
- /* GGUF uses reversed dimension order from SafeTensors/PyTorch */
int nd = ti->n_dims;
for (int d = 0; d < nd; d++) {
dims[d] = (uint64_t)ti->shape[nd - 1 - d];
@@ -3622,7 +3595,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
print_progress_bar(i, total_tensors, gguf_names[i], quant_start);
if (tensor_types[i] == GGML_TYPE_Q2_K) {
- /* ── HPC-Optimized Q2_K Quantization ── */
float *f32_data = st_multi_tensor_to_f32(mf, src);
if (!f32_data) {
fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
@@ -3633,7 +3605,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
int64_t n_elements = ti->n_elements;
float tensor_error = 0.0f;
- /* Pad to QK_K boundary */
int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
if (padded > n_elements) {
f32_data = realloc(f32_data, padded * sizeof(float));
@@ -3645,7 +3616,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
int64_t n_blocks = n_elements / QK_K;
BlockQ2K *quant_data = calloc(n_blocks, sizeof(BlockQ2K));
- /* Look up imatrix importance for this tensor */
const float *imp = NULL;
if (imatrix) {
const IMatrixEntry *ime = imatrix_find_any(imatrix,
@@ -3666,13 +3636,11 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
float rmse = sqrtf(tensor_error / (float)ti->n_elements);
- /* Compute weight σ for fidelity gate */
double wss = 0.0;
for (int64_t j = 0; j < ti->n_elements; j++)
wss += (double)f32_data[j] * (double)f32_data[j];
float w_sig = (float)sqrt(wss / (double)ti->n_elements);
- /* Fidelity gate: classify RMSE vs 1e-04 target */
const char *fid;
if (rmse <= 1.0e-04f) fid = "★★★★ ULTRA";
else if (rmse <= 3.0e-04f) fid = "★★★☆ HIGH";
@@ -3695,7 +3663,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
free(quant_data);
free(f32_data);
} else if (tensor_types[i] == GGML_TYPE_Q4_0) {
- /* ── HPC-Optimized Q4_0 Quantization (attention tensors) ── */
float *f32_data = st_multi_tensor_to_f32(mf, src);
if (!f32_data) {
fprintf(stderr, "\n ERROR: Failed to convert tensor '%s' to F32\n",
@@ -3705,7 +3672,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
int64_t n_elements = ti->n_elements;
- /* Pad to QK4_0 boundary */
int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
if (padded > n_elements) {
f32_data = realloc(f32_data, padded * sizeof(float));
@@ -3718,7 +3684,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
BlockQ4_0 *q4_data = calloc(n_blocks_q4, sizeof(BlockQ4_0));
float tensor_error = 0.0f;
- /* Look up imatrix importance for this tensor */
const float *imp = NULL;
if (imatrix) {
const IMatrixEntry *ime = imatrix_find_any(imatrix,
@@ -3739,7 +3704,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
float rmse = sqrtf(tensor_error / (float)ti->n_elements);
- /* Compute weight σ for fidelity gate */
double wss4 = 0.0;
for (int64_t j = 0; j < ti->n_elements; j++)
wss4 += (double)f32_data[j] * (double)f32_data[j];
@@ -3767,7 +3731,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
free(q4_data);
free(f32_data);
} else if (tensor_types[i] == GGML_TYPE_F16) {
- /* ── Store as F16 (embeddings, output, 2D non-quantized) ── */
float *f32_data = st_multi_tensor_to_f32(mf, src);
if (!f32_data) {
fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
@@ -3775,7 +3738,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
continue;
}
- /* Convert F32 → F16 */
uint16_t *f16_data = (uint16_t *)malloc(ti->n_elements * sizeof(uint16_t));
for (int64_t j = 0; j < ti->n_elements; j++)
f16_data[j] = gguf_fp32_to_fp16(f32_data[j]);
@@ -3793,7 +3755,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
free(f16_data);
free(f32_data);
} else {
- /* ── Keep as F32 (1D: norms, biases) ── */
float *f32_data = st_multi_tensor_to_f32(mf, src);
if (!f32_data) {
fprintf(stderr, "\n ERROR: Failed to convert tensor '%s'\n",
@@ -3814,7 +3775,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
free(f32_data);
}
- /* Pad to alignment */
gguf_write_padding(fp, GGUF_DEFAULT_ALIGNMENT);
}
@@ -3823,8 +3783,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
long final_size = ftell(fp);
fclose(fp);
- /* ── Final summary with Shor fidelity metrics ── */
- /* Compute original model size (all as F32) */
int64_t original_f32_size = 0;
for (int i = 0; i < total_tensors; i++) {
const STTensorInfo *ti = st_multi_tensor_info(mf, tensor_src_idx[i]);
@@ -3840,7 +3798,6 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
float mean_mse_per_tensor = (quant_count > 0) ?
total_error_sum / (float)quant_count : 0.0f;
- /* Fidelity classification */
const char *overall_fid, *overall_icon;
if (total_rmse <= 1.0e-04f) { overall_fid = "ULTRA (≤1e-04)"; overall_icon = "★★★★"; }
else if (total_rmse <= 3.0e-04f) { overall_fid = "HIGH (≤3e-04)"; overall_icon = "★★★☆"; }
@@ -3931,12 +3888,12 @@ void hexstate_init(void)
/* Quantize a single tensor's F32 data to Q2_K using HPC optimization.
*
* Parameters:
- * weights: input F32 data (must be padded to multiple of 256)
- * n_elements: number of elements (must be multiple of 256)
- * output: output buffer (must be n_elements/256 * 84 bytes)
- * out_error: pointer to receive total MSE (can be NULL)
- * opt_mode: 0=HPC, 1=MSE, 2=Hybrid (recommended)
- * verbose: 1 for per-block diagnostics
+ * weights: input F32 data (must be padded to multiple of 256)
+ * n_elements: number of elements (must be multiple of 256)
+ * output: output buffer (must be n_elements/256 * 84 bytes)
+ * out_error: pointer to receive total MSE (can be NULL)
+ * opt_mode: 0=HPC, 1=MSE, 2=Hybrid (recommended)
+ * verbose: 1 for per-block diagnostics
*/
void hexstate_quantize_tensor_q2k(const float *weights, int64_t n_elements,
void *output, float *out_error,
@@ -3967,12 +3924,12 @@ int hexstate_q2k_block_elements(void) { return QK_K; }
/* HPC-optimized Q4_0 quantization for attention tensors.
* Called from Python requantizer via ctypes.
- * weights: input F32 weights
- * n_elements: number of elements (must be multiple of 32)
- * output: output buffer (must be n_elements/32 * 18 bytes)
- * out_error: pointer to receive total MSE (can be NULL)
- * imat_importance: optional per-element importance weights
- * verbose: 1 for per-block diagnostics
+ * weights: input F32 weights
+ * n_elements: number of elements (must be multiple of 32)
+ * output: output buffer (must be n_elements/32 * 18 bytes)
+ * out_error: pointer to receive total MSE (can be NULL)
+ * imat_importance: optional per-element importance weights
+ * verbose: 1 for per-block diagnostics
*/
void hexstate_quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
void *output, float *out_error,
@@ -4220,4 +4177,4 @@ int main(int argc, char **argv)
st_multi_close(mf);
return result;
}
-#endif /* HEXSTATE_LIBRARY */
+#endif /* HEXSTATE_LIBRARY */
\ No newline at end of file