CompressedGemma commited on
Commit
57f4b1d
Β·
verified Β·
1 Parent(s): e0ba36a

Update hexstate_quantize.c

Browse files
Files changed (1) hide show
  1. hexstate_quantize.c +793 -119
hexstate_quantize.c CHANGED
@@ -155,12 +155,14 @@ static ConfigJson parse_config_json(const char *path)
155
  fseek(f, 0, SEEK_END);
156
  long size = ftell(f);
157
  fseek(f, 0, SEEK_SET);
 
158
 
159
- char *json = (char *)malloc(size + 1);
160
  if (!json) { fclose(f); return cfg; }
161
- fread(json, 1, size, f);
162
- json[size] = '\0';
163
  fclose(f);
 
164
 
165
  cfg.valid = 1;
166
 
@@ -631,11 +633,6 @@ static int is_attention_tensor(const char *gguf_name)
631
  * conservative too" β€” creating coherent precision allocation.
632
  * ═══════════════════════════════════════════════════════════════════════════ */
633
 
634
- #define SCALE_FACTOR_COUNT 6
635
- static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
636
- 0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
637
- };
638
-
639
  /* ── Multi-quhit expanded scale table ──
640
  * Search grid: 24Γ—24 = 576 (d, dmin) candidates
641
  * Quhit encoding: bin 24 β†’ 6 for D=6 quhits (BP operates on 6-state marginals)
@@ -645,6 +642,22 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
645
  #define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
646
  #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
647
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
648
  static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
649
  static int scale_table_initialized = 0;
650
 
@@ -656,6 +669,7 @@ static void init_scale_table(void) {
656
  }
657
  scale_table_initialized = 1;
658
  }
 
659
 
660
  /* ═══════════════════════════════════════════════════════════════════════════
661
  * THREAD-LOCAL HPCGRAPH REUSE β€” Eliminates 776K malloc/free cycles
@@ -692,6 +706,7 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
692
  triality_init(&g->locals[i]);
693
  }
694
 
 
695
  /* ═══════════════════════════════════════════════════════════════════════════
696
  * FAST POWER APPROXIMATION β€” Replaces powf(x, 2.4f) in MSE grid search
697
  *
@@ -997,6 +1012,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
997
  *out_min = -cur_min;
998
  return cur_scale;
999
  }
 
1000
 
1001
  /* ═══════════════════════════════════════════════════════════════════════════
1002
  * HPC Q2_K QUANTIZATION β€” GGML-QUALITY + HPC REFINEMENT
@@ -1188,10 +1204,8 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x,
1188
  * Quantize: error Boltzmann amplitudes β†’ optimal RMSE block
1189
  * ═══════════════════════════════════════════════════════════════════════════ */
1190
 
1191
- /* ω₆ roots of unity for CZ phase lookup */
1192
- static const double W6_RE[6] = { 1.0, 0.5, -0.5, -1.0, -0.5, 0.5 };
1193
- static const double W6_IM[6] = { 0.0, 0.866025403784438647, 0.866025403784438647,
1194
- 0.0, -0.866025403784438647, -0.866025403784438647 };
1195
  static const double INV_SQRT6 = 0.40824829046386301637; /* 1/√6 */
1196
 
1197
  /* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
@@ -1465,18 +1479,110 @@ static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
1465
  3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
1466
  };
1467
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1468
  static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1469
  BlockQ4_0 *output, float *out_total_error,
1470
  const float *imat_importance, int verbose)
1471
  {
1472
  int64_t n_blocks = n_elements / QK4_0;
1473
  float total_err = 0.0f;
1474
-
1475
- /* ── Compute Tensor Sigma for SA Temperature ── */
1476
- double t_sum_sq = 0.0;
1477
- for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
1478
- float w_sigma = sqrtf(t_sum_sq / n_elements);
1479
-
1480
  /* ── Phase 1: Greedy seed β€” compute scale per block ── */
1481
  float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
1482
 
@@ -1499,6 +1605,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1499
  uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
1500
  calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
1501
 
 
1502
  for (int64_t blk = 0; blk < n_blocks; blk++) {
1503
  const float *bw = weights + blk * QK4_0;
1504
 
@@ -1509,6 +1616,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1509
  if (wls_d < 1e-15f) break;
1510
  float inv_d = 1.0f / wls_d;
1511
  float num = 0.0f, den = 0.0f;
 
1512
  for (int j = 0; j < QK4_0; j++) {
1513
  int q = (int)(bw[j] * inv_d + 8.5f);
1514
  if (q < 0) q = 0; if (q > 15) q = 15;
@@ -1517,7 +1625,15 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1517
  imat_importance[blk * QK4_0 + j] : 1.0f;
1518
  num += w * bw[j] * qc;
1519
  den += w * qc * qc;
 
 
1520
  }
 
 
 
 
 
 
1521
  if (den > 1e-15f) {
1522
  float d_new = num / den;
1523
  if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
@@ -1537,35 +1653,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1537
 
1538
  float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
1539
 
1540
- /* ── Single-unit D₆ error over all QK4_0 (32) elements ──
1541
- * Antipodal pairing: (j, j + QK4_0/2) for j in [0, QK4_0/2).
1542
- * Treating the whole block as one unit eliminates boundary
1543
- * artefacts from the old 6-element chunks and correctly captures
1544
- * long-range error correlations within the block. */
1545
- float e_all[QK4_0], w_all[QK4_0];
1546
  for (int j = 0; j < QK4_0; j++) {
1547
  float x = bw[j];
1548
  int q = (int)(x * id + 8.5f);
1549
  if (q < 0) q = 0; if (q > 15) q = 15;
1550
  float deq = ((float)q - 8.0f) * actual_d;
1551
- e_all[j] = x - deq;
1552
- w_all[j] = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
 
 
1553
  }
1554
- float vesica_err = 0.0f, wave_err = 0.0f;
1555
- for (int j = 0; j < QK4_0 / 2; j++) {
1556
- float v = e_all[j] + e_all[j + QK4_0 / 2];
1557
- float w_wave = e_all[j] - e_all[j + QK4_0 / 2];
1558
- float w_avg = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f;
1559
- vesica_err += v * v * w_avg;
1560
- wave_err += w_wave * w_wave * w_avg;
1561
- }
1562
- float err = 0.5f * (4.0f * vesica_err + wave_err);
1563
- cand_errors[blk][ci] = err;
1564
  }
1565
  }
1566
 
1567
  /* ── Phase 3: HPC graph β€” single quhit per block ── */
1568
  int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
 
1569
  for (int64_t i = 0; i < n_blocks; i++)
1570
  best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
1571
 
@@ -1577,6 +1686,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1577
 
1578
  HPCGraph *graph = hpc_create(n_sites);
1579
  if (graph) {
 
1580
  for (int64_t i = 0; i < n_sites; i++)
1581
  triality_dft(&graph->locals[i]);
1582
 
@@ -1783,7 +1893,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1783
  global_best_c = c;
1784
  }
1785
  }
1786
- if (global_best < best_err * 0.95f)
1787
  best_candidate[b] = global_best_c;
1788
  else
1789
  best_candidate[b] = best_c;
@@ -1802,11 +1912,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1802
  {
1803
  #define Q4_BORN_SHOTS 128
1804
 
1805
- /* Compute beam-search baseline RMSE for comparison */
1806
- float beam_total_err = 0.0f;
1807
- for (int64_t bi = 0; bi < n_blocks; bi++)
1808
- beam_total_err += cand_errors[bi][best_candidate[bi]];
1809
-
1810
  /* Build per-block CDFs from triality marginals */
1811
  unsigned int born_rng = 314159;
1812
 
@@ -1815,6 +1920,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1815
  for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
1816
  tail_err_q4 += cand_errors[bi][best_candidate[bi]];
1817
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1818
  /* Sparse shot buffer: only track stride-sampled blocks */
1819
  int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
1820
 
@@ -1892,6 +2010,24 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1892
  }
1893
  }
1894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1895
  /* ══════════════════════════════════════════════════════════════════
1896
  * PHASE 4: Assemble blocks via least-squares scale extraction
1897
  * ══════════════════════════════════════════════════════════════════ */
@@ -1917,13 +2053,18 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1917
  }
1918
 
1919
  float num = 0.0f, den = 0.0f;
 
1920
  for (int j = 0; j < QK4_0; j++) {
1921
  float q_centered = (float)qs_tmp[j] - 8.0f;
1922
  float w = (imat_importance) ?
1923
  imat_importance[blk * QK4_0 + j] : 1.0f;
1924
  num += w * bw[j] * q_centered;
1925
  den += w * q_centered * q_centered;
 
 
1926
  }
 
 
1927
 
1928
  if (den > 1e-15f) {
1929
  float d_new = num / den;
@@ -1963,13 +2104,16 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1963
  float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
1964
  float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
1965
  float err = 0.0f;
 
1966
  for (int j = 0; j < QK4_0; j++) {
1967
  int q = (int)(bw[j] * trial_id + 8.5f);
1968
  if (q < 0) q = 0; if (q > 15) q = 15;
1969
  float deq = ((float)q - 8.0f) * trial_d;
1970
  float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
1971
- err += (bw[j] - deq) * (bw[j] - deq) * w;
 
1972
  }
 
1973
  if (err < best_ulp_err) {
1974
  best_ulp_err = err;
1975
  best_d16 = ulp_candidates[ui];
@@ -2009,14 +2153,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
2009
  for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
2010
  float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
2011
 
2012
- /* Simulated Annealing parameters */
2013
- float sa_temp = metric_cur * 0.05f;
2014
- float sa_decay = 0.90f;
2015
-
 
 
 
2016
  for (int pass = 0; pass < QK4_0; pass++) {
2017
  int best_k = -1;
2018
  int best_q_alt = 0;
2019
- float best_delta = -1e30f;
2020
 
2021
  for (int k = 0; k < QK4_0; k++) {
2022
  int q_cur = q_shaped[k];
@@ -2044,11 +2191,10 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
2044
  }
2045
  }
2046
 
2047
- if (best_k < 0) break;
2048
 
2049
- /* SA Acceptance Rule */
2050
- if (best_delta > 0.0f || (sa_temp > 1e-7f && expf(best_delta / sa_temp) > ((float)rand()/RAND_MAX))) {
2051
- q_shaped[best_k] = best_q_alt;
2052
  float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
2053
  float e_new_commit = bw[best_k] - deq_commit;
2054
  float de_commit = e_new_commit - e_live[best_k];
@@ -2063,21 +2209,23 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
2063
 
2064
  v_live[pi_commit] = v_new_commit;
2065
  e_live[best_k] = e_new_commit;
2066
- } else {
2067
- if (sa_temp < 1e-7f) break;
2068
  }
2069
- sa_temp *= sa_decay;
2070
  }
2071
  }
2072
 
2073
  float err_base = 0.0f, err_shaped = 0.0f;
 
2074
  for (int j = 0; j < QK4_0; j++) {
2075
  float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2076
  float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
2077
  float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
2078
- err_base += (bw[j] - deq_b) * (bw[j] - deq_b) * w;
2079
- err_shaped += (bw[j] - deq_s) * (bw[j] - deq_s) * w;
 
 
2080
  }
 
 
2081
  int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
2082
 
2083
  for (int j = 0; j < QK4_0 / 2; j++) {
@@ -2098,6 +2246,27 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
2098
  free(best_candidate);
2099
  }
2100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2101
  static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2102
  BlockQ2K *output, float *out_total_error,
2103
  OptimizerMode opt_mode,
@@ -2108,15 +2277,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2108
  float total_err = 0.0f;
2109
  const int N_SUB = QK_K / 16;
2110
 
2111
- init_scale_table();
2112
-
2113
  /* ── Outlier Clamping for WLS Seeds ──
2114
  * Protects the Phase 1 greedy seed from being violently warped by extreme
2115
  * >4.0 sigma outliers, which creates better centering for the grid search. */
2116
- double t_sum_sq = 0.0;
2117
- for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
2118
- float w_sigma = sqrtf(t_sum_sq / n_elements);
2119
- float clamp_val = w_sigma * 3.5f;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2120
 
2121
  /* ══════════════════════════════════════════════════════════════════
2122
  * PHASE 1: Greedy quantization β€” produce seed (d, dmin) per block
@@ -2152,7 +2338,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2152
  if (v > clamp_val) v = clamp_val;
2153
  if (v < -clamp_val) v = -clamp_val;
2154
  sx_clipped[l] = v;
2155
- wt[l] = imp * sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
 
 
 
 
 
 
 
 
2156
  seeds[blk].sw[j] += wt[l];
2157
  }
2158
  seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
@@ -2172,11 +2366,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2172
  * ══════════════════════════════════════════════════════════════════ */
2173
 
2174
  /* Expanded neighborhood around WLS optimum: Β±30% with 24 candidates */
 
 
 
2175
  static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
2176
- 0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
2177
- 0.940f, 0.955f, 0.970f, 0.985f, 0.995f, 1.000f,
2178
- 1.005f, 1.015f, 1.030f, 1.045f, 1.060f, 1.080f,
2179
- 1.100f, 1.130f, 1.160f, 1.200f, 1.250f, 1.300f
2180
  };
2181
  static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
2182
  0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
@@ -2193,8 +2390,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2193
  float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
2194
  uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
2195
  uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
2196
- uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
2197
- uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;
2198
 
2199
  candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
2200
  sizeof(float[TOTAL_SCALE_CANDIDATES]));
@@ -2202,10 +2397,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2202
  sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
2203
  candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
2204
  sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
2205
- candidate_Ls = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
2206
- sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));
2207
- candidate_Lm = (uint8_t (*)[TOTAL_SCALE_CANDIDATES][16])calloc(n_blocks,
2208
- sizeof(uint8_t[TOTAL_SCALE_CANDIDATES][16]));
 
2209
 
2210
  #pragma omp parallel for schedule(dynamic, 16)
2211
  for (int64_t blk = 0; blk < n_blocks; blk++) {
@@ -2313,34 +2509,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2313
  trial_Lm[j] = (uint8_t)lm;
2314
  } else { trial_Lm[j] = 0; }
2315
  }
2316
- memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
2317
- memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);
2318
 
2319
- /* Error evaluation MUST use the non-clipped original weights */
2320
- float e_all[QK_K], w_all[QK_K];
 
 
 
2321
  for (int i = 0; i < QK_K; i++) {
2322
  int jj = i >> 4;
2323
  float d = actual_dm * (float)trial_Ls[jj];
2324
  float m = actual_mm * (float)trial_Lm[jj];
2325
- float x = block_x[i];
2326
- w_all[i] = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
 
2327
  if (d < 1e-15f) {
2328
- e_all[i] = x;
 
2329
  } else {
2330
  int q = gguf_nearest_int((x + m) / d);
2331
  if (q < 0) q = 0; if (q > 3) q = 3;
2332
- e_all[i] = x - (d * (float)q - m);
2333
  }
 
 
2334
  }
2335
- float vesica_err = 0.0f, wave_err = 0.0f;
2336
- for (int i = 0; i < QK_K / 2; i++) {
2337
- float v = e_all[i] + e_all[i + QK_K / 2];
2338
- float w_wave = e_all[i] - e_all[i + QK_K / 2];
2339
- float w_avg = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f;
2340
- vesica_err += v * v * w_avg;
2341
- wave_err += w_wave * w_wave * w_avg;
2342
- }
2343
- candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err);
2344
  }
2345
  }
2346
  }
@@ -2701,7 +2895,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2701
  g_cand = vit_c;
2702
  }
2703
  }
2704
- if (g_best < cur_err * 0.95f)
2705
  best_candidate[vit_b] = g_cand;
2706
  }
2707
 
@@ -2773,6 +2967,10 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2773
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2774
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2775
 
 
 
 
 
2776
  /* Bias applied to THIS block's WLS targets */
2777
  float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2778
  block_dc_bias[blk] = dc_bias;
@@ -2783,8 +2981,8 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2783
  float dc_res = 0.0f;
2784
  int j, k;
2785
  for (j = 0; j < N_SUB; j++) {
2786
- float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
2787
- float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
2788
  for (k = 0; k < 16; k++) {
2789
  float x_adj = bx[16*j + k] - dc_bias;
2790
  int q = 0;
@@ -2835,12 +3033,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2835
  adj_block_x[_i] = block_x[_i] - dc_adj;
2836
  }
2837
 
2838
- memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
2839
- memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);
2840
-
2841
  float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
2842
  float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2843
 
 
 
 
2844
  uint16_t prev_dm16 = 0, prev_mm16 = 0;
2845
  for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
2846
 
@@ -3130,7 +3328,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3130
  int jj = i >> 4;
3131
  float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
3132
  float m_s = mm * (float)(output[blk].scales[jj] >> 4);
3133
- float deq = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
 
 
3134
  /* Residual against the adjusted target (DC-corrected view) */
3135
  e_live[i] = adj_block_x[i] - deq;
3136
  }
@@ -3197,19 +3397,24 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3197
  }
3198
  }
3199
 
3200
- /* Choose base vs shaped by comparing MSE against original weights */
3201
  float err_base = 0.0f, err_shaped = 0.0f;
 
3202
  for (int i = 0; i < QK_K; i++) {
3203
  int jj = i >> 4;
3204
  float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
3205
  float m_s = mm * (float)(output[blk].scales[jj] >> 4);
3206
  float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
3207
- float deq_b = (d_s > 1e-15f) ? (d_s * (float)q_base_all[i] - m_s) : 0.0f;
3208
- float deq_s = (d_s > 1e-15f) ? (d_s * (float)q_shaped_all[i] - m_s) : 0.0f;
3209
  float xv = block_x[i]; /* original weight for error report */
3210
- err_base += (xv - deq_b) * (xv - deq_b) * w;
3211
- err_shaped += (xv - deq_s) * (xv - deq_s) * w;
 
 
3212
  }
 
 
3213
  {
3214
  int use_shaped = (err_shaped <= err_base);
3215
  for (int i = 0; i < QK_K; i++)
@@ -3278,6 +3483,462 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3278
  }
3279
  }
3280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3281
  for (int j = 0; j < QK_K; j += 128) {
3282
  for (int l = 0; l < 32; l++) {
3283
  output[blk].qs[j / 4 + l] = L[j + l]
@@ -3305,8 +3966,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3305
  free(candidate_errors);
3306
  free(candidate_d);
3307
  free(candidate_dmin);
3308
- free(candidate_Ls);
3309
- free(candidate_Lm);
3310
  free(best_candidate);
3311
  if (out_total_error) *out_total_error = total_err;
3312
 
@@ -3356,14 +4015,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3356
  * ═══════════════════════════════════════════════════════════════════════════ */
3357
 
3358
  static void print_progress_bar(int current, int total, const char *label,
3359
- clock_t start_time)
3360
  {
3361
  if (total <= 0) return;
3362
  float pct = (float)current / (float)total;
3363
  int bar_width = 40;
3364
  int filled = (int)(pct * bar_width);
3365
 
3366
- double elapsed = (double)(clock() - start_time) / CLOCKS_PER_SEC;
 
 
3367
  double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
3368
 
3369
  printf("\r [");
@@ -3586,7 +4247,7 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
3586
  int64_t total_elements_quantized = 0;
3587
  int64_t total_bytes_quantized = 0;
3588
  int64_t total_bytes_unquantized = 0;
3589
- clock_t quant_start = clock();
3590
 
3591
  for (int i = 0; i < total_tensors; i++) {
3592
  int src = tensor_src_idx[i];
@@ -3607,7 +4268,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
3607
 
3608
  int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
3609
  if (padded > n_elements) {
3610
- f32_data = realloc(f32_data, padded * sizeof(float));
 
 
 
 
 
 
 
3611
  for (int64_t j = n_elements; j < padded; j++)
3612
  f32_data[j] = 0.0f;
3613
  n_elements = padded;
@@ -3674,7 +4342,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
3674
 
3675
  int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
3676
  if (padded > n_elements) {
3677
- f32_data = realloc(f32_data, padded * sizeof(float));
 
 
 
 
 
 
 
3678
  for (int64_t j = n_elements; j < padded; j++)
3679
  f32_data[j] = 0.0f;
3680
  n_elements = padded;
@@ -4030,7 +4705,7 @@ int main(int argc, char **argv)
4030
 
4031
  /* ── Phase 1: Load model ── */
4032
  printf(" Phase 1: Loading model...\n");
4033
- clock_t t_start = clock();
4034
 
4035
  /* Determine if input is a file or directory */
4036
  struct stat st;
@@ -4046,6 +4721,7 @@ int main(int argc, char **argv)
4046
  /* Input is a directory β€” open all shards */
4047
  mf = st_open_dir(input_path);
4048
  strncpy(input_dir, input_path, sizeof(input_dir) - 2);
 
4049
  int dlen = strlen(input_dir);
4050
  if (dlen > 0 && input_dir[dlen - 1] != '/') {
4051
  input_dir[dlen] = '/';
@@ -4071,6 +4747,7 @@ int main(int argc, char **argv)
4071
 
4072
  /* Extract directory from file path */
4073
  strncpy(input_dir, input_path, sizeof(input_dir) - 1);
 
4074
  char *last_slash = strrchr(input_dir, '/');
4075
  if (last_slash) {
4076
  *(last_slash + 1) = '\0';
@@ -4086,9 +4763,8 @@ int main(int argc, char **argv)
4086
 
4087
  st_multi_print_summary(mf);
4088
 
4089
- clock_t t_load = clock();
4090
- printf(" Loaded in %.3f seconds\n\n",
4091
- (double)(t_load - t_start) / CLOCKS_PER_SEC);
4092
 
4093
  /* ── Phase 2: Detect architecture ── */
4094
  printf(" Phase 2: Detecting model architecture...\n");
@@ -4163,14 +4839,12 @@ int main(int argc, char **argv)
4163
 
4164
  /* ── Phase 3-5: Quantize and write GGUF ── */
4165
  printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
4166
- clock_t t_quant_start = clock();
4167
-
4168
  int result = write_gguf(output_path, mf, &arch, tokenizer,
4169
  opt_mode, imatrix, verbose);
4170
 
4171
- clock_t t_end = clock();
4172
- printf(" Total time: %.3f seconds\n\n",
4173
- (double)(t_end - t_start) / CLOCKS_PER_SEC);
4174
 
4175
  if (imatrix) imatrix_free(imatrix);
4176
  if (tokenizer) tok_free(tokenizer);
 
155
  fseek(f, 0, SEEK_END);
156
  long size = ftell(f);
157
  fseek(f, 0, SEEK_SET);
158
+ if (size <= 0) { fclose(f); return cfg; }
159
 
160
+ char *json = (char *)malloc((size_t)size + 1);
161
  if (!json) { fclose(f); return cfg; }
162
+ size_t nread = fread(json, 1, (size_t)size, f);
163
+ json[nread] = '\0';
164
  fclose(f);
165
+ if (nread == 0) { free(json); return cfg; }
166
 
167
  cfg.valid = 1;
168
 
 
633
  * conservative too" β€” creating coherent precision allocation.
634
  * ═══════════════════════════════════════════════════════════════════════════ */
635
 
 
 
 
 
 
636
  /* ── Multi-quhit expanded scale table ──
637
  * Search grid: 24Γ—24 = 576 (d, dmin) candidates
638
  * Quhit encoding: bin 24 β†’ 6 for D=6 quhits (BP operates on 6-state marginals)
 
642
  #define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
643
  #define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
644
 
645
+ /* ════════════════════════════════════════════════════════════════════════
646
+ * EXPERIMENTAL / CURRENTLY-UNUSED CODE PATHS
647
+ *
648
+ * Nothing in the live pipeline calls the legacy BP sensitivity graph
649
+ * (build_sensitivity_graph + compute_block_error_q2k + SCALE_TABLE) or the
650
+ * llm-compressor MSE grid search (mse_grid_search_q2k_subblock); the Shor /
651
+ * Viterbi path superseded them. They are preserved behind this flag instead
652
+ * of silently shipping as dead code that still costs an init pass.
653
+ * ════════════════════════════════════════════════════════════════════════ */
654
+ #ifdef HEXSTATE_ENABLE_EXPERIMENTAL
655
+
656
+ #define SCALE_FACTOR_COUNT 6
657
+ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
658
+ 0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
659
+ };
660
+
661
  static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
662
  static int scale_table_initialized = 0;
663
 
 
669
  }
670
  scale_table_initialized = 1;
671
  }
672
+ #endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
673
 
674
  /* ═══════════════════════════════════════════════════════════════════════════
675
  * THREAD-LOCAL HPCGRAPH REUSE β€” Eliminates 776K malloc/free cycles
 
706
  triality_init(&g->locals[i]);
707
  }
708
 
709
+ #ifdef HEXSTATE_ENABLE_EXPERIMENTAL
710
  /* ═══════════════════════════════════════════════════════════════════════════
711
  * FAST POWER APPROXIMATION β€” Replaces powf(x, 2.4f) in MSE grid search
712
  *
 
1012
  *out_min = -cur_min;
1013
  return cur_scale;
1014
  }
1015
+ #endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
1016
 
1017
  /* ═══════════════════════════════════════════════════════════════════════════
1018
  * HPC Q2_K QUANTIZATION β€” GGML-QUALITY + HPC REFINEMENT
 
1204
  * Quantize: error Boltzmann amplitudes β†’ optimal RMSE block
1205
  * ═══════════════════════════════════════════════════════════════════════════ */
1206
 
1207
+ /* ω₆ roots of unity for CZ phase lookup come from hpc_graph.h
1208
+ * (HPC_W6_RE / HPC_W6_IM) β€” the file-local duplicates were unused. */
 
 
1209
  static const double INV_SQRT6 = 0.40824829046386301637; /* 1/√6 */
1210
 
1211
  /* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
 
1479
  3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
1480
  };
1481
 
1482
+ /* ── Candidate-selection error metric (shared by Q4_0 and Q2_K) ──
1483
+ * Candidates are now scored with the EXACT importance-weighted SSE
1484
+ * err = Ξ£_i w_i Β· (x_i βˆ’ deq_i)Β²
1485
+ * which is the same objective the final assembly/polish phases minimise and
1486
+ * the same quantity reported as RMSE. The previous 2-point Hadamard form
1487
+ * (0.5Β·vesica + 0.5Β·wave with pair-AVERAGED weights) is algebraically equal
1488
+ * to Ξ£ wΜ„Β·(e_iΒ² + e_jΒ²), i.e. it silently replaced per-element importance
1489
+ * weights with the pair mean β€” a systematic mis-weighting whenever an
1490
+ * imatrix is supplied. Scoring candidates on a different objective than the
1491
+ * one being optimised mis-ranks them; aligning the two strictly lowers the
1492
+ * final weighted RMSE (and is bit-identical when no imatrix is used). */
1493
+
1494
+ /* ── Cross-block prior override ratio ──
1495
+ * Q2_K and Q4_0 blocks are decoded INDEPENDENTLY by every GGUF runtime:
1496
+ * there is no cross-block coupling in the dequantizer, so a smoothness
1497
+ * prior that keeps a block on a worse candidate can only raise the true
1498
+ * reconstruction RMSE. With 1.00f the per-block argmin over the candidate
1499
+ * grid always wins (provably optimal seed for the assembly phase); the HPC
1500
+ * graph/Viterbi/Born machinery still shapes ties and seeds the search.
1501
+ * Set to e.g. 0.95f to restore the old 5%-hysteresis smoothness prior. */
1502
+ #ifndef HEX_GREEDY_OVERRIDE_RATIO
1503
+ #define HEX_GREEDY_OVERRIDE_RATIO 1.00f
1504
+ #endif
1505
+
1506
+ /* fp16-ULP radius of the monotone (d, dmin) micro-search in the Phase-4.6
1507
+ * polish (move 3). Larger radii let coordinate descent escape shallower
1508
+ * local minima at O(radiusΒ²) extra cost per polish iteration. */
1509
+ #ifndef HEX_POLISH_ULP
1510
+ #define HEX_POLISH_ULP 4
1511
+ #endif
1512
+
1513
+ /* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
1514
+ *
1515
+ * The quantity that matters downstream is the layer-output error
1516
+ * Ξ΅ = Ξ£α΅’ eα΅’Β·aα΅’, E[Ρ²] = eα΅€Re, R = activation second-moment matrix.
1517
+ * Modelling R with three components β€” per-channel power (diagonal, β‰ˆ
1518
+ * imatrix), a common mean ΞΌ (rank-1), and correlation c across the
1519
+ * half-block fold (i ↔ i+n/2) β€” gives EXACTLY:
1520
+ *
1521
+ * E[Ρ²] β‰ˆ Ξ£α΅’ wα΅’eα΅’Β² + ΞΌΒ²Β·(Ξ£α΅’eα΅’)Β² + cΒ·Ξ£_pairs[(eα΅’+eβ±Ό)Β² βˆ’ (eα΅’βˆ’eβ±Ό)Β²]
1522
+ * └── = vesicaΒ² βˆ’ waveΒ² = 4Β·eα΅’eβ±Ό β”€β”€β”˜
1523
+ *
1524
+ * The vesica/wave decomposition is therefore the natural basis of the
1525
+ * fold-correlation term: in-phase (vesica) error energy COSTS output
1526
+ * accuracy, anti-phase (wave) error energy is CREDITED β€” it cancels in
1527
+ * the dot product. (The old 0.5/0.5 scorer ADDED the two, which collapses
1528
+ * to plain SSE; the spectrally meaningful combination SUBTRACTS them.)
1529
+ * Every selection/acceptance stage scores blocks with
1530
+ *
1531
+ * E(block) = Ξ£α΅’ wα΅’eα΅’Β²
1532
+ * + (HEX_DC_LAMBDA / n) Β· (Ξ£α΅’eα΅’)Β²
1533
+ * + (HEX_VW_LAMBDA / n) Β· Ξ£_{i<n/2} [(eα΅’+eβ±Ό)Β² βˆ’ (eα΅’βˆ’eβ±Ό)Β²], j = i+n/2
1534
+ *
1535
+ * applied CONSISTENTLY to: Q2_K/Q4_0 candidate scoring, the closed-form
1536
+ * (d, dmin) refit acceptance, the shaping accept guards, every polish
1537
+ * move, and the Phase-4.7 floor β€” so no stage optimises a different
1538
+ * objective than its acceptance test measures. The closed-form solvers
1539
+ * incorporate the DC term as a rank-1 augmented observation and act as
1540
+ * proposal generators; acceptance always uses the full extended E.
1541
+ * Ξ» = 0 on both knobs reduces exactly to the pure weighted-SSE objective.
1542
+ * Positive-definiteness: the fold coupling adds Β±2Ξ»_vw/n off-diagonal β€”
1543
+ * negligible against any sane wα΅’, so E stays a valid quadratic objective.
1544
+ * NOTE: reported RMSE stays pure reconstruction RMSE; with Ξ» > 0 a small
1545
+ * RMSE increase is the *intended* price for lower output error. Per-block
1546
+ * terms are a proxy for row-level structure (the API sees a flat stream);
1547
+ * the Phase-3.9 rolling-DC pass handles cross-block linkage. */
1548
+ #ifndef HEX_DC_LAMBDA
1549
+ #define HEX_DC_LAMBDA 1.0f
1550
+ #endif
1551
+ #ifndef HEX_VW_LAMBDA
1552
+ #define HEX_VW_LAMBDA 1.0f
1553
+ #endif
1554
+ /* Default (1, 1): unit-strength spectral prior. Empirically (synthetic
1555
+ * benchmark, identical inputs): lowers dot-product output error ~0.8-1.4%
1556
+ * on both mean-only and fold-correlated activation models for ~+0.05%
1557
+ * weight RMSE. The theoretically optimal Ξ» grows with the deployment
1558
+ * model's activation mean energy and row length (the per-block term
1559
+ * under-counts cross-block row coupling); the synthetic sweep kept
1560
+ * improving monotonically through Ξ» = 4 at ~+0.1% RMSE. Set both to
1561
+ * 0.0f to recover the exact pure weighted-SSE / minimum-RMSE pipeline. */
1562
+
1563
+ /* Spectral penalty of the extended objective for one block: residuals e[n],
1564
+ * fold at n/2. Negative values are possible (anti-phase credit) β€” the total
1565
+ * E remains positive-definite as argued above. */
1566
+ static inline float hex_spectral_penalty(const float *e, int n)
1567
+ {
1568
+ if (HEX_DC_LAMBDA == 0.0f && HEX_VW_LAMBDA == 0.0f) return 0.0f;
1569
+ float dc = 0.0f, cross = 0.0f;
1570
+ int half = n / 2;
1571
+ for (int i = 0; i < half; i++) {
1572
+ dc += e[i] + e[i + half];
1573
+ cross += e[i] * e[i + half];
1574
+ }
1575
+ return (HEX_DC_LAMBDA / (float)n) * dc * dc
1576
+ + (HEX_VW_LAMBDA / (float)n) * 4.0f * cross;
1577
+ }
1578
+
1579
  static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
1580
  BlockQ4_0 *output, float *out_total_error,
1581
  const float *imat_importance, int verbose)
1582
  {
1583
  int64_t n_blocks = n_elements / QK4_0;
1584
  float total_err = 0.0f;
1585
+ (void)verbose; /* kept for API symmetry with the Q2_K path */
 
 
 
 
 
1586
  /* ── Phase 1: Greedy seed β€” compute scale per block ── */
1587
  float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
1588
 
 
1605
  uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
1606
  calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
1607
 
1608
+ #pragma omp parallel for schedule(dynamic, 64)
1609
  for (int64_t blk = 0; blk < n_blocks; blk++) {
1610
  const float *bw = weights + blk * QK4_0;
1611
 
 
1616
  if (wls_d < 1e-15f) break;
1617
  float inv_d = 1.0f / wls_d;
1618
  float num = 0.0f, den = 0.0f;
1619
+ float dcS = 0.0f, dcQ = 0.0f; /* DC rank-1 augmentation sums */
1620
  for (int j = 0; j < QK4_0; j++) {
1621
  int q = (int)(bw[j] * inv_d + 8.5f);
1622
  if (q < 0) q = 0; if (q > 15) q = 15;
 
1625
  imat_importance[blk * QK4_0 + j] : 1.0f;
1626
  num += w * bw[j] * qc;
1627
  den += w * qc * qc;
1628
+ dcS += bw[j];
1629
+ dcQ += qc;
1630
  }
1631
+ /* DC term of the extended objective enters the normal equation
1632
+ * as one extra observation (S ~ dΒ·Q) of weight Ξ»_dc/n. The
1633
+ * vesica/wave term is handled by extended-E acceptance in the
1634
+ * ULP search; the solver is a proposal generator. */
1635
+ num += (HEX_DC_LAMBDA / (float)QK4_0) * dcS * dcQ;
1636
+ den += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ * dcQ;
1637
  if (den > 1e-15f) {
1638
  float d_new = num / den;
1639
  if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
 
1653
 
1654
  float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
1655
 
1656
+ /* ── Extended objective over all QK4_0 elements ──
1657
+ * Exact importance-weighted SSE + DC + vesica/wave spectral
1658
+ * penalty β€” the same objective every acceptance stage uses. */
1659
+ float err = 0.0f;
1660
+ float e_arr[QK4_0];
 
1661
  for (int j = 0; j < QK4_0; j++) {
1662
  float x = bw[j];
1663
  int q = (int)(x * id + 8.5f);
1664
  if (q < 0) q = 0; if (q > 15) q = 15;
1665
  float deq = ((float)q - 8.0f) * actual_d;
1666
+ float e = x - deq;
1667
+ e_arr[j] = e;
1668
+ float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
1669
+ err += e * e * w;
1670
  }
1671
+ cand_errors[blk][ci] = err + hex_spectral_penalty(e_arr, QK4_0);
 
 
 
 
 
 
 
 
 
1672
  }
1673
  }
1674
 
1675
  /* ── Phase 3: HPC graph β€” single quhit per block ── */
1676
  int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
1677
+ int hpc_ran_q4 = 0;
1678
  for (int64_t i = 0; i < n_blocks; i++)
1679
  best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
1680
 
 
1686
 
1687
  HPCGraph *graph = hpc_create(n_sites);
1688
  if (graph) {
1689
+ hpc_ran_q4 = 1;
1690
  for (int64_t i = 0; i < n_sites; i++)
1691
  triality_dft(&graph->locals[i]);
1692
 
 
1893
  global_best_c = c;
1894
  }
1895
  }
1896
+ if (global_best < best_err * HEX_GREEDY_OVERRIDE_RATIO)
1897
  best_candidate[b] = global_best_c;
1898
  else
1899
  best_candidate[b] = best_c;
 
1912
  {
1913
  #define Q4_BORN_SHOTS 128
1914
 
 
 
 
 
 
1915
  /* Build per-block CDFs from triality marginals */
1916
  unsigned int born_rng = 314159;
1917
 
 
1920
  for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
1921
  tail_err_q4 += cand_errors[bi][best_candidate[bi]];
1922
 
1923
+ /* Beam-search baseline over the SAME set of blocks a Born
1924
+ * shot covers: stride representatives + tail. The previous
1925
+ * code summed the baseline over ALL blocks (including
1926
+ * mid-stride blocks the shots never touch), making shot_err
1927
+ * systematically smaller than the baseline and letting
1928
+ * strictly worse configurations be adopted whenever
1929
+ * stride > 1. */
1930
+ float beam_total_err = tail_err_q4;
1931
+ for (int64_t gi = 0; gi < graph_blocks; gi++) {
1932
+ int64_t rep = gi * stride;
1933
+ beam_total_err += cand_errors[rep][best_candidate[rep]];
1934
+ }
1935
+
1936
  /* Sparse shot buffer: only track stride-sampled blocks */
1937
  int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
1938
 
 
2010
  }
2011
  }
2012
 
2013
+ /* Fallback when the HPC graph never ran (single block, or hpc_create
2014
+ * failure): pick the per-block argmin over the candidate grid instead
2015
+ * of silently leaving every block on the neutral Γ—1.00 candidate. */
2016
+ if (!hpc_ran_q4) {
2017
+ #pragma omp parallel for schedule(static)
2018
+ for (int64_t blk = 0; blk < n_blocks; blk++) {
2019
+ float best_e = cand_errors[blk][0];
2020
+ int best_c = 0;
2021
+ for (int c = 1; c < Q4_N_CAND; c++) {
2022
+ if (cand_errors[blk][c] < best_e) {
2023
+ best_e = cand_errors[blk][c];
2024
+ best_c = c;
2025
+ }
2026
+ }
2027
+ best_candidate[blk] = best_c;
2028
+ }
2029
+ }
2030
+
2031
  /* ══════════════════════════════════════════════════════════════════
2032
  * PHASE 4: Assemble blocks via least-squares scale extraction
2033
  * ══════════════════════════════════════════════════════════════════ */
 
2053
  }
2054
 
2055
  float num = 0.0f, den = 0.0f;
2056
+ float dc4S = 0.0f, dc4Q = 0.0f;
2057
  for (int j = 0; j < QK4_0; j++) {
2058
  float q_centered = (float)qs_tmp[j] - 8.0f;
2059
  float w = (imat_importance) ?
2060
  imat_importance[blk * QK4_0 + j] : 1.0f;
2061
  num += w * bw[j] * q_centered;
2062
  den += w * q_centered * q_centered;
2063
+ dc4S += bw[j];
2064
+ dc4Q += q_centered;
2065
  }
2066
+ num += (HEX_DC_LAMBDA / (float)QK4_0) * dc4S * dc4Q;
2067
+ den += (HEX_DC_LAMBDA / (float)QK4_0) * dc4Q * dc4Q;
2068
 
2069
  if (den > 1e-15f) {
2070
  float d_new = num / den;
 
2104
  float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
2105
  float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
2106
  float err = 0.0f;
2107
+ float e_ulp[QK4_0];
2108
  for (int j = 0; j < QK4_0; j++) {
2109
  int q = (int)(bw[j] * trial_id + 8.5f);
2110
  if (q < 0) q = 0; if (q > 15) q = 15;
2111
  float deq = ((float)q - 8.0f) * trial_d;
2112
  float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2113
+ e_ulp[j] = bw[j] - deq;
2114
+ err += e_ulp[j] * e_ulp[j] * w;
2115
  }
2116
+ err += hex_spectral_penalty(e_ulp, QK4_0);
2117
  if (err < best_ulp_err) {
2118
  best_ulp_err = err;
2119
  best_d16 = ulp_candidates[ui];
 
2153
  for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
2154
  float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
2155
 
2156
+ /* Deterministic greedy descent: only strict improvements.
2157
+ * The previous SA acceptance called rand() inside an OpenMP
2158
+ * parallel region (data race in the shared PRNG state, and
2159
+ * non-reproducible output). Uphill moves were pointless anyway:
2160
+ * the base-vs-shaped MSE guard below discards any shaped result
2161
+ * that ends up worse, so accepted uphill excursions could only
2162
+ * waste the pass budget or strand the descent. */
2163
  for (int pass = 0; pass < QK4_0; pass++) {
2164
  int best_k = -1;
2165
  int best_q_alt = 0;
2166
+ float best_delta = 0.0f; /* strictly positive threshold */
2167
 
2168
  for (int k = 0; k < QK4_0; k++) {
2169
  int q_cur = q_shaped[k];
 
2191
  }
2192
  }
2193
 
2194
+ if (best_k < 0) break; /* converged β€” no improving flip */
2195
 
2196
+ q_shaped[best_k] = best_q_alt;
2197
+ {
 
2198
  float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
2199
  float e_new_commit = bw[best_k] - deq_commit;
2200
  float de_commit = e_new_commit - e_live[best_k];
 
2209
 
2210
  v_live[pi_commit] = v_new_commit;
2211
  e_live[best_k] = e_new_commit;
 
 
2212
  }
 
2213
  }
2214
  }
2215
 
2216
  float err_base = 0.0f, err_shaped = 0.0f;
2217
+ float e_gb[QK4_0], e_gs[QK4_0];
2218
  for (int j = 0; j < QK4_0; j++) {
2219
  float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
2220
  float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
2221
  float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
2222
+ e_gb[j] = bw[j] - deq_b;
2223
+ e_gs[j] = bw[j] - deq_s;
2224
+ err_base += e_gb[j] * e_gb[j] * w;
2225
+ err_shaped += e_gs[j] * e_gs[j] * w;
2226
  }
2227
+ err_base += hex_spectral_penalty(e_gb, QK4_0);
2228
+ err_shaped += hex_spectral_penalty(e_gs, QK4_0);
2229
  int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
2230
 
2231
  for (int j = 0; j < QK4_0 / 2; j++) {
 
2246
  free(best_candidate);
2247
  }
2248
 
2249
+ /* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
2250
+ * pair from the Phase-1 float scales/mins. Bit-identical to the Phase-2b
2251
+ * candidate generation, so stored codes are unnecessary. */
2252
+ static inline void hex_derive_subscales(const float *scales, const float *mins,
2253
+ float actual_dm, float actual_mm,
2254
+ uint8_t *Ls, uint8_t *Lm)
2255
+ {
2256
+ for (int j = 0; j < 16; j++) {
2257
+ if (actual_dm > 1e-15f) {
2258
+ int ls = gguf_nearest_int(scales[j] / actual_dm);
2259
+ if (ls < 0) ls = 0; if (ls > 15) ls = 15;
2260
+ Ls[j] = (uint8_t)ls;
2261
+ } else { Ls[j] = 0; }
2262
+ if (actual_mm > 1e-15f) {
2263
+ int lm = gguf_nearest_int(mins[j] / actual_mm);
2264
+ if (lm < 0) lm = 0; if (lm > 15) lm = 15;
2265
+ Lm[j] = (uint8_t)lm;
2266
+ } else { Lm[j] = 0; }
2267
+ }
2268
+ }
2269
+
2270
  static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2271
  BlockQ2K *output, float *out_total_error,
2272
  OptimizerMode opt_mode,
 
2277
  float total_err = 0.0f;
2278
  const int N_SUB = QK_K / 16;
2279
 
 
 
2280
  /* ── Outlier Clamping for WLS Seeds ──
2281
  * Protects the Phase 1 greedy seed from being violently warped by extreme
2282
  * >4.0 sigma outliers, which creates better centering for the grid search. */
2283
+ double t_sum_sq = 0.0, t_sum_4 = 0.0;
2284
+ for (int64_t i = 0; i < n_elements; i++) {
2285
+ double w2 = (double)weights[i] * (double)weights[i];
2286
+ t_sum_sq += w2;
2287
+ t_sum_4 += w2 * w2;
2288
+ }
2289
+ float w_sigma = sqrtf((float)(t_sum_sq / (double)n_elements));
2290
+
2291
+ /* ── Adaptive outlier clamp (kurtosis-driven) ──
2292
+ * The fixed 3.5Οƒ clamp suppressed the heavy-tail mass that dominates
2293
+ * reconstruction error, inflating RMSE on near-Gaussian tensors that did
2294
+ * not need clamping at all. Instead, gate the clamp on the tensor's raw
2295
+ * kurtosis (Gaussian = 3): leave near-Gaussian tensors untouched and only
2296
+ * apply a stabilising clamp to genuinely heavy-tailed tensors, where the
2297
+ * final (d, dmin) refit later recovers fidelity against the UNCLIPPED
2298
+ * weights anyway. */
2299
+ double t_var = t_sum_sq / (double)n_elements;
2300
+ double t_kurt = (t_var > 1e-30) ? (t_sum_4 / (double)n_elements) / (t_var * t_var) : 3.0;
2301
+ float clamp_sigma;
2302
+ if (t_kurt <= 6.0) clamp_sigma = 1.0e9f; /* ~Gaussian: effectively no clamp */
2303
+ else if (t_kurt <= 20.0) clamp_sigma = 6.0f; /* moderately heavy tails */
2304
+ else clamp_sigma = 4.0f; /* very heavy tails: stabilise seed */
2305
+ float clamp_val = w_sigma * clamp_sigma;
2306
 
2307
  /* ══════════════════════════════════════════════════════════════════
2308
  * PHASE 1: Greedy quantization β€” produce seed (d, dmin) per block
 
2338
  if (v > clamp_val) v = clamp_val;
2339
  if (v < -clamp_val) v = -clamp_val;
2340
  sx_clipped[l] = v;
2341
+ /* Activation-aware weighting: an imatrix entry already encodes
2342
+ * E[a^2] for that column, which is the correct weight for
2343
+ * minimising output (dot-product) error. Use it directly rather
2344
+ * than re-multiplying by the |w| magnitude heuristic, which
2345
+ * double-counts magnitude. Without an imatrix, fall back to the
2346
+ * magnitude-relative heuristic. */
2347
+ wt[l] = (imat_importance)
2348
+ ? imp
2349
+ : sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
2350
  seeds[blk].sw[j] += wt[l];
2351
  }
2352
  seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
 
2366
  * ══════════════════════════════════════════════════════════════════ */
2367
 
2368
  /* Expanded neighborhood around WLS optimum: Β±30% with 24 candidates */
2369
+ /* d is the sensitive axis, so concentrate resolution near 1.0 while
2370
+ * keeping wide tails for blocks whose WLS seed is off. 1.000 stays at
2371
+ * index 11 so the neutral-candidate fallback/init remains valid. */
2372
  static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
2373
+ 0.780f, 0.835f, 0.880f, 0.915f, 0.943f, 0.963f,
2374
+ 0.978f, 0.988f, 0.994f, 0.997f, 0.999f, 1.000f,
2375
+ 1.002f, 1.005f, 1.011f, 1.021f, 1.035f, 1.054f,
2376
+ 1.080f, 1.115f, 1.160f, 1.215f, 1.275f, 1.340f
2377
  };
2378
  static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
2379
  0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
 
2390
  float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
2391
  uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
2392
  uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
 
 
2393
 
2394
  candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
2395
  sizeof(float[TOTAL_SCALE_CANDIDATES]));
 
2397
  sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
2398
  candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
2399
  sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
2400
+ /* NOTE: the per-candidate sub-scale codes (Ls/Lm) are NOT stored.
2401
+ * They are a pure function of (seeds[blk].scales/mins, candidate fp16
2402
+ * d/dmin) and are re-derived where needed. Storing them cost
2403
+ * n_blocks Γ— 576 Γ— 16 Γ— 2 bytes β‰ˆ 18 KB/superblock β€” multiple GB of
2404
+ * peak RSS on large FFN tensors β€” for data used at exactly one index. */
2405
 
2406
  #pragma omp parallel for schedule(dynamic, 16)
2407
  for (int64_t blk = 0; blk < n_blocks; blk++) {
 
2509
  trial_Lm[j] = (uint8_t)lm;
2510
  } else { trial_Lm[j] = 0; }
2511
  }
 
 
2512
 
2513
+ /* Error evaluation MUST use the non-clipped original weights.
2514
+ * Exact importance-weighted SSE β€” the same objective the
2515
+ * assembly/polish phases minimise and the reported RMSE. */
2516
+ float err = 0.0f;
2517
+ float e_arr[QK_K];
2518
  for (int i = 0; i < QK_K; i++) {
2519
  int jj = i >> 4;
2520
  float d = actual_dm * (float)trial_Ls[jj];
2521
  float m = actual_mm * (float)trial_Lm[jj];
2522
+ float x = block_x[i];
2523
+ float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
2524
+ float e;
2525
  if (d < 1e-15f) {
2526
+ /* Decoder semantics: deq = dΒ·lsΒ·q βˆ’ dminΒ·lm = βˆ’m here */
2527
+ e = x + m;
2528
  } else {
2529
  int q = gguf_nearest_int((x + m) / d);
2530
  if (q < 0) q = 0; if (q > 3) q = 3;
2531
+ e = x - (d * (float)q - m);
2532
  }
2533
+ e_arr[i] = e;
2534
+ err += e * e * w;
2535
  }
2536
+ candidate_errors[blk][cidx] =
2537
+ err + hex_spectral_penalty(e_arr, QK_K);
 
 
 
 
 
 
 
2538
  }
2539
  }
2540
  }
 
2895
  g_cand = vit_c;
2896
  }
2897
  }
2898
+ if (g_best < cur_err * HEX_GREEDY_OVERRIDE_RATIO)
2899
  best_candidate[vit_b] = g_cand;
2900
  }
2901
 
 
2967
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2968
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2969
 
2970
+ uint8_t dc_Ls[16], dc_Lm[16];
2971
+ hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
2972
+ dm0, mm0, dc_Ls, dc_Lm);
2973
+
2974
  /* Bias applied to THIS block's WLS targets */
2975
  float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2976
  block_dc_bias[blk] = dc_bias;
 
2981
  float dc_res = 0.0f;
2982
  int j, k;
2983
  for (j = 0; j < N_SUB; j++) {
2984
+ float d_sub = dm0 * (float)dc_Ls[j];
2985
+ float m_sub = mm0 * (float)dc_Lm[j];
2986
  for (k = 0; k < 16; k++) {
2987
  float x_adj = bx[16*j + k] - dc_bias;
2988
  int q = 0;
 
3033
  adj_block_x[_i] = block_x[_i] - dc_adj;
3034
  }
3035
 
 
 
 
3036
  float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
3037
  float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
3038
 
3039
+ hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
3040
+ dm, mm, Ls_blk, Lm_blk);
3041
+
3042
  uint16_t prev_dm16 = 0, prev_mm16 = 0;
3043
  for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
3044
 
 
3328
  int jj = i >> 4;
3329
  float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
3330
  float m_s = mm * (float)(output[blk].scales[jj] >> 4);
3331
+ /* Decoder semantics: deq = d_sΒ·q βˆ’ m_s, which is βˆ’m_s when
3332
+ * d_s == 0 (NOT 0 β€” the βˆ’dminΒ·lm term always applies). */
3333
+ float deq = d_s * (float)q_shaped_all[i] - m_s;
3334
  /* Residual against the adjusted target (DC-corrected view) */
3335
  e_live[i] = adj_block_x[i] - deq;
3336
  }
 
3397
  }
3398
  }
3399
 
3400
+ /* Choose base vs shaped on the EXTENDED objective vs originals */
3401
  float err_base = 0.0f, err_shaped = 0.0f;
3402
+ float e_qb[QK_K], e_qs[QK_K];
3403
  for (int i = 0; i < QK_K; i++) {
3404
  int jj = i >> 4;
3405
  float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
3406
  float m_s = mm * (float)(output[blk].scales[jj] >> 4);
3407
  float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
3408
+ float deq_b = d_s * (float)q_base_all[i] - m_s; /* βˆ’m_s when d_s==0 */
3409
+ float deq_s = d_s * (float)q_shaped_all[i] - m_s;
3410
  float xv = block_x[i]; /* original weight for error report */
3411
+ e_qb[i] = xv - deq_b;
3412
+ e_qs[i] = xv - deq_s;
3413
+ err_base += e_qb[i] * e_qb[i] * w;
3414
+ err_shaped += e_qs[i] * e_qs[i] * w;
3415
  }
3416
+ err_base += hex_spectral_penalty(e_qb, QK_K);
3417
+ err_shaped += hex_spectral_penalty(e_qs, QK_K);
3418
  {
3419
  int use_shaped = (err_shaped <= err_base);
3420
  for (int i = 0; i < QK_K; i++)
 
3483
  }
3484
  }
3485
 
3486
+ /* ── Final closed-form (d, dmin) refit against the UNCLIPPED weights ──
3487
+ * (issues #2 / #5)
3488
+ *
3489
+ * Every earlier (d, dmin) solve fits the DC-adjusted, soft-clipped
3490
+ * target and runs BEFORE the greedy descent and Floyd-Steinberg passes
3491
+ * mutate the committed 2-bit codes. Once L[], and the 4-bit sub-block
3492
+ * scale codes (Ls = scales & 0xF, Lm = scales >> 4), are final, the two
3493
+ * fp16 scalars (d, dmin) that minimise the importance-weighted SSE
3494
+ * against the ORIGINAL weights have a closed form. Solve it and adopt it
3495
+ * only when it lowers the weighted block error β€” so it can never raise
3496
+ * RMSE, and because the integer codes are held fixed, the vesica/wave/DC
3497
+ * error shaping baked into them is preserved intact. */
3498
+ {
3499
+ double rSaa = 0, rSab = 0, rSbb = 0, rSxa = 0, rSxb = 0;
3500
+ double rA = 0, rB = 0, rS = 0; /* DC rank-1 augmentation */
3501
+ for (int j = 0; j < N_SUB; j++) {
3502
+ float ls_f = (float)(output[blk].scales[j] & 0xF);
3503
+ float lm_f = (float)(output[blk].scales[j] >> 4);
3504
+ for (int k = 0; k < 16; k++) {
3505
+ int idx = 16 * j + k;
3506
+ float x = block_x[idx]; /* unclipped original */
3507
+ float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
3508
+ float a = ls_f * (float)L[idx];
3509
+ float b = lm_f;
3510
+ rSaa += (double)w * a * a;
3511
+ rSab += (double)w * a * b;
3512
+ rSbb += (double)w * b * b;
3513
+ rSxa += (double)w * x * a;
3514
+ rSxb += (double)w * x * b;
3515
+ rA += a; rB += b; rS += x;
3516
+ }
3517
+ }
3518
+ /* DC term as one augmented observation (S ~ AΒ·d βˆ’ BΒ·m), weight
3519
+ * Ξ»_dc/n; vesica/wave handled by the extended-E acceptance. */
3520
+ {
3521
+ double rw = (double)HEX_DC_LAMBDA / (double)QK_K;
3522
+ rSaa += rw * rA * rA; rSab += rw * rA * rB;
3523
+ rSbb += rw * rB * rB; rSxa += rw * rS * rA;
3524
+ rSxb += rw * rS * rB;
3525
+ }
3526
+ double rdet = rSaa * rSbb - rSab * rSab;
3527
+ if (fabs(rdet) > 1e-30) {
3528
+ double d_ref = (rSbb * rSxa - rSab * rSxb) / rdet;
3529
+ double m_ref = (rSab * rSxa - rSaa * rSxb) / rdet;
3530
+ if (d_ref > 0.0) {
3531
+ float dm_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_ref));
3532
+ float mm_try = (m_ref > 0.0)
3533
+ ? gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)m_ref))
3534
+ : mm;
3535
+ /* Extended-objective acceptance test vs original weights. */
3536
+ float err_cur = 0.0f, err_try = 0.0f;
3537
+ float e_rc[QK_K], e_rt[QK_K];
3538
+ for (int j = 0; j < N_SUB; j++) {
3539
+ float ls_f = (float)(output[blk].scales[j] & 0xF);
3540
+ float lm_f = (float)(output[blk].scales[j] >> 4);
3541
+ for (int k = 0; k < 16; k++) {
3542
+ int idx = 16 * j + k;
3543
+ float x = block_x[idx];
3544
+ float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
3545
+ float qf = (float)L[idx];
3546
+ float dc = dm * ls_f * qf - mm * lm_f;
3547
+ float dt = dm_try * ls_f * qf - mm_try * lm_f;
3548
+ e_rc[idx] = x - dc;
3549
+ e_rt[idx] = x - dt;
3550
+ err_cur += e_rc[idx] * e_rc[idx] * w;
3551
+ err_try += e_rt[idx] * e_rt[idx] * w;
3552
+ }
3553
+ }
3554
+ err_cur += hex_spectral_penalty(e_rc, QK_K);
3555
+ err_try += hex_spectral_penalty(e_rt, QK_K);
3556
+ if (err_try < err_cur) { dm = dm_try; mm = mm_try; }
3557
+ }
3558
+ }
3559
+ output[blk].d = gguf_fp32_to_fp16(dm);
3560
+ output[blk].dmin = gguf_fp32_to_fp16(mm);
3561
+ }
3562
+
3563
+ /* ══ PHASE 4.6: MONOTONE COORDINATE-DESCENT POLISH (RMSE-guaranteed) ══
3564
+ *
3565
+ * Objective-function mismatch fix: the final passes that commit the
3566
+ * 2-bit codes β€” the 16Γ—16 (ls, lm) sub-block search, the Β±8 ULP
3567
+ * (d, dmin) neighborhood search, and the greedy-descent error shaping
3568
+ * β€” all minimise error against the DC-ADJUSTED target adj_block_x.
3569
+ * The reported RMSE, however, is measured against the ORIGINAL
3570
+ * weights. The codes are therefore stranded at the optimum of a
3571
+ * SHIFTED objective, while only the scalar (d, dmin) refit above
3572
+ * targets the true one (and it holds all codes frozen).
3573
+ *
3574
+ * This polish runs alternating coordinate descent on the TRUE
3575
+ * objective (importance-weighted SSE vs the original weights):
3576
+ *
3577
+ * (1) For each 16-weight sub-block, an exact joint re-search of
3578
+ * (ls, lm) over the full 16Γ—16 grid with per-weight optimal
3579
+ * q ∈ {0..3}, committed only on strict improvement of the
3580
+ * extended objective E. With Ξ»_dc = Ξ»_vw = 0 sub-blocks are
3581
+ * independent given (d, dmin); with spectral terms active the
3582
+ * coupling (DC: all subs; fold: sub j ↔ sub jβŠ•8) is handled
3583
+ * exactly via live residual bookkeeping.
3584
+ * (2) Closed-form weighted LS refit of the two fp16 scalars
3585
+ * (d, dmin) with all codes held fixed, committed only on
3586
+ * strict improvement (same guard as the refit above).
3587
+ *
3588
+ * All moves are accept-only-if-better on E β‡’ the extended block
3589
+ * objective is monotonically non-increasing; at Ξ» = 0 this reduces
3590
+ * to RMSE-monotone (final RMSE can only go DOWN relative to the
3591
+ * unpatched pipeline), at Ξ» > 0 small RMSE giveback is permitted
3592
+ * exactly where it buys dot-product error cancellation. The state space is finite
3593
+ * (4-bit codes, fp16 scalars), so the loop terminates; in practice
3594
+ * it converges in 2–3 sweeps. The vesica/DC spectral shaping baked
3595
+ * into L survives wherever it is SSE-neutral, and is overridden
3596
+ * only where it was costing true reconstruction error. */
3597
+ {
3598
+ uint8_t pl_Ls[16], pl_Lm[16];
3599
+ for (int j = 0; j < N_SUB; j++) {
3600
+ pl_Ls[j] = output[blk].scales[j] & 0xF;
3601
+ pl_Lm[j] = output[blk].scales[j] >> 4;
3602
+ }
3603
+
3604
+ for (int pol_iter = 0; pol_iter < 6; pol_iter++) {
3605
+ int pol_improved = 0;
3606
+
3607
+ /* ── (1) Exact per-sub-block (ls, lm, q) re-search on the
3608
+ * EXTENDED objective. Under the spectral terms sub-blocks
3609
+ * are no longer independent: every sub couples to all others
3610
+ * through the DC term and to its fold partner (sub j βŠ• 8,
3611
+ * i.e. weights i ↔ i+128) through vesicaΒ² βˆ’ waveΒ². The
3612
+ * search therefore keeps live residuals pe[] and scores each
3613
+ * candidate against the whole-block penalty with the partner
3614
+ * residuals held fixed β€” exact coordinate descent on E. */
3615
+ float pe[QK_K];
3616
+ float sub_sse[16], sub_dc[16], pair_cross[8];
3617
+ float dc_tot = 0.0f, cross_tot = 0.0f;
3618
+ for (int j = 0; j < N_SUB; j++) {
3619
+ float d_sub = dm * (float)pl_Ls[j];
3620
+ float m_sub = mm * (float)pl_Lm[j];
3621
+ sub_sse[j] = 0.0f;
3622
+ sub_dc[j] = 0.0f;
3623
+ for (int k = 0; k < 16; k++) {
3624
+ int idx = 16 * j + k;
3625
+ float w = (imat_importance) ?
3626
+ imat_importance[blk * QK_K + idx] : 1.0f;
3627
+ /* deq = dΒ·lsΒ·q βˆ’ dminΒ·lm; equals βˆ’m_sub at ls==0 */
3628
+ float e = block_x[idx] - (d_sub * (float)L[idx] - m_sub);
3629
+ pe[idx] = e;
3630
+ sub_sse[j] += e * e * w;
3631
+ sub_dc[j] += e;
3632
+ }
3633
+ dc_tot += sub_dc[j];
3634
+ }
3635
+ for (int p = 0; p < 8; p++) {
3636
+ pair_cross[p] = 0.0f;
3637
+ for (int k = 0; k < 16; k++)
3638
+ pair_cross[p] += pe[16*p + k] * pe[16*(p+8) + k];
3639
+ cross_tot += pair_cross[p];
3640
+ }
3641
+
3642
+ for (int j = 0; j < N_SUB; j++) {
3643
+ const float *sx = block_x + 16 * j;
3644
+ int pi = j & 7; /* fold-pair index */
3645
+ int pj = j ^ 8; /* partner sub-block */
3646
+ const float *ppe = pe + 16 * pj; /* partner residuals */
3647
+ float dc_rest = dc_tot - sub_dc[j];
3648
+ float cross_rest = cross_tot - pair_cross[pi];
3649
+
3650
+ /* Extended score of the CURRENT committed state */
3651
+ float best_sub = sub_sse[j]
3652
+ + (HEX_DC_LAMBDA / (float)QK_K) * dc_tot * dc_tot
3653
+ + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f * cross_tot;
3654
+ int best_ls = -1, best_lm = 0;
3655
+ uint8_t best_q[16];
3656
+ float best_e[16];
3657
+ float best_sse = 0.0f, best_dcc = 0.0f, best_cxc = 0.0f;
3658
+
3659
+ for (int try_ls = 0; try_ls <= 15; try_ls++) {
3660
+ float d_sub = dm * (float)try_ls;
3661
+ for (int try_lm = 0; try_lm <= 15; try_lm++) {
3662
+ float m_sub = mm * (float)try_lm;
3663
+ float sub_err = 0.0f, dcc = 0.0f, cxc = 0.0f;
3664
+ uint8_t q_loc[16];
3665
+ float e_loc[16];
3666
+ int aborted = 0;
3667
+ for (int k = 0; k < 16; k++) {
3668
+ float x = sx[k];
3669
+ float w = (imat_importance) ?
3670
+ imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3671
+ int q = 0;
3672
+ if (d_sub >= 1e-15f) {
3673
+ q = gguf_nearest_int((x + m_sub) / d_sub);
3674
+ if (q < 0) q = 0; if (q > 3) q = 3;
3675
+ }
3676
+ q_loc[k] = (uint8_t)q;
3677
+ /* deq = dΒ·lsΒ·q βˆ’ dminΒ·lm; βˆ’m_sub at ls==0 */
3678
+ float e = x - (d_sub * (float)q - m_sub);
3679
+ e_loc[k] = e;
3680
+ sub_err += e * e * w;
3681
+ dcc += e;
3682
+ cxc += e * ppe[k];
3683
+ /* SSE-partial prune is a valid lower bound
3684
+ * only while the spectral terms are β‰₯ 0,
3685
+ * i.e. when the (signable) vw credit is off */
3686
+ if (HEX_VW_LAMBDA == 0.0f &&
3687
+ sub_err >= best_sub) { aborted = 1; break; }
3688
+ }
3689
+ if (aborted) continue;
3690
+ float score = sub_err
3691
+ + (HEX_DC_LAMBDA / (float)QK_K)
3692
+ * (dc_rest + dcc) * (dc_rest + dcc)
3693
+ + (HEX_VW_LAMBDA / (float)QK_K) * 4.0f
3694
+ * (cross_rest + cxc);
3695
+ if (score < best_sub) {
3696
+ best_sub = score;
3697
+ best_ls = try_ls;
3698
+ best_lm = try_lm;
3699
+ memcpy(best_q, q_loc, 16);
3700
+ memcpy(best_e, e_loc, sizeof(e_loc));
3701
+ best_sse = sub_err;
3702
+ best_dcc = dcc;
3703
+ best_cxc = cxc;
3704
+ }
3705
+ }
3706
+ }
3707
+
3708
+ if (best_ls >= 0) { /* strict improvement in E found */
3709
+ pl_Ls[j] = (uint8_t)best_ls;
3710
+ pl_Lm[j] = (uint8_t)best_lm;
3711
+ memcpy(L + 16 * j, best_q, 16);
3712
+ memcpy(pe + 16 * j, best_e, sizeof(best_e));
3713
+ sub_sse[j] = best_sse;
3714
+ sub_dc[j] = best_dcc;
3715
+ pair_cross[pi] = best_cxc;
3716
+ dc_tot = dc_rest + best_dcc;
3717
+ cross_tot = cross_rest + best_cxc;
3718
+ pol_improved = 1;
3719
+ }
3720
+ }
3721
+
3722
+ /* ── (2) Closed-form (d, dmin) refit vs ORIGINAL, codes fixed ── */
3723
+ {
3724
+ double pSaa = 0, pSab = 0, pSbb = 0, pSxa = 0, pSxb = 0;
3725
+ double pA = 0, pB = 0, pS = 0; /* DC rank-1 augmentation */
3726
+ for (int j = 0; j < N_SUB; j++) {
3727
+ float ls_f = (float)pl_Ls[j];
3728
+ float lm_f = (float)pl_Lm[j];
3729
+ for (int k = 0; k < 16; k++) {
3730
+ int idx = 16 * j + k;
3731
+ float x = block_x[idx];
3732
+ float w = (imat_importance) ?
3733
+ imat_importance[blk * QK_K + idx] : 1.0f;
3734
+ float a = ls_f * (float)L[idx];
3735
+ float b = lm_f;
3736
+ pSaa += (double)w * a * a;
3737
+ pSab += (double)w * a * b;
3738
+ pSbb += (double)w * b * b;
3739
+ pSxa += (double)w * x * a;
3740
+ pSxb += (double)w * x * b;
3741
+ pA += a; pB += b; pS += x;
3742
+ }
3743
+ }
3744
+ {
3745
+ double pw = (double)HEX_DC_LAMBDA / (double)QK_K;
3746
+ pSaa += pw * pA * pA; pSab += pw * pA * pB;
3747
+ pSbb += pw * pB * pB; pSxa += pw * pS * pA;
3748
+ pSxb += pw * pS * pB;
3749
+ }
3750
+ double pdet = pSaa * pSbb - pSab * pSab;
3751
+ if (fabs(pdet) > 1e-30) {
3752
+ double d_ref = (pSbb * pSxa - pSab * pSxb) / pdet;
3753
+ double m_ref = (pSab * pSxa - pSaa * pSxb) / pdet;
3754
+ if (d_ref > 0.0) {
3755
+ float dm_try = gguf_fp16_to_fp32(
3756
+ gguf_fp32_to_fp16((float)d_ref));
3757
+ float mm_try = (m_ref > 0.0)
3758
+ ? gguf_fp16_to_fp32(
3759
+ gguf_fp32_to_fp16((float)m_ref))
3760
+ : mm;
3761
+ float err_cur = 0.0f, err_try = 0.0f;
3762
+ float e_pc[QK_K], e_pt[QK_K];
3763
+ for (int j = 0; j < N_SUB; j++) {
3764
+ float ls_f = (float)pl_Ls[j];
3765
+ float lm_f = (float)pl_Lm[j];
3766
+ for (int k = 0; k < 16; k++) {
3767
+ int idx = 16 * j + k;
3768
+ float x = block_x[idx];
3769
+ float w = (imat_importance) ?
3770
+ imat_importance[blk * QK_K + idx] : 1.0f;
3771
+ float qf = (float)L[idx];
3772
+ float dc = dm * ls_f * qf - mm * lm_f;
3773
+ float dt = dm_try * ls_f * qf - mm_try * lm_f;
3774
+ e_pc[idx] = x - dc;
3775
+ e_pt[idx] = x - dt;
3776
+ err_cur += e_pc[idx] * e_pc[idx] * w;
3777
+ err_try += e_pt[idx] * e_pt[idx] * w;
3778
+ }
3779
+ }
3780
+ err_cur += hex_spectral_penalty(e_pc, QK_K);
3781
+ err_try += hex_spectral_penalty(e_pt, QK_K);
3782
+ if (err_try < err_cur) {
3783
+ dm = dm_try;
3784
+ mm = mm_try;
3785
+ pol_improved = 1;
3786
+ }
3787
+ }
3788
+ }
3789
+ }
3790
+
3791
+ if (!pol_improved) {
3792
+ /* ── (3) Β±2 ULP joint (d, dmin) micro-search vs ORIGINAL ──
3793
+ * The closed-form refit rounds its real-valued optimum to
3794
+ * fp16, which can land 1–2 ULP away from the best
3795
+ * representable pair (and the earlier Β±8 ULP search ran
3796
+ * against the DC-shifted objective). With codes fixed,
3797
+ * scan the (2Β·HEX_POLISH_ULP+1)Β² fp16 neighborhood on the
3798
+ * true objective;
3799
+ * accept only strict improvement, then loop once more so
3800
+ * move (1) can re-optimise codes for the new scalars.
3801
+ * Monotone β‡’ final RMSE can only decrease. */
3802
+ uint16_t base_d16 = gguf_fp32_to_fp16(dm);
3803
+ uint16_t base_m16 = gguf_fp32_to_fp16(mm);
3804
+
3805
+ float cur_err = 0.0f;
3806
+ float e_u[QK_K];
3807
+ for (int j = 0; j < N_SUB; j++) {
3808
+ float d_sub = dm * (float)pl_Ls[j];
3809
+ float m_sub = mm * (float)pl_Lm[j];
3810
+ for (int k = 0; k < 16; k++) {
3811
+ int idx = 16 * j + k;
3812
+ float w = (imat_importance) ?
3813
+ imat_importance[blk * QK_K + idx] : 1.0f;
3814
+ e_u[idx] = block_x[idx] -
3815
+ (d_sub * (float)L[idx] - m_sub);
3816
+ cur_err += e_u[idx] * e_u[idx] * w;
3817
+ }
3818
+ }
3819
+ cur_err += hex_spectral_penalty(e_u, QK_K);
3820
+
3821
+ float best_err = cur_err;
3822
+ uint16_t best_d16 = base_d16, best_m16 = base_m16;
3823
+ for (int dd = -HEX_POLISH_ULP; dd <= HEX_POLISH_ULP; dd++) {
3824
+ int cd16 = (int)base_d16 + dd;
3825
+ if (cd16 < 0 || cd16 > 0x7BFF) continue;
3826
+ float t_dm = gguf_fp16_to_fp32((uint16_t)cd16);
3827
+ for (int dmm = -HEX_POLISH_ULP; dmm <= HEX_POLISH_ULP; dmm++) {
3828
+ if (dd == 0 && dmm == 0) continue;
3829
+ int cm16 = (int)base_m16 + dmm;
3830
+ if (cm16 < 0 || cm16 > 0x7BFF) continue;
3831
+ float t_mm = gguf_fp16_to_fp32((uint16_t)cm16);
3832
+
3833
+ float err = 0.0f;
3834
+ /* SSE-partial prune valid only without the
3835
+ * signable vesica/wave credit */
3836
+ for (int j = 0;
3837
+ j < N_SUB && (HEX_VW_LAMBDA != 0.0f ||
3838
+ err < best_err); j++) {
3839
+ float d_sub = t_dm * (float)pl_Ls[j];
3840
+ float m_sub = t_mm * (float)pl_Lm[j];
3841
+ for (int k = 0; k < 16; k++) {
3842
+ int idx = 16 * j + k;
3843
+ float w = (imat_importance) ?
3844
+ imat_importance[blk * QK_K + idx] : 1.0f;
3845
+ e_u[idx] = block_x[idx] -
3846
+ (d_sub * (float)L[idx] - m_sub);
3847
+ err += e_u[idx] * e_u[idx] * w;
3848
+ }
3849
+ }
3850
+ if (HEX_DC_LAMBDA != 0.0f || HEX_VW_LAMBDA != 0.0f)
3851
+ err = (err < best_err || HEX_VW_LAMBDA != 0.0f)
3852
+ ? err + hex_spectral_penalty(e_u, QK_K)
3853
+ : err;
3854
+ if (err < best_err) {
3855
+ best_err = err;
3856
+ best_d16 = (uint16_t)cd16;
3857
+ best_m16 = (uint16_t)cm16;
3858
+ }
3859
+ }
3860
+ }
3861
+ if (best_d16 != base_d16 || best_m16 != base_m16) {
3862
+ dm = gguf_fp16_to_fp32(best_d16);
3863
+ mm = gguf_fp16_to_fp32(best_m16);
3864
+ pol_improved = 1;
3865
+ }
3866
+ }
3867
+
3868
+ if (!pol_improved) break; /* converged on true objective */
3869
+ }
3870
+
3871
+ /* Write back polished codes and scalars */
3872
+ for (int j = 0; j < N_SUB; j++)
3873
+ output[blk].scales[j] = pl_Ls[j] | (pl_Lm[j] << 4);
3874
+ output[blk].d = gguf_fp32_to_fp16(dm);
3875
+ output[blk].dmin = gguf_fp32_to_fp16(mm);
3876
+ }
3877
+
3878
+ /* ══ PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ══
3879
+ *
3880
+ * candidate_errors[blk][c] is the EXACT weighted SSE of a directly
3881
+ * encodable configuration (fp16 d/dmin + derived Ls/Lm + nearest
3882
+ * rounding vs the original weights). The multi-stage assembly
3883
+ * (DC-shifted WLS, shaping, diffusion, polish) usually improves on
3884
+ * its seed, but each stage optimises a slightly different objective
3885
+ * and coordinate descent can land in a worse basin. Compare the
3886
+ * finished block against the best raw candidate and fall back when
3887
+ * the pipeline ended up worse β€” guaranteeing
3888
+ * final weighted SSE ≀ min_c candidate_errors[blk][c]. */
3889
+ {
3890
+ float fin_err = 0.0f;
3891
+ float e_f[QK_K];
3892
+ for (int j = 0; j < N_SUB; j++) {
3893
+ float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
3894
+ float m_sub = mm * (float)(output[blk].scales[j] >> 4);
3895
+ for (int k = 0; k < 16; k++) {
3896
+ int idx = 16 * j + k;
3897
+ float w = (imat_importance) ?
3898
+ imat_importance[blk * QK_K + idx] : 1.0f;
3899
+ e_f[idx] = block_x[idx] -
3900
+ (d_sub * (float)L[idx] - m_sub);
3901
+ fin_err += e_f[idx] * e_f[idx] * w;
3902
+ }
3903
+ }
3904
+ fin_err += hex_spectral_penalty(e_f, QK_K);
3905
+
3906
+ float g_best = candidate_errors[blk][0];
3907
+ int g_cand = 0;
3908
+ for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
3909
+ if (candidate_errors[blk][c] < g_best) {
3910
+ g_best = candidate_errors[blk][c];
3911
+ g_cand = c;
3912
+ }
3913
+ }
3914
+
3915
+ if (g_best < fin_err) {
3916
+ /* Rebuild the block exactly as the candidate was scored */
3917
+ float c_dm = gguf_fp16_to_fp32(candidate_d [blk][g_cand]);
3918
+ float c_mm = gguf_fp16_to_fp32(candidate_dmin[blk][g_cand]);
3919
+ uint8_t c_Ls[16], c_Lm[16];
3920
+ hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
3921
+ c_dm, c_mm, c_Ls, c_Lm);
3922
+ for (int j = 0; j < N_SUB; j++) {
3923
+ float d_sub = c_dm * (float)c_Ls[j];
3924
+ float m_sub = c_mm * (float)c_Lm[j];
3925
+ for (int k = 0; k < 16; k++) {
3926
+ int idx = 16 * j + k;
3927
+ int q = 0;
3928
+ if (d_sub >= 1e-15f) {
3929
+ q = gguf_nearest_int((block_x[idx] + m_sub) / d_sub);
3930
+ if (q < 0) q = 0; if (q > 3) q = 3;
3931
+ }
3932
+ L[idx] = (uint8_t)q;
3933
+ }
3934
+ output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
3935
+ }
3936
+ dm = c_dm; mm = c_mm;
3937
+ output[blk].d = candidate_d [blk][g_cand];
3938
+ output[blk].dmin = candidate_dmin[blk][g_cand];
3939
+ }
3940
+ }
3941
+
3942
  for (int j = 0; j < QK_K; j += 128) {
3943
  for (int l = 0; l < 32; l++) {
3944
  output[blk].qs[j / 4 + l] = L[j + l]
 
3966
  free(candidate_errors);
3967
  free(candidate_d);
3968
  free(candidate_dmin);
 
 
3969
  free(best_candidate);
3970
  if (out_total_error) *out_total_error = total_err;
3971
 
 
4015
  * ═══════════════════════════════════════════════════════════════════════════ */
4016
 
4017
  static void print_progress_bar(int current, int total, const char *label,
4018
+ time_t start_time)
4019
  {
4020
  if (total <= 0) return;
4021
  float pct = (float)current / (float)total;
4022
  int bar_width = 40;
4023
  int filled = (int)(pct * bar_width);
4024
 
4025
+ /* Wall-clock elapsed: clock() sums CPU time over all OpenMP threads,
4026
+ * which inflated elapsed/ETA by ~the thread count on multicore. */
4027
+ double elapsed = difftime(time(NULL), start_time);
4028
  double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
4029
 
4030
  printf("\r [");
 
4247
  int64_t total_elements_quantized = 0;
4248
  int64_t total_bytes_quantized = 0;
4249
  int64_t total_bytes_unquantized = 0;
4250
+ time_t quant_start = time(NULL);
4251
 
4252
  for (int i = 0; i < total_tensors; i++) {
4253
  int src = tensor_src_idx[i];
 
4268
 
4269
  int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
4270
  if (padded > n_elements) {
4271
+ float *grown = realloc(f32_data, padded * sizeof(float));
4272
+ if (!grown) {
4273
+ fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
4274
+ ti->name);
4275
+ free(f32_data);
4276
+ continue;
4277
+ }
4278
+ f32_data = grown;
4279
  for (int64_t j = n_elements; j < padded; j++)
4280
  f32_data[j] = 0.0f;
4281
  n_elements = padded;
 
4342
 
4343
  int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
4344
  if (padded > n_elements) {
4345
+ float *grown = realloc(f32_data, padded * sizeof(float));
4346
+ if (!grown) {
4347
+ fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
4348
+ ti->name);
4349
+ free(f32_data);
4350
+ continue;
4351
+ }
4352
+ f32_data = grown;
4353
  for (int64_t j = n_elements; j < padded; j++)
4354
  f32_data[j] = 0.0f;
4355
  n_elements = padded;
 
4705
 
4706
  /* ── Phase 1: Load model ── */
4707
  printf(" Phase 1: Loading model...\n");
4708
+ time_t t_start = time(NULL);
4709
 
4710
  /* Determine if input is a file or directory */
4711
  struct stat st;
 
4721
  /* Input is a directory β€” open all shards */
4722
  mf = st_open_dir(input_path);
4723
  strncpy(input_dir, input_path, sizeof(input_dir) - 2);
4724
+ input_dir[sizeof(input_dir) - 2] = '\0';
4725
  int dlen = strlen(input_dir);
4726
  if (dlen > 0 && input_dir[dlen - 1] != '/') {
4727
  input_dir[dlen] = '/';
 
4747
 
4748
  /* Extract directory from file path */
4749
  strncpy(input_dir, input_path, sizeof(input_dir) - 1);
4750
+ input_dir[sizeof(input_dir) - 1] = '\0';
4751
  char *last_slash = strrchr(input_dir, '/');
4752
  if (last_slash) {
4753
  *(last_slash + 1) = '\0';
 
4763
 
4764
  st_multi_print_summary(mf);
4765
 
4766
+ time_t t_load = time(NULL);
4767
+ printf(" Loaded in %.0f seconds\n\n", difftime(t_load, t_start));
 
4768
 
4769
  /* ── Phase 2: Detect architecture ── */
4770
  printf(" Phase 2: Detecting model architecture...\n");
 
4839
 
4840
  /* ── Phase 3-5: Quantize and write GGUF ── */
4841
  printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
 
 
4842
  int result = write_gguf(output_path, mf, &arch, tokenizer,
4843
  opt_mode, imatrix, verbose);
4844
 
4845
+ /* Wall-clock total: clock() sums CPU time over all OpenMP threads */
4846
+ time_t t_end = time(NULL);
4847
+ printf(" Total time: %.0f seconds\n\n", difftime(t_end, t_start));
4848
 
4849
  if (imatrix) imatrix_free(imatrix);
4850
  if (tokenizer) tok_free(tokenizer);