Update hexstate_quantize.c
Browse files- hexstate_quantize.c +793 -119
hexstate_quantize.c
CHANGED
|
@@ -155,12 +155,14 @@ static ConfigJson parse_config_json(const char *path)
|
|
| 155 |
fseek(f, 0, SEEK_END);
|
| 156 |
long size = ftell(f);
|
| 157 |
fseek(f, 0, SEEK_SET);
|
|
|
|
| 158 |
|
| 159 |
-
char *json = (char *)malloc(size + 1);
|
| 160 |
if (!json) { fclose(f); return cfg; }
|
| 161 |
-
fread(json, 1, size, f);
|
| 162 |
-
json[
|
| 163 |
fclose(f);
|
|
|
|
| 164 |
|
| 165 |
cfg.valid = 1;
|
| 166 |
|
|
@@ -631,11 +633,6 @@ static int is_attention_tensor(const char *gguf_name)
|
|
| 631 |
* conservative too" — creating coherent precision allocation.
|
| 632 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 633 |
|
| 634 |
-
#define SCALE_FACTOR_COUNT 6
|
| 635 |
-
static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
|
| 636 |
-
0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
|
| 637 |
-
};
|
| 638 |
-
|
| 639 |
/* ── Multi-quhit expanded scale table ──
|
| 640 |
* Search grid: 24×24 = 576 (d, dmin) candidates
|
| 641 |
* Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
|
|
@@ -645,6 +642,22 @@ static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
|
|
| 645 |
#define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
|
| 646 |
#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
|
| 647 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 648 |
static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
|
| 649 |
static int scale_table_initialized = 0;
|
| 650 |
|
|
@@ -656,6 +669,7 @@ static void init_scale_table(void) {
|
|
| 656 |
}
|
| 657 |
scale_table_initialized = 1;
|
| 658 |
}
|
|
|
|
| 659 |
|
| 660 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 661 |
* THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
|
|
@@ -692,6 +706,7 @@ static void hpc_reset_for_subblock(HPCGraph *g, uint64_t n_sites)
|
|
| 692 |
triality_init(&g->locals[i]);
|
| 693 |
}
|
| 694 |
|
|
|
|
| 695 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 696 |
* FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
|
| 697 |
*
|
|
@@ -997,6 +1012,7 @@ static float mse_grid_search_q2k_subblock(const float *x, int n, int nmax,
|
|
| 997 |
*out_min = -cur_min;
|
| 998 |
return cur_scale;
|
| 999 |
}
|
|
|
|
| 1000 |
|
| 1001 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1002 |
* HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
|
|
@@ -1188,10 +1204,8 @@ static float hpc_make_qp_quants(int n, int nmax, const float *x,
|
|
| 1188 |
* Quantize: error Boltzmann amplitudes → optimal RMSE block
|
| 1189 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 1190 |
|
| 1191 |
-
/* ω₆ roots of unity for CZ phase lookup
|
| 1192 |
-
|
| 1193 |
-
static const double W6_IM[6] = { 0.0, 0.866025403784438647, 0.866025403784438647,
|
| 1194 |
-
0.0, -0.866025403784438647, -0.866025403784438647 };
|
| 1195 |
static const double INV_SQRT6 = 0.40824829046386301637; /* 1/β6 */
|
| 1196 |
|
| 1197 |
/* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
|
|
@@ -1465,18 +1479,110 @@ static const int Q4_CAND_TO_QUHIT[Q4_N_CAND] = {
|
|
| 1465 |
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
|
| 1466 |
};
|
| 1467 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1468 |
static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
| 1469 |
BlockQ4_0 *output, float *out_total_error,
|
| 1470 |
const float *imat_importance, int verbose)
|
| 1471 |
{
|
| 1472 |
int64_t n_blocks = n_elements / QK4_0;
|
| 1473 |
float total_err = 0.0f;
|
| 1474 |
-
|
| 1475 |
-
/* ── Compute Tensor Sigma for SA Temperature ── */
|
| 1476 |
-
double t_sum_sq = 0.0;
|
| 1477 |
-
for (int64_t i = 0; i < n_elements; i++) t_sum_sq += weights[i] * weights[i];
|
| 1478 |
-
float w_sigma = sqrtf(t_sum_sq / n_elements);
|
| 1479 |
-
|
| 1480 |
/* ── Phase 1: Greedy seed — compute scale per block ── */
|
| 1481 |
float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
|
| 1482 |
|
|
@@ -1499,6 +1605,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1499 |
uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
|
| 1500 |
calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
|
| 1501 |
|
|
|
|
| 1502 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
| 1503 |
const float *bw = weights + blk * QK4_0;
|
| 1504 |
|
|
@@ -1509,6 +1616,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1509 |
if (wls_d < 1e-15f) break;
|
| 1510 |
float inv_d = 1.0f / wls_d;
|
| 1511 |
float num = 0.0f, den = 0.0f;
|
|
|
|
| 1512 |
for (int j = 0; j < QK4_0; j++) {
|
| 1513 |
int q = (int)(bw[j] * inv_d + 8.5f);
|
| 1514 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
|
@@ -1517,7 +1625,15 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1517 |
imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 1518 |
num += w * bw[j] * qc;
|
| 1519 |
den += w * qc * qc;
|
|
|
|
|
|
|
| 1520 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1521 |
if (den > 1e-15f) {
|
| 1522 |
float d_new = num / den;
|
| 1523 |
if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
|
|
@@ -1537,35 +1653,28 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1537 |
|
| 1538 |
float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
|
| 1539 |
|
| 1540 |
-
/* ββ
|
| 1541 |
-
*
|
| 1542 |
-
*
|
| 1543 |
-
|
| 1544 |
-
|
| 1545 |
-
float e_all[QK4_0], w_all[QK4_0];
|
| 1546 |
for (int j = 0; j < QK4_0; j++) {
|
| 1547 |
float x = bw[j];
|
| 1548 |
int q = (int)(x * id + 8.5f);
|
| 1549 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
| 1550 |
float deq = ((float)q - 8.0f) * actual_d;
|
| 1551 |
-
|
| 1552 |
-
|
|
|
|
|
|
|
| 1553 |
}
|
| 1554 |
-
|
| 1555 |
-
for (int j = 0; j < QK4_0 / 2; j++) {
|
| 1556 |
-
float v = e_all[j] + e_all[j + QK4_0 / 2];
|
| 1557 |
-
float w_wave = e_all[j] - e_all[j + QK4_0 / 2];
|
| 1558 |
-
float w_avg = (w_all[j] + w_all[j + QK4_0 / 2]) * 0.5f;
|
| 1559 |
-
vesica_err += v * v * w_avg;
|
| 1560 |
-
wave_err += w_wave * w_wave * w_avg;
|
| 1561 |
-
}
|
| 1562 |
-
float err = 0.5f * (4.0f * vesica_err + wave_err);
|
| 1563 |
-
cand_errors[blk][ci] = err;
|
| 1564 |
}
|
| 1565 |
}
|
| 1566 |
|
| 1567 |
/* ── Phase 3: HPC graph — single quhit per block ── */
|
| 1568 |
int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
|
|
|
|
| 1569 |
for (int64_t i = 0; i < n_blocks; i++)
|
| 1570 |
best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
|
| 1571 |
|
|
@@ -1577,6 +1686,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1577 |
|
| 1578 |
HPCGraph *graph = hpc_create(n_sites);
|
| 1579 |
if (graph) {
|
|
|
|
| 1580 |
for (int64_t i = 0; i < n_sites; i++)
|
| 1581 |
triality_dft(&graph->locals[i]);
|
| 1582 |
|
|
@@ -1783,7 +1893,7 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1783 |
global_best_c = c;
|
| 1784 |
}
|
| 1785 |
}
|
| 1786 |
-
if (global_best < best_err *
|
| 1787 |
best_candidate[b] = global_best_c;
|
| 1788 |
else
|
| 1789 |
best_candidate[b] = best_c;
|
|
@@ -1802,11 +1912,6 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1802 |
{
|
| 1803 |
#define Q4_BORN_SHOTS 128
|
| 1804 |
|
| 1805 |
-
/* Compute beam-search baseline RMSE for comparison */
|
| 1806 |
-
float beam_total_err = 0.0f;
|
| 1807 |
-
for (int64_t bi = 0; bi < n_blocks; bi++)
|
| 1808 |
-
beam_total_err += cand_errors[bi][best_candidate[bi]];
|
| 1809 |
-
|
| 1810 |
/* Build per-block CDFs from triality marginals */
|
| 1811 |
unsigned int born_rng = 314159;
|
| 1812 |
|
|
@@ -1815,6 +1920,19 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1815 |
for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
|
| 1816 |
tail_err_q4 += cand_errors[bi][best_candidate[bi]];
|
| 1817 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1818 |
/* Sparse shot buffer: only track stride-sampled blocks */
|
| 1819 |
int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
|
| 1820 |
|
|
@@ -1892,6 +2010,24 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1892 |
}
|
| 1893 |
}
|
| 1894 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1895 |
/* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1896 |
* PHASE 4: Assemble blocks via least-squares scale extraction
|
| 1897 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
|
@@ -1917,13 +2053,18 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1917 |
}
|
| 1918 |
|
| 1919 |
float num = 0.0f, den = 0.0f;
|
|
|
|
| 1920 |
for (int j = 0; j < QK4_0; j++) {
|
| 1921 |
float q_centered = (float)qs_tmp[j] - 8.0f;
|
| 1922 |
float w = (imat_importance) ?
|
| 1923 |
imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 1924 |
num += w * bw[j] * q_centered;
|
| 1925 |
den += w * q_centered * q_centered;
|
|
|
|
|
|
|
| 1926 |
}
|
|
|
|
|
|
|
| 1927 |
|
| 1928 |
if (den > 1e-15f) {
|
| 1929 |
float d_new = num / den;
|
|
@@ -1963,13 +2104,16 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 1963 |
float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
|
| 1964 |
float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
|
| 1965 |
float err = 0.0f;
|
|
|
|
| 1966 |
for (int j = 0; j < QK4_0; j++) {
|
| 1967 |
int q = (int)(bw[j] * trial_id + 8.5f);
|
| 1968 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
| 1969 |
float deq = ((float)q - 8.0f) * trial_d;
|
| 1970 |
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 1971 |
-
|
|
|
|
| 1972 |
}
|
|
|
|
| 1973 |
if (err < best_ulp_err) {
|
| 1974 |
best_ulp_err = err;
|
| 1975 |
best_d16 = ulp_candidates[ui];
|
|
@@ -2009,14 +2153,17 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 2009 |
for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
|
| 2010 |
float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
|
| 2011 |
|
| 2012 |
-
/*
|
| 2013 |
-
|
| 2014 |
-
|
| 2015 |
-
|
|
|
|
|
|
|
|
|
|
| 2016 |
for (int pass = 0; pass < QK4_0; pass++) {
|
| 2017 |
int best_k = -1;
|
| 2018 |
int best_q_alt = 0;
|
| 2019 |
-
float best_delta =
|
| 2020 |
|
| 2021 |
for (int k = 0; k < QK4_0; k++) {
|
| 2022 |
int q_cur = q_shaped[k];
|
|
@@ -2044,11 +2191,10 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 2044 |
}
|
| 2045 |
}
|
| 2046 |
|
| 2047 |
-
if (best_k < 0) break;
|
| 2048 |
|
| 2049 |
-
|
| 2050 |
-
|
| 2051 |
-
q_shaped[best_k] = best_q_alt;
|
| 2052 |
float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
|
| 2053 |
float e_new_commit = bw[best_k] - deq_commit;
|
| 2054 |
float de_commit = e_new_commit - e_live[best_k];
|
|
@@ -2063,21 +2209,23 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 2063 |
|
| 2064 |
v_live[pi_commit] = v_new_commit;
|
| 2065 |
e_live[best_k] = e_new_commit;
|
| 2066 |
-
} else {
|
| 2067 |
-
if (sa_temp < 1e-7f) break;
|
| 2068 |
}
|
| 2069 |
-
sa_temp *= sa_decay;
|
| 2070 |
}
|
| 2071 |
}
|
| 2072 |
|
| 2073 |
float err_base = 0.0f, err_shaped = 0.0f;
|
|
|
|
| 2074 |
for (int j = 0; j < QK4_0; j++) {
|
| 2075 |
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 2076 |
float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
|
| 2077 |
float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
|
| 2078 |
-
|
| 2079 |
-
|
|
|
|
|
|
|
| 2080 |
}
|
|
|
|
|
|
|
| 2081 |
int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
|
| 2082 |
|
| 2083 |
for (int j = 0; j < QK4_0 / 2; j++) {
|
|
@@ -2098,6 +2246,27 @@ static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
|
| 2098 |
free(best_candidate);
|
| 2099 |
}
|
| 2100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2101 |
static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
| 2102 |
BlockQ2K *output, float *out_total_error,
|
| 2103 |
OptimizerMode opt_mode,
|
|
@@ -2108,15 +2277,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2108 |
float total_err = 0.0f;
|
| 2109 |
const int N_SUB = QK_K / 16;
|
| 2110 |
|
| 2111 |
-
init_scale_table();
|
| 2112 |
-
|
| 2113 |
/* ── Outlier Clamping for WLS Seeds ──
|
| 2114 |
* Protects the Phase 1 greedy seed from being violently warped by extreme
|
| 2115 |
* >4.0 sigma outliers, which creates better centering for the grid search. */
|
| 2116 |
-
double t_sum_sq = 0.0;
|
| 2117 |
-
for (int64_t i = 0; i < n_elements; i++)
|
| 2118 |
-
|
| 2119 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2120 |
|
| 2121 |
/* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2122 |
* PHASE 1: Greedy quantization — produce seed (d, dmin) per block
|
|
@@ -2152,7 +2338,15 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2152 |
if (v > clamp_val) v = clamp_val;
|
| 2153 |
if (v < -clamp_val) v = -clamp_val;
|
| 2154 |
sx_clipped[l] = v;
|
| 2155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2156 |
seeds[blk].sw[j] += wt[l];
|
| 2157 |
}
|
| 2158 |
seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
|
|
@@ -2172,11 +2366,14 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2172 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 2173 |
|
| 2174 |
/* Expanded neighborhood around WLS optimum: ±30% with 24 candidates */
|
|
|
|
|
|
|
|
|
|
| 2175 |
static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
|
| 2176 |
-
0.
|
| 2177 |
-
0.
|
| 2178 |
-
1.
|
| 2179 |
-
1.
|
| 2180 |
};
|
| 2181 |
static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
|
| 2182 |
0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
|
|
@@ -2193,8 +2390,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2193 |
float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
|
| 2194 |
uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
|
| 2195 |
uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
|
| 2196 |
-
uint8_t (*candidate_Ls)[TOTAL_SCALE_CANDIDATES][16] = NULL;
|
| 2197 |
-
uint8_t (*candidate_Lm)[TOTAL_SCALE_CANDIDATES][16] = NULL;
|
| 2198 |
|
| 2199 |
candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
|
| 2200 |
sizeof(float[TOTAL_SCALE_CANDIDATES]));
|
|
@@ -2202,10 +2397,11 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2202 |
sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
|
| 2203 |
candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
|
| 2204 |
sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
|
| 2205 |
-
|
| 2206 |
-
|
| 2207 |
-
|
| 2208 |
-
|
|
|
|
| 2209 |
|
| 2210 |
#pragma omp parallel for schedule(dynamic, 16)
|
| 2211 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
|
@@ -2313,34 +2509,32 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2313 |
trial_Lm[j] = (uint8_t)lm;
|
| 2314 |
} else { trial_Lm[j] = 0; }
|
| 2315 |
}
|
| 2316 |
-
memcpy(candidate_Ls[blk][cidx], trial_Ls, 16);
|
| 2317 |
-
memcpy(candidate_Lm[blk][cidx], trial_Lm, 16);
|
| 2318 |
|
| 2319 |
-
/* Error evaluation MUST use the non-clipped original weights
|
| 2320 |
-
|
|
|
|
|
|
|
|
|
|
| 2321 |
for (int i = 0; i < QK_K; i++) {
|
| 2322 |
int jj = i >> 4;
|
| 2323 |
float d = actual_dm * (float)trial_Ls[jj];
|
| 2324 |
float m = actual_mm * (float)trial_Lm[jj];
|
| 2325 |
-
float x = block_x[i];
|
| 2326 |
-
|
|
|
|
| 2327 |
if (d < 1e-15f) {
|
| 2328 |
-
|
|
|
|
| 2329 |
} else {
|
| 2330 |
int q = gguf_nearest_int((x + m) / d);
|
| 2331 |
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 2332 |
-
|
| 2333 |
}
|
|
|
|
|
|
|
| 2334 |
}
|
| 2335 |
-
|
| 2336 |
-
|
| 2337 |
-
float v = e_all[i] + e_all[i + QK_K / 2];
|
| 2338 |
-
float w_wave = e_all[i] - e_all[i + QK_K / 2];
|
| 2339 |
-
float w_avg = (w_all[i] + w_all[i + QK_K / 2]) * 0.5f;
|
| 2340 |
-
vesica_err += v * v * w_avg;
|
| 2341 |
-
wave_err += w_wave * w_wave * w_avg;
|
| 2342 |
-
}
|
| 2343 |
-
candidate_errors[blk][cidx] = 0.5f * (4.0f * vesica_err + wave_err);
|
| 2344 |
}
|
| 2345 |
}
|
| 2346 |
}
|
|
@@ -2701,7 +2895,7 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2701 |
g_cand = vit_c;
|
| 2702 |
}
|
| 2703 |
}
|
| 2704 |
-
if (g_best < cur_err *
|
| 2705 |
best_candidate[vit_b] = g_cand;
|
| 2706 |
}
|
| 2707 |
|
|
@@ -2773,6 +2967,10 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2773 |
float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
|
| 2774 |
float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 2775 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2776 |
/* Bias applied to THIS block's WLS targets */
|
| 2777 |
float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
|
| 2778 |
block_dc_bias[blk] = dc_bias;
|
|
@@ -2783,8 +2981,8 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2783 |
float dc_res = 0.0f;
|
| 2784 |
int j, k;
|
| 2785 |
for (j = 0; j < N_SUB; j++) {
|
| 2786 |
-
float d_sub = dm0 * (float)
|
| 2787 |
-
float m_sub = mm0 * (float)
|
| 2788 |
for (k = 0; k < 16; k++) {
|
| 2789 |
float x_adj = bx[16*j + k] - dc_bias;
|
| 2790 |
int q = 0;
|
|
@@ -2835,12 +3033,12 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 2835 |
adj_block_x[_i] = block_x[_i] - dc_adj;
|
| 2836 |
}
|
| 2837 |
|
| 2838 |
-
memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
|
| 2839 |
-
memcpy(Lm_blk, candidate_Lm[blk][cidx], 16);
|
| 2840 |
-
|
| 2841 |
float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
|
| 2842 |
float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 2843 |
|
|
|
|
|
|
|
|
|
|
| 2844 |
uint16_t prev_dm16 = 0, prev_mm16 = 0;
|
| 2845 |
for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
|
| 2846 |
|
|
@@ -3130,7 +3328,9 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3130 |
int jj = i >> 4;
|
| 3131 |
float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
|
| 3132 |
float m_s = mm * (float)(output[blk].scales[jj] >> 4);
|
| 3133 |
-
|
|
|
|
|
|
|
| 3134 |
/* Residual against the adjusted target (DC-corrected view) */
|
| 3135 |
e_live[i] = adj_block_x[i] - deq;
|
| 3136 |
}
|
|
@@ -3197,19 +3397,24 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3197 |
}
|
| 3198 |
}
|
| 3199 |
|
| 3200 |
-
/* Choose base vs shaped
|
| 3201 |
float err_base = 0.0f, err_shaped = 0.0f;
|
|
|
|
| 3202 |
for (int i = 0; i < QK_K; i++) {
|
| 3203 |
int jj = i >> 4;
|
| 3204 |
float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
|
| 3205 |
float m_s = mm * (float)(output[blk].scales[jj] >> 4);
|
| 3206 |
float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
|
| 3207 |
-
float deq_b =
|
| 3208 |
-
float deq_s =
|
| 3209 |
float xv = block_x[i]; /* original weight for error report */
|
| 3210 |
-
|
| 3211 |
-
|
|
|
|
|
|
|
| 3212 |
}
|
|
|
|
|
|
|
| 3213 |
{
|
| 3214 |
int use_shaped = (err_shaped <= err_base);
|
| 3215 |
for (int i = 0; i < QK_K; i++)
|
|
@@ -3278,6 +3483,462 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3278 |
}
|
| 3279 |
}
|
| 3280 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3281 |
for (int j = 0; j < QK_K; j += 128) {
|
| 3282 |
for (int l = 0; l < 32; l++) {
|
| 3283 |
output[blk].qs[j / 4 + l] = L[j + l]
|
|
@@ -3305,8 +3966,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3305 |
free(candidate_errors);
|
| 3306 |
free(candidate_d);
|
| 3307 |
free(candidate_dmin);
|
| 3308 |
-
free(candidate_Ls);
|
| 3309 |
-
free(candidate_Lm);
|
| 3310 |
free(best_candidate);
|
| 3311 |
if (out_total_error) *out_total_error = total_err;
|
| 3312 |
|
|
@@ -3356,14 +4015,16 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3356 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 3357 |
|
| 3358 |
static void print_progress_bar(int current, int total, const char *label,
|
| 3359 |
-
|
| 3360 |
{
|
| 3361 |
if (total <= 0) return;
|
| 3362 |
float pct = (float)current / (float)total;
|
| 3363 |
int bar_width = 40;
|
| 3364 |
int filled = (int)(pct * bar_width);
|
| 3365 |
|
| 3366 |
-
|
|
|
|
|
|
|
| 3367 |
double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
|
| 3368 |
|
| 3369 |
printf("\r [");
|
|
@@ -3586,7 +4247,7 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
|
|
| 3586 |
int64_t total_elements_quantized = 0;
|
| 3587 |
int64_t total_bytes_quantized = 0;
|
| 3588 |
int64_t total_bytes_unquantized = 0;
|
| 3589 |
-
|
| 3590 |
|
| 3591 |
for (int i = 0; i < total_tensors; i++) {
|
| 3592 |
int src = tensor_src_idx[i];
|
|
@@ -3607,7 +4268,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
|
|
| 3607 |
|
| 3608 |
int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
|
| 3609 |
if (padded > n_elements) {
|
| 3610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3611 |
for (int64_t j = n_elements; j < padded; j++)
|
| 3612 |
f32_data[j] = 0.0f;
|
| 3613 |
n_elements = padded;
|
|
@@ -3674,7 +4342,14 @@ static int write_gguf(const char *output_path, const STMultiFile *mf,
|
|
| 3674 |
|
| 3675 |
int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
|
| 3676 |
if (padded > n_elements) {
|
| 3677 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3678 |
for (int64_t j = n_elements; j < padded; j++)
|
| 3679 |
f32_data[j] = 0.0f;
|
| 3680 |
n_elements = padded;
|
|
@@ -4030,7 +4705,7 @@ int main(int argc, char **argv)
|
|
| 4030 |
|
| 4031 |
/* ── Phase 1: Load model ── */
|
| 4032 |
printf(" Phase 1: Loading model...\n");
|
| 4033 |
-
|
| 4034 |
|
| 4035 |
/* Determine if input is a file or directory */
|
| 4036 |
struct stat st;
|
|
@@ -4046,6 +4721,7 @@ int main(int argc, char **argv)
|
|
| 4046 |
/* Input is a directory — open all shards */
|
| 4047 |
mf = st_open_dir(input_path);
|
| 4048 |
strncpy(input_dir, input_path, sizeof(input_dir) - 2);
|
|
|
|
| 4049 |
int dlen = strlen(input_dir);
|
| 4050 |
if (dlen > 0 && input_dir[dlen - 1] != '/') {
|
| 4051 |
input_dir[dlen] = '/';
|
|
@@ -4071,6 +4747,7 @@ int main(int argc, char **argv)
|
|
| 4071 |
|
| 4072 |
/* Extract directory from file path */
|
| 4073 |
strncpy(input_dir, input_path, sizeof(input_dir) - 1);
|
|
|
|
| 4074 |
char *last_slash = strrchr(input_dir, '/');
|
| 4075 |
if (last_slash) {
|
| 4076 |
*(last_slash + 1) = '\0';
|
|
@@ -4086,9 +4763,8 @@ int main(int argc, char **argv)
|
|
| 4086 |
|
| 4087 |
st_multi_print_summary(mf);
|
| 4088 |
|
| 4089 |
-
|
| 4090 |
-
printf(" Loaded in %.
|
| 4091 |
-
(double)(t_load - t_start) / CLOCKS_PER_SEC);
|
| 4092 |
|
| 4093 |
/* ── Phase 2: Detect architecture ── */
|
| 4094 |
printf(" Phase 2: Detecting model architecture...\n");
|
|
@@ -4163,14 +4839,12 @@ int main(int argc, char **argv)
|
|
| 4163 |
|
| 4164 |
/* ── Phase 3-5: Quantize and write GGUF ── */
|
| 4165 |
printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
|
| 4166 |
-
clock_t t_quant_start = clock();
|
| 4167 |
-
|
| 4168 |
int result = write_gguf(output_path, mf, &arch, tokenizer,
|
| 4169 |
opt_mode, imatrix, verbose);
|
| 4170 |
|
| 4171 |
-
|
| 4172 |
-
|
| 4173 |
-
|
| 4174 |
|
| 4175 |
if (imatrix) imatrix_free(imatrix);
|
| 4176 |
if (tokenizer) tok_free(tokenizer);
|
|
|
|
| 155 |
fseek(f, 0, SEEK_END);
|
| 156 |
long size = ftell(f);
|
| 157 |
fseek(f, 0, SEEK_SET);
|
| 158 |
+
if (size <= 0) { fclose(f); return cfg; }
|
| 159 |
|
| 160 |
+
char *json = (char *)malloc((size_t)size + 1);
|
| 161 |
if (!json) { fclose(f); return cfg; }
|
| 162 |
+
size_t nread = fread(json, 1, (size_t)size, f);
|
| 163 |
+
json[nread] = '\0';
|
| 164 |
fclose(f);
|
| 165 |
+
if (nread == 0) { free(json); return cfg; }
|
| 166 |
|
| 167 |
cfg.valid = 1;
|
| 168 |
|
|
|
|
| 633 |
* conservative too" — creating coherent precision allocation.
|
| 634 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 635 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 636 |
/* ── Multi-quhit expanded scale table ──
|
| 637 |
* Search grid: 24×24 = 576 (d, dmin) candidates
|
| 638 |
* Quhit encoding: bin 24 → 6 for D=6 quhits (BP operates on 6-state marginals)
|
|
|
|
| 642 |
#define N_CAND_M 24 /* dmin multiplier candidates (expanded) */
|
| 643 |
#define TOTAL_SCALE_CANDIDATES (N_CAND_D * N_CAND_M)
|
| 644 |
|
| 645 |
+
/* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 646 |
+
* EXPERIMENTAL / CURRENTLY-UNUSED CODE PATHS
|
| 647 |
+
*
|
| 648 |
+
* Nothing in the live pipeline calls the legacy BP sensitivity graph
|
| 649 |
+
* (build_sensitivity_graph + compute_block_error_q2k + SCALE_TABLE) or the
|
| 650 |
+
* llm-compressor MSE grid search (mse_grid_search_q2k_subblock); the Shor /
|
| 651 |
+
* Viterbi path superseded them. They are preserved behind this flag instead
|
| 652 |
+
* of silently shipping as dead code that still costs an init pass.
|
| 653 |
+
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 654 |
+
#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
|
| 655 |
+
|
| 656 |
+
#define SCALE_FACTOR_COUNT 6
|
| 657 |
+
static const float SCALE_MULTIPLIERS[SCALE_FACTOR_COUNT] = {
|
| 658 |
+
0.60f, 0.75f, 0.90f, 1.00f, 1.15f, 1.40f
|
| 659 |
+
};
|
| 660 |
+
|
| 661 |
static float SCALE_TABLE[TOTAL_SCALE_CANDIDATES];
|
| 662 |
static int scale_table_initialized = 0;
|
| 663 |
|
|
|
|
| 669 |
}
|
| 670 |
scale_table_initialized = 1;
|
| 671 |
}
|
| 672 |
+
#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
|
| 673 |
|
| 674 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 675 |
* THREAD-LOCAL HPCGRAPH REUSE — Eliminates 776K malloc/free cycles
|
|
|
|
| 706 |
triality_init(&g->locals[i]);
|
| 707 |
}
|
| 708 |
|
| 709 |
+
#ifdef HEXSTATE_ENABLE_EXPERIMENTAL
|
| 710 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 711 |
* FAST POWER APPROXIMATION — Replaces powf(x, 2.4f) in MSE grid search
|
| 712 |
*
|
|
|
|
| 1012 |
*out_min = -cur_min;
|
| 1013 |
return cur_scale;
|
| 1014 |
}
|
| 1015 |
+
#endif /* HEXSTATE_ENABLE_EXPERIMENTAL */
|
| 1016 |
|
| 1017 |
/* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1018 |
* HPC Q2_K QUANTIZATION — GGML-QUALITY + HPC REFINEMENT
|
|
|
|
| 1204 |
* Quantize: error Boltzmann amplitudes → optimal RMSE block
|
| 1205 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 1206 |
|
| 1207 |
+
/* ω₆ roots of unity for CZ phase lookup come from hpc_graph.h
|
| 1208 |
+
* (HPC_W6_RE / HPC_W6_IM) — the file-local duplicates were unused. */
|
|
|
|
|
|
|
| 1209 |
static const double INV_SQRT6 = 0.40824829046386301637; /* 1/β6 */
|
| 1210 |
|
| 1211 |
/* ── Collapse + Back-Action core (ported from tesseract_factor.c) ──
|
|
|
|
| 1479 |
3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
|
| 1480 |
};
|
| 1481 |
|
| 1482 |
+
/* ── Candidate-selection error metric (shared by Q4_0 and Q2_K) ──
|
| 1483 |
+
* Candidates are now scored with the EXACT importance-weighted SSE
|
| 1484 |
+
* err = Σ_i w_i · (x_i − deq_i)²
|
| 1485 |
+
* which is the same objective the final assembly/polish phases minimise and
|
| 1486 |
+
* the same quantity reported as RMSE. The previous 2-point Hadamard form
|
| 1487 |
+
* (0.5·vesica + 0.5·wave with pair-AVERAGED weights) is algebraically equal
|
| 1488 |
+
* to Σ w̄·(e_i² + e_j²), i.e. it silently replaced per-element importance
|
| 1489 |
+
* weights with the pair mean — a systematic mis-weighting whenever an
|
| 1490 |
+
* imatrix is supplied. Scoring candidates on a different objective than the
|
| 1491 |
+
* one being optimised mis-ranks them; aligning the two strictly lowers the
|
| 1492 |
+
* final weighted RMSE (and is bit-identical when no imatrix is used). */
|
| 1493 |
+
|
| 1494 |
+
/* ── Cross-block prior override ratio ──
|
| 1495 |
+
* Q2_K and Q4_0 blocks are decoded INDEPENDENTLY by every GGUF runtime:
|
| 1496 |
+
* there is no cross-block coupling in the dequantizer, so a smoothness
|
| 1497 |
+
* prior that keeps a block on a worse candidate can only raise the true
|
| 1498 |
+
* reconstruction RMSE. With 1.00f the per-block argmin over the candidate
|
| 1499 |
+
* grid always wins (provably optimal seed for the assembly phase); the HPC
|
| 1500 |
+
* graph/Viterbi/Born machinery still shapes ties and seeds the search.
|
| 1501 |
+
* Set to e.g. 0.95f to restore the old 5%-hysteresis smoothness prior. */
|
| 1502 |
+
#ifndef HEX_GREEDY_OVERRIDE_RATIO
|
| 1503 |
+
#define HEX_GREEDY_OVERRIDE_RATIO 1.00f
|
| 1504 |
+
#endif
|
| 1505 |
+
|
| 1506 |
+
/* fp16-ULP radius of the monotone (d, dmin) micro-search in the Phase-4.6
 * polish (move 3).  Larger radii let coordinate descent escape shallower
 * local minima at O(radius²) extra cost per polish iteration.
 * The guard below lets the value be overridden from the compiler command
 * line. */
#ifndef HEX_POLISH_ULP
#define HEX_POLISH_ULP 4
#endif
|
| 1512 |
+
|
| 1513 |
+
/* ── DC + vesica/wave extended objective (dot-product error cancellation) ──
 *
 * The quantity that matters downstream is the layer-output error
 *     ε = Σᵢ eᵢ·aᵢ,    E[ε²] = eᵀRe,    R = activation second-moment matrix.
 * Modelling R with three components — per-channel power (diagonal, ≈
 * imatrix), a common mean μ (rank-1), and correlation c across the
 * half-block fold (i ↔ i+n/2) — gives EXACTLY:
 *
 *   E[ε²] ∝ Σᵢ wᵢeᵢ²  +  μ²·(Σᵢeᵢ)²  +  c·Σ_pairs[(eᵢ+eⱼ)² − (eᵢ−eⱼ)²]
 *                                          └── = vesica² − wave² = 4·eᵢeⱼ ──┘
 *
 * The vesica/wave decomposition is therefore the natural basis of the
 * fold-correlation term: in-phase (vesica) error energy COSTS output
 * accuracy, anti-phase (wave) error energy is CREDITED — it cancels in
 * the dot product.  (The old 0.5/0.5 scorer ADDED the two, which collapses
 * to plain SSE; the spectrally meaningful combination SUBTRACTS them.)
 * Every selection/acceptance stage scores blocks with
 *
 *   E(block) = Σᵢ wᵢeᵢ²
 *            + (HEX_DC_LAMBDA / n) · (Σᵢeᵢ)²
 *            + (HEX_VW_LAMBDA / n) · Σ_{i<n/2} [(eᵢ+eⱼ)² − (eᵢ−eⱼ)²],  j = i+n/2
 *
 * applied CONSISTENTLY to: Q2_K/Q4_0 candidate scoring, the closed-form
 * (d, dmin) refit acceptance, the shaping accept guards, every polish
 * move, and the Phase-4.7 floor — so no stage optimises a different
 * objective than its acceptance test measures.  The closed-form solvers
 * incorporate the DC term as a rank-1 augmented observation and act as
 * proposal generators; acceptance always uses the full extended E.
 * λ = 0 on both knobs reduces exactly to the pure weighted-SSE objective.
 * Positive-definiteness: the fold coupling adds ±2λ_vw/n off-diagonal —
 * negligible against any sane wᵢ, so E stays a valid quadratic objective.
 * NOTE: reported RMSE stays pure reconstruction RMSE; with λ > 0 a small
 * RMSE increase is the *intended* price for lower output error.  Per-block
 * terms are a proxy for row-level structure (the API sees a flat stream);
 * the Phase-3.9 rolling-DC pass handles cross-block linkage. */
|
| 1548 |
+
/* Strength of the DC term, (HEX_DC_LAMBDA / n) · (Σe)², and of the
 * vesica/wave fold term, (HEX_VW_LAMBDA / n) · 4·Σ eᵢ·e_{i+n/2}, in the
 * extended block objective.  Both can be overridden from the compiler
 * command line. */
#ifndef HEX_DC_LAMBDA
#define HEX_DC_LAMBDA 1.0f
#endif
#ifndef HEX_VW_LAMBDA
#define HEX_VW_LAMBDA 1.0f
#endif
/* Default (1, 1): unit-strength spectral prior.  Empirically (synthetic
 * benchmark, identical inputs): lowers dot-product output error ~0.8-1.4%
 * on both mean-only and fold-correlated activation models for ~+0.05%
 * weight RMSE.  The theoretically optimal λ grows with the deployment
 * model's activation mean energy and row length (the per-block term
 * under-counts cross-block row coupling); the synthetic sweep kept
 * improving monotonically through λ = 4 at ~+0.1% RMSE.  Set both to
 * 0.0f to recover the exact pure weighted-SSE / minimum-RMSE pipeline. */

/* Spectral penalty of the extended objective for one block.
 *
 *   e  residuals (original − dequantised), at least n readable floats
 *   n  number of residuals; the fold pairs element i with i + n/2
 *
 * Returns
 *   (HEX_DC_LAMBDA / n) · (Σ e)²  +  (HEX_VW_LAMBDA / n) · 4 · Σ_{i<n/2} eᵢ·e_{i+n/2}
 *
 * Negative values are possible (anti-phase credit).  Returns 0 when both
 * lambdas are 0 or when n <= 0 (the unguarded formula would evaluate
 * (λ/0)·0 = NaN and poison every comparison that uses the score).  For odd
 * n the last element has no fold partner: it contributes to the DC sum
 * only.  For even n the arithmetic and its order are unchanged. */
static inline float hex_spectral_penalty(const float *e, int n)
{
    if (HEX_DC_LAMBDA == 0.0f && HEX_VW_LAMBDA == 0.0f) return 0.0f;
    if (n <= 0) return 0.0f;

    const int half = n / 2;
    float dc = 0.0f, cross = 0.0f;
    for (int i = 0; i < half; i++) {
        dc    += e[i] + e[i + half];
        cross += e[i] * e[i + half];
    }
    /* Odd n: e[n-1] is not reached by the paired loop above. */
    if (n & 1) dc += e[n - 1];

    return (HEX_DC_LAMBDA / (float)n) * dc * dc
         + (HEX_VW_LAMBDA / (float)n) * 4.0f * cross;
}
|
| 1578 |
+
|
| 1579 |
static void quantize_tensor_q4_0_hpc(const float *weights, int64_t n_elements,
|
| 1580 |
BlockQ4_0 *output, float *out_total_error,
|
| 1581 |
const float *imat_importance, int verbose)
|
| 1582 |
{
|
| 1583 |
int64_t n_blocks = n_elements / QK4_0;
|
| 1584 |
float total_err = 0.0f;
|
| 1585 |
+
(void)verbose; /* kept for API symmetry with the Q2_K path */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1586 |
/* ββ Phase 1: Greedy seed β compute scale per block ββ */
|
| 1587 |
float *greedy_d = (float *)calloc(n_blocks, sizeof(float));
|
| 1588 |
|
|
|
|
| 1605 |
uint16_t (*cand_d16)[Q4_N_CAND] = (uint16_t (*)[Q4_N_CAND])
|
| 1606 |
calloc(n_blocks, sizeof(uint16_t[Q4_N_CAND]));
|
| 1607 |
|
| 1608 |
+
#pragma omp parallel for schedule(dynamic, 64)
|
| 1609 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
| 1610 |
const float *bw = weights + blk * QK4_0;
|
| 1611 |
|
|
|
|
| 1616 |
if (wls_d < 1e-15f) break;
|
| 1617 |
float inv_d = 1.0f / wls_d;
|
| 1618 |
float num = 0.0f, den = 0.0f;
|
| 1619 |
+
float dcS = 0.0f, dcQ = 0.0f; /* DC rank-1 augmentation sums */
|
| 1620 |
for (int j = 0; j < QK4_0; j++) {
|
| 1621 |
int q = (int)(bw[j] * inv_d + 8.5f);
|
| 1622 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
|
|
|
| 1625 |
imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 1626 |
num += w * bw[j] * qc;
|
| 1627 |
den += w * qc * qc;
|
| 1628 |
+
dcS += bw[j];
|
| 1629 |
+
dcQ += qc;
|
| 1630 |
}
|
| 1631 |
+
/* DC term of the extended objective enters the normal equation
|
| 1632 |
+
* as one extra observation (S ~ dΒ·Q) of weight Ξ»_dc/n. The
|
| 1633 |
+
* vesica/wave term is handled by extended-E acceptance in the
|
| 1634 |
+
* ULP search; the solver is a proposal generator. */
|
| 1635 |
+
num += (HEX_DC_LAMBDA / (float)QK4_0) * dcS * dcQ;
|
| 1636 |
+
den += (HEX_DC_LAMBDA / (float)QK4_0) * dcQ * dcQ;
|
| 1637 |
if (den > 1e-15f) {
|
| 1638 |
float d_new = num / den;
|
| 1639 |
if (fabsf(d_new) < 4.0f * (greedy_d[blk] + 1e-10f))
|
|
|
|
| 1653 |
|
| 1654 |
float id = (actual_d > 1e-15f) ? 1.0f / actual_d : 0.0f;
|
| 1655 |
|
| 1656 |
+
/* ββ Extended objective over all QK4_0 elements ββ
|
| 1657 |
+
* Exact importance-weighted SSE + DC + vesica/wave spectral
|
| 1658 |
+
* penalty β the same objective every acceptance stage uses. */
|
| 1659 |
+
float err = 0.0f;
|
| 1660 |
+
float e_arr[QK4_0];
|
|
|
|
| 1661 |
for (int j = 0; j < QK4_0; j++) {
|
| 1662 |
float x = bw[j];
|
| 1663 |
int q = (int)(x * id + 8.5f);
|
| 1664 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
| 1665 |
float deq = ((float)q - 8.0f) * actual_d;
|
| 1666 |
+
float e = x - deq;
|
| 1667 |
+
e_arr[j] = e;
|
| 1668 |
+
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 1669 |
+
err += e * e * w;
|
| 1670 |
}
|
| 1671 |
+
cand_errors[blk][ci] = err + hex_spectral_penalty(e_arr, QK4_0);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1672 |
}
|
| 1673 |
}
|
| 1674 |
|
| 1675 |
/* ββ Phase 3: HPC graph β single quhit per block ββ */
|
| 1676 |
int *best_candidate = (int *)malloc(n_blocks * sizeof(int));
|
| 1677 |
+
int hpc_ran_q4 = 0;
|
| 1678 |
for (int64_t i = 0; i < n_blocks; i++)
|
| 1679 |
best_candidate[i] = 11; /* Q4_NEIGHBOR_MULTS[11] = 1.00 */
|
| 1680 |
|
|
|
|
| 1686 |
|
| 1687 |
HPCGraph *graph = hpc_create(n_sites);
|
| 1688 |
if (graph) {
|
| 1689 |
+
hpc_ran_q4 = 1;
|
| 1690 |
for (int64_t i = 0; i < n_sites; i++)
|
| 1691 |
triality_dft(&graph->locals[i]);
|
| 1692 |
|
|
|
|
| 1893 |
global_best_c = c;
|
| 1894 |
}
|
| 1895 |
}
|
| 1896 |
+
if (global_best < best_err * HEX_GREEDY_OVERRIDE_RATIO)
|
| 1897 |
best_candidate[b] = global_best_c;
|
| 1898 |
else
|
| 1899 |
best_candidate[b] = best_c;
|
|
|
|
| 1912 |
{
|
| 1913 |
#define Q4_BORN_SHOTS 128
|
| 1914 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1915 |
/* Build per-block CDFs from triality marginals */
|
| 1916 |
unsigned int born_rng = 314159;
|
| 1917 |
|
|
|
|
| 1920 |
for (int64_t bi = graph_blocks * stride; bi < n_blocks; bi++)
|
| 1921 |
tail_err_q4 += cand_errors[bi][best_candidate[bi]];
|
| 1922 |
|
| 1923 |
+
/* Beam-search baseline over the SAME set of blocks a Born
|
| 1924 |
+
* shot covers: stride representatives + tail. The previous
|
| 1925 |
+
* code summed the baseline over ALL blocks (including
|
| 1926 |
+
* mid-stride blocks the shots never touch), making shot_err
|
| 1927 |
+
* systematically smaller than the baseline and letting
|
| 1928 |
+
* strictly worse configurations be adopted whenever
|
| 1929 |
+
* stride > 1. */
|
| 1930 |
+
float beam_total_err = tail_err_q4;
|
| 1931 |
+
for (int64_t gi = 0; gi < graph_blocks; gi++) {
|
| 1932 |
+
int64_t rep = gi * stride;
|
| 1933 |
+
beam_total_err += cand_errors[rep][best_candidate[rep]];
|
| 1934 |
+
}
|
| 1935 |
+
|
| 1936 |
/* Sparse shot buffer: only track stride-sampled blocks */
|
| 1937 |
int *shot_sparse_q4 = (int *)malloc(graph_blocks * sizeof(int));
|
| 1938 |
|
|
|
|
| 2010 |
}
|
| 2011 |
}
|
| 2012 |
|
| 2013 |
+
/* Fallback when the HPC graph never ran (single block, or hpc_create
|
| 2014 |
+
* failure): pick the per-block argmin over the candidate grid instead
|
| 2015 |
+
* of silently leaving every block on the neutral Γ1.00 candidate. */
|
| 2016 |
+
if (!hpc_ran_q4) {
|
| 2017 |
+
#pragma omp parallel for schedule(static)
|
| 2018 |
+
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
| 2019 |
+
float best_e = cand_errors[blk][0];
|
| 2020 |
+
int best_c = 0;
|
| 2021 |
+
for (int c = 1; c < Q4_N_CAND; c++) {
|
| 2022 |
+
if (cand_errors[blk][c] < best_e) {
|
| 2023 |
+
best_e = cand_errors[blk][c];
|
| 2024 |
+
best_c = c;
|
| 2025 |
+
}
|
| 2026 |
+
}
|
| 2027 |
+
best_candidate[blk] = best_c;
|
| 2028 |
+
}
|
| 2029 |
+
}
|
| 2030 |
+
|
| 2031 |
/* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2032 |
* PHASE 4: Assemble blocks via least-squares scale extraction
|
| 2033 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
|
|
|
| 2053 |
}
|
| 2054 |
|
| 2055 |
float num = 0.0f, den = 0.0f;
|
| 2056 |
+
float dc4S = 0.0f, dc4Q = 0.0f;
|
| 2057 |
for (int j = 0; j < QK4_0; j++) {
|
| 2058 |
float q_centered = (float)qs_tmp[j] - 8.0f;
|
| 2059 |
float w = (imat_importance) ?
|
| 2060 |
imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 2061 |
num += w * bw[j] * q_centered;
|
| 2062 |
den += w * q_centered * q_centered;
|
| 2063 |
+
dc4S += bw[j];
|
| 2064 |
+
dc4Q += q_centered;
|
| 2065 |
}
|
| 2066 |
+
num += (HEX_DC_LAMBDA / (float)QK4_0) * dc4S * dc4Q;
|
| 2067 |
+
den += (HEX_DC_LAMBDA / (float)QK4_0) * dc4Q * dc4Q;
|
| 2068 |
|
| 2069 |
if (den > 1e-15f) {
|
| 2070 |
float d_new = num / den;
|
|
|
|
| 2104 |
float trial_d = gguf_fp16_to_fp32(ulp_candidates[ui]);
|
| 2105 |
float trial_id = (fabsf(trial_d) > 1e-15f) ? 1.0f / trial_d : 0.0f;
|
| 2106 |
float err = 0.0f;
|
| 2107 |
+
float e_ulp[QK4_0];
|
| 2108 |
for (int j = 0; j < QK4_0; j++) {
|
| 2109 |
int q = (int)(bw[j] * trial_id + 8.5f);
|
| 2110 |
if (q < 0) q = 0; if (q > 15) q = 15;
|
| 2111 |
float deq = ((float)q - 8.0f) * trial_d;
|
| 2112 |
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 2113 |
+
e_ulp[j] = bw[j] - deq;
|
| 2114 |
+
err += e_ulp[j] * e_ulp[j] * w;
|
| 2115 |
}
|
| 2116 |
+
err += hex_spectral_penalty(e_ulp, QK4_0);
|
| 2117 |
if (err < best_ulp_err) {
|
| 2118 |
best_ulp_err = err;
|
| 2119 |
best_d16 = ulp_candidates[ui];
|
|
|
|
| 2153 |
for (int j = 0; j < QK4_0; j++) dc_cur += e_live[j];
|
| 2154 |
float metric_cur = 4.0f * vesica_cur + dc_cur * dc_cur;
|
| 2155 |
|
| 2156 |
+
/* Deterministic greedy descent: only strict improvements.
|
| 2157 |
+
* The previous SA acceptance called rand() inside an OpenMP
|
| 2158 |
+
* parallel region (data race in the shared PRNG state, and
|
| 2159 |
+
* non-reproducible output). Uphill moves were pointless anyway:
|
| 2160 |
+
* the base-vs-shaped MSE guard below discards any shaped result
|
| 2161 |
+
* that ends up worse, so accepted uphill excursions could only
|
| 2162 |
+
* waste the pass budget or strand the descent. */
|
| 2163 |
for (int pass = 0; pass < QK4_0; pass++) {
|
| 2164 |
int best_k = -1;
|
| 2165 |
int best_q_alt = 0;
|
| 2166 |
+
float best_delta = 0.0f; /* strictly positive threshold */
|
| 2167 |
|
| 2168 |
for (int k = 0; k < QK4_0; k++) {
|
| 2169 |
int q_cur = q_shaped[k];
|
|
|
|
| 2191 |
}
|
| 2192 |
}
|
| 2193 |
|
| 2194 |
+
if (best_k < 0) break; /* converged β no improving flip */
|
| 2195 |
|
| 2196 |
+
q_shaped[best_k] = best_q_alt;
|
| 2197 |
+
{
|
|
|
|
| 2198 |
float deq_commit = ((float)best_q_alt - 8.0f) * actual_d;
|
| 2199 |
float e_new_commit = bw[best_k] - deq_commit;
|
| 2200 |
float de_commit = e_new_commit - e_live[best_k];
|
|
|
|
| 2209 |
|
| 2210 |
v_live[pi_commit] = v_new_commit;
|
| 2211 |
e_live[best_k] = e_new_commit;
|
|
|
|
|
|
|
| 2212 |
}
|
|
|
|
| 2213 |
}
|
| 2214 |
}
|
| 2215 |
|
| 2216 |
float err_base = 0.0f, err_shaped = 0.0f;
|
| 2217 |
+
float e_gb[QK4_0], e_gs[QK4_0];
|
| 2218 |
for (int j = 0; j < QK4_0; j++) {
|
| 2219 |
float w = (imat_importance) ? imat_importance[blk * QK4_0 + j] : 1.0f;
|
| 2220 |
float deq_b = ((float)q_base[j] - 8.0f) * actual_d;
|
| 2221 |
float deq_s = ((float)q_shaped[j] - 8.0f) * actual_d;
|
| 2222 |
+
e_gb[j] = bw[j] - deq_b;
|
| 2223 |
+
e_gs[j] = bw[j] - deq_s;
|
| 2224 |
+
err_base += e_gb[j] * e_gb[j] * w;
|
| 2225 |
+
err_shaped += e_gs[j] * e_gs[j] * w;
|
| 2226 |
}
|
| 2227 |
+
err_base += hex_spectral_penalty(e_gb, QK4_0);
|
| 2228 |
+
err_shaped += hex_spectral_penalty(e_gs, QK4_0);
|
| 2229 |
int *q_final = (err_shaped <= err_base) ? q_shaped : q_base;
|
| 2230 |
|
| 2231 |
for (int j = 0; j < QK4_0 / 2; j++) {
|
|
|
|
| 2246 |
free(best_candidate);
|
| 2247 |
}
|
| 2248 |
|
| 2249 |
+
/* Quantise one Phase-1 float value to a 4-bit code for the given step:
 * nearest integer of value/step clamped to [0, 15], or 0 when the step is
 * not above 1e-15. */
static inline uint8_t hex_subscale_code(float value, float step)
{
    if (!(step > 1e-15f))
        return 0;

    int code = gguf_nearest_int(value / step);
    if (code < 0)
        code = 0;
    if (code > 15)
        code = 15;
    return (uint8_t)code;
}

/* Re-derive the 4-bit sub-scale codes (Ls, Lm) for a candidate (d, dmin)
 * pair from the Phase-1 float scales/mins.  Bit-identical to the Phase-2b
 * candidate generation, so stored codes are unnecessary.
 *
 *   scales, mins   16 per-sub-block float scales / mins (read only)
 *   actual_dm      candidate d   (step for the scale codes)
 *   actual_mm      candidate dmin (step for the min codes)
 *   Ls, Lm         outputs, 16 codes each, every code in [0, 15] */
static inline void hex_derive_subscales(const float *scales, const float *mins,
                                        float actual_dm, float actual_mm,
                                        uint8_t *Ls, uint8_t *Lm)
{
    for (int sub = 0; sub < 16; sub++) {
        Ls[sub] = hex_subscale_code(scales[sub], actual_dm);
        Lm[sub] = hex_subscale_code(mins[sub], actual_mm);
    }
}
|
| 2269 |
+
|
| 2270 |
static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
| 2271 |
BlockQ2K *output, float *out_total_error,
|
| 2272 |
OptimizerMode opt_mode,
|
|
|
|
| 2277 |
float total_err = 0.0f;
|
| 2278 |
const int N_SUB = QK_K / 16;
|
| 2279 |
|
|
|
|
|
|
|
| 2280 |
/* ββ Outlier Clamping for WLS Seeds ββ
|
| 2281 |
* Protects the Phase 1 greedy seed from being violently warped by extreme
|
| 2282 |
* >4.0 sigma outliers, which creates better centering for the grid search. */
|
| 2283 |
+
double t_sum_sq = 0.0, t_sum_4 = 0.0;
|
| 2284 |
+
for (int64_t i = 0; i < n_elements; i++) {
|
| 2285 |
+
double w2 = (double)weights[i] * (double)weights[i];
|
| 2286 |
+
t_sum_sq += w2;
|
| 2287 |
+
t_sum_4 += w2 * w2;
|
| 2288 |
+
}
|
| 2289 |
+
float w_sigma = sqrtf((float)(t_sum_sq / (double)n_elements));
|
| 2290 |
+
|
| 2291 |
+
/* ββ Adaptive outlier clamp (kurtosis-driven) ββ
|
| 2292 |
+
* The fixed 3.5Ο clamp suppressed the heavy-tail mass that dominates
|
| 2293 |
+
* reconstruction error, inflating RMSE on near-Gaussian tensors that did
|
| 2294 |
+
* not need clamping at all. Instead, gate the clamp on the tensor's raw
|
| 2295 |
+
* kurtosis (Gaussian = 3): leave near-Gaussian tensors untouched and only
|
| 2296 |
+
* apply a stabilising clamp to genuinely heavy-tailed tensors, where the
|
| 2297 |
+
* final (d, dmin) refit later recovers fidelity against the UNCLIPPED
|
| 2298 |
+
* weights anyway. */
|
| 2299 |
+
double t_var = t_sum_sq / (double)n_elements;
|
| 2300 |
+
double t_kurt = (t_var > 1e-30) ? (t_sum_4 / (double)n_elements) / (t_var * t_var) : 3.0;
|
| 2301 |
+
float clamp_sigma;
|
| 2302 |
+
if (t_kurt <= 6.0) clamp_sigma = 1.0e9f; /* ~Gaussian: effectively no clamp */
|
| 2303 |
+
else if (t_kurt <= 20.0) clamp_sigma = 6.0f; /* moderately heavy tails */
|
| 2304 |
+
else clamp_sigma = 4.0f; /* very heavy tails: stabilise seed */
|
| 2305 |
+
float clamp_val = w_sigma * clamp_sigma;
|
| 2306 |
|
| 2307 |
/* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2308 |
* PHASE 1: Greedy quantization β produce seed (d, dmin) per block
|
|
|
|
| 2338 |
if (v > clamp_val) v = clamp_val;
|
| 2339 |
if (v < -clamp_val) v = -clamp_val;
|
| 2340 |
sx_clipped[l] = v;
|
| 2341 |
+
/* Activation-aware weighting: an imatrix entry already encodes
|
| 2342 |
+
* E[a^2] for that column, which is the correct weight for
|
| 2343 |
+
* minimising output (dot-product) error. Use it directly rather
|
| 2344 |
+
* than re-multiplying by the |w| magnitude heuristic, which
|
| 2345 |
+
* double-counts magnitude. Without an imatrix, fall back to the
|
| 2346 |
+
* magnitude-relative heuristic. */
|
| 2347 |
+
wt[l] = (imat_importance)
|
| 2348 |
+
? imp
|
| 2349 |
+
: sqrtf(sigma2 + sx_clipped[l] * sx_clipped[l]);
|
| 2350 |
seeds[blk].sw[j] += wt[l];
|
| 2351 |
}
|
| 2352 |
seeds[blk].scales[j] = hpc_make_qkx2_quants(16, 3, sx_clipped, wt,
|
|
|
|
| 2366 |
* ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 2367 |
|
| 2368 |
    /* Expanded neighborhood around WLS optimum: roughly ±30% with 24
     * candidates per axis (the d multipliers below span 0.780 … 1.340). */
    /* d is the sensitive axis, so concentrate resolution near 1.0 while
     * keeping wide tails for blocks whose WLS seed is off.  1.000 stays at
     * index 11 so the neutral-candidate fallback/init remains valid. */
|
| 2372 |
static const float NEIGHBOR_MULTS_D[N_CAND_D] = {
|
| 2373 |
+
0.780f, 0.835f, 0.880f, 0.915f, 0.943f, 0.963f,
|
| 2374 |
+
0.978f, 0.988f, 0.994f, 0.997f, 0.999f, 1.000f,
|
| 2375 |
+
1.002f, 1.005f, 1.011f, 1.021f, 1.035f, 1.054f,
|
| 2376 |
+
1.080f, 1.115f, 1.160f, 1.215f, 1.275f, 1.340f
|
| 2377 |
};
|
| 2378 |
static const float NEIGHBOR_MULTS_M[N_CAND_M] = {
|
| 2379 |
0.750f, 0.800f, 0.840f, 0.870f, 0.900f, 0.920f,
|
|
|
|
| 2390 |
float (*candidate_errors)[TOTAL_SCALE_CANDIDATES] = NULL;
|
| 2391 |
uint16_t (*candidate_d)[TOTAL_SCALE_CANDIDATES] = NULL;
|
| 2392 |
uint16_t (*candidate_dmin)[TOTAL_SCALE_CANDIDATES] = NULL;
|
|
|
|
|
|
|
| 2393 |
|
| 2394 |
candidate_errors = (float (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
|
| 2395 |
sizeof(float[TOTAL_SCALE_CANDIDATES]));
|
|
|
|
| 2397 |
sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
|
| 2398 |
candidate_dmin = (uint16_t (*)[TOTAL_SCALE_CANDIDATES])calloc(n_blocks,
|
| 2399 |
sizeof(uint16_t[TOTAL_SCALE_CANDIDATES]));
|
| 2400 |
+
    /* NOTE: the per-candidate sub-scale codes (Ls/Lm) are NOT stored.
     * They are a pure function of (seeds[blk].scales/mins, candidate fp16
     * d/dmin) and are re-derived where needed.  Storing them cost
     * n_blocks × 576 × 16 × 2 bytes ≈ 18 KB/superblock — multiple GB of
     * peak RSS on large FFN tensors — for data used at exactly one index. */
|
| 2405 |
|
| 2406 |
#pragma omp parallel for schedule(dynamic, 16)
|
| 2407 |
for (int64_t blk = 0; blk < n_blocks; blk++) {
|
|
|
|
| 2509 |
trial_Lm[j] = (uint8_t)lm;
|
| 2510 |
} else { trial_Lm[j] = 0; }
|
| 2511 |
}
|
|
|
|
|
|
|
| 2512 |
|
| 2513 |
+
/* Error evaluation MUST use the non-clipped original weights.
|
| 2514 |
+
* Exact importance-weighted SSE β the same objective the
|
| 2515 |
+
* assembly/polish phases minimise and the reported RMSE. */
|
| 2516 |
+
float err = 0.0f;
|
| 2517 |
+
float e_arr[QK_K];
|
| 2518 |
for (int i = 0; i < QK_K; i++) {
|
| 2519 |
int jj = i >> 4;
|
| 2520 |
float d = actual_dm * (float)trial_Ls[jj];
|
| 2521 |
float m = actual_mm * (float)trial_Lm[jj];
|
| 2522 |
+
float x = block_x[i];
|
| 2523 |
+
float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
|
| 2524 |
+
float e;
|
| 2525 |
if (d < 1e-15f) {
|
| 2526 |
+
/* Decoder semantics: deq = dΒ·lsΒ·q β dminΒ·lm = βm here */
|
| 2527 |
+
e = x + m;
|
| 2528 |
} else {
|
| 2529 |
int q = gguf_nearest_int((x + m) / d);
|
| 2530 |
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 2531 |
+
e = x - (d * (float)q - m);
|
| 2532 |
}
|
| 2533 |
+
e_arr[i] = e;
|
| 2534 |
+
err += e * e * w;
|
| 2535 |
}
|
| 2536 |
+
candidate_errors[blk][cidx] =
|
| 2537 |
+
err + hex_spectral_penalty(e_arr, QK_K);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2538 |
}
|
| 2539 |
}
|
| 2540 |
}
|
|
|
|
| 2895 |
g_cand = vit_c;
|
| 2896 |
}
|
| 2897 |
}
|
| 2898 |
+
if (g_best < cur_err * HEX_GREEDY_OVERRIDE_RATIO)
|
| 2899 |
best_candidate[vit_b] = g_cand;
|
| 2900 |
}
|
| 2901 |
|
|
|
|
| 2967 |
float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
|
| 2968 |
float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 2969 |
|
| 2970 |
+
uint8_t dc_Ls[16], dc_Lm[16];
|
| 2971 |
+
hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
|
| 2972 |
+
dm0, mm0, dc_Ls, dc_Lm);
|
| 2973 |
+
|
| 2974 |
/* Bias applied to THIS block's WLS targets */
|
| 2975 |
float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
|
| 2976 |
block_dc_bias[blk] = dc_bias;
|
|
|
|
| 2981 |
float dc_res = 0.0f;
|
| 2982 |
int j, k;
|
| 2983 |
for (j = 0; j < N_SUB; j++) {
|
| 2984 |
+
float d_sub = dm0 * (float)dc_Ls[j];
|
| 2985 |
+
float m_sub = mm0 * (float)dc_Lm[j];
|
| 2986 |
for (k = 0; k < 16; k++) {
|
| 2987 |
float x_adj = bx[16*j + k] - dc_bias;
|
| 2988 |
int q = 0;
|
|
|
|
| 3033 |
adj_block_x[_i] = block_x[_i] - dc_adj;
|
| 3034 |
}
|
| 3035 |
|
|
|
|
|
|
|
|
|
|
| 3036 |
float dm = gguf_fp16_to_fp32(candidate_d[blk][cidx]);
|
| 3037 |
float mm = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
|
| 3038 |
|
| 3039 |
+
hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
|
| 3040 |
+
dm, mm, Ls_blk, Lm_blk);
|
| 3041 |
+
|
| 3042 |
uint16_t prev_dm16 = 0, prev_mm16 = 0;
|
| 3043 |
for (int ls_iter = 0; ls_iter < 5; ls_iter++) {
|
| 3044 |
|
|
|
|
| 3328 |
int jj = i >> 4;
|
| 3329 |
float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
|
| 3330 |
float m_s = mm * (float)(output[blk].scales[jj] >> 4);
|
| 3331 |
+
/* Decoder semantics: deq = d_sΒ·q β m_s, which is βm_s when
|
| 3332 |
+
* d_s == 0 (NOT 0 β the βdminΒ·lm term always applies). */
|
| 3333 |
+
float deq = d_s * (float)q_shaped_all[i] - m_s;
|
| 3334 |
/* Residual against the adjusted target (DC-corrected view) */
|
| 3335 |
e_live[i] = adj_block_x[i] - deq;
|
| 3336 |
}
|
|
|
|
| 3397 |
}
|
| 3398 |
}
|
| 3399 |
|
| 3400 |
+
/* Choose base vs shaped on the EXTENDED objective vs originals */
|
| 3401 |
float err_base = 0.0f, err_shaped = 0.0f;
|
| 3402 |
+
float e_qb[QK_K], e_qs[QK_K];
|
| 3403 |
for (int i = 0; i < QK_K; i++) {
|
| 3404 |
int jj = i >> 4;
|
| 3405 |
float d_s = dm * (float)(output[blk].scales[jj] & 0xF);
|
| 3406 |
float m_s = mm * (float)(output[blk].scales[jj] >> 4);
|
| 3407 |
float w = (imat_importance) ? imat_importance[blk * QK_K + i] : 1.0f;
|
| 3408 |
+
float deq_b = d_s * (float)q_base_all[i] - m_s; /* βm_s when d_s==0 */
|
| 3409 |
+
float deq_s = d_s * (float)q_shaped_all[i] - m_s;
|
| 3410 |
float xv = block_x[i]; /* original weight for error report */
|
| 3411 |
+
e_qb[i] = xv - deq_b;
|
| 3412 |
+
e_qs[i] = xv - deq_s;
|
| 3413 |
+
err_base += e_qb[i] * e_qb[i] * w;
|
| 3414 |
+
err_shaped += e_qs[i] * e_qs[i] * w;
|
| 3415 |
}
|
| 3416 |
+
err_base += hex_spectral_penalty(e_qb, QK_K);
|
| 3417 |
+
err_shaped += hex_spectral_penalty(e_qs, QK_K);
|
| 3418 |
{
|
| 3419 |
int use_shaped = (err_shaped <= err_base);
|
| 3420 |
for (int i = 0; i < QK_K; i++)
|
|
|
|
| 3483 |
}
|
| 3484 |
}
|
| 3485 |
|
| 3486 |
+
/* ββ Final closed-form (d, dmin) refit against the UNCLIPPED weights ββ
|
| 3487 |
+
* (issues #2 / #5)
|
| 3488 |
+
*
|
| 3489 |
+
* Every earlier (d, dmin) solve fits the DC-adjusted, soft-clipped
|
| 3490 |
+
* target and runs BEFORE the greedy descent and Floyd-Steinberg passes
|
| 3491 |
+
* mutate the committed 2-bit codes. Once L[], and the 4-bit sub-block
|
| 3492 |
+
* scale codes (Ls = scales & 0xF, Lm = scales >> 4), are final, the two
|
| 3493 |
+
* fp16 scalars (d, dmin) that minimise the importance-weighted SSE
|
| 3494 |
+
* against the ORIGINAL weights have a closed form. Solve it and adopt it
|
| 3495 |
+
* only when it lowers the weighted block error β so it can never raise
|
| 3496 |
+
* RMSE, and because the integer codes are held fixed, the vesica/wave/DC
|
| 3497 |
+
* error shaping baked into them is preserved intact. */
|
| 3498 |
+
{
|
| 3499 |
+
double rSaa = 0, rSab = 0, rSbb = 0, rSxa = 0, rSxb = 0;
|
| 3500 |
+
double rA = 0, rB = 0, rS = 0; /* DC rank-1 augmentation */
|
| 3501 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3502 |
+
float ls_f = (float)(output[blk].scales[j] & 0xF);
|
| 3503 |
+
float lm_f = (float)(output[blk].scales[j] >> 4);
|
| 3504 |
+
for (int k = 0; k < 16; k++) {
|
| 3505 |
+
int idx = 16 * j + k;
|
| 3506 |
+
float x = block_x[idx]; /* unclipped original */
|
| 3507 |
+
float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3508 |
+
float a = ls_f * (float)L[idx];
|
| 3509 |
+
float b = lm_f;
|
| 3510 |
+
rSaa += (double)w * a * a;
|
| 3511 |
+
rSab += (double)w * a * b;
|
| 3512 |
+
rSbb += (double)w * b * b;
|
| 3513 |
+
rSxa += (double)w * x * a;
|
| 3514 |
+
rSxb += (double)w * x * b;
|
| 3515 |
+
rA += a; rB += b; rS += x;
|
| 3516 |
+
}
|
| 3517 |
+
}
|
| 3518 |
+
/* DC term as one augmented observation (S ~ AΒ·d β BΒ·m), weight
|
| 3519 |
+
* Ξ»_dc/n; vesica/wave handled by the extended-E acceptance. */
|
| 3520 |
+
{
|
| 3521 |
+
double rw = (double)HEX_DC_LAMBDA / (double)QK_K;
|
| 3522 |
+
rSaa += rw * rA * rA; rSab += rw * rA * rB;
|
| 3523 |
+
rSbb += rw * rB * rB; rSxa += rw * rS * rA;
|
| 3524 |
+
rSxb += rw * rS * rB;
|
| 3525 |
+
}
|
| 3526 |
+
double rdet = rSaa * rSbb - rSab * rSab;
|
| 3527 |
+
if (fabs(rdet) > 1e-30) {
|
| 3528 |
+
double d_ref = (rSbb * rSxa - rSab * rSxb) / rdet;
|
| 3529 |
+
double m_ref = (rSab * rSxa - rSaa * rSxb) / rdet;
|
| 3530 |
+
if (d_ref > 0.0) {
|
| 3531 |
+
float dm_try = gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)d_ref));
|
| 3532 |
+
float mm_try = (m_ref > 0.0)
|
| 3533 |
+
? gguf_fp16_to_fp32(gguf_fp32_to_fp16((float)m_ref))
|
| 3534 |
+
: mm;
|
| 3535 |
+
/* Extended-objective acceptance test vs original weights. */
|
| 3536 |
+
float err_cur = 0.0f, err_try = 0.0f;
|
| 3537 |
+
float e_rc[QK_K], e_rt[QK_K];
|
| 3538 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3539 |
+
float ls_f = (float)(output[blk].scales[j] & 0xF);
|
| 3540 |
+
float lm_f = (float)(output[blk].scales[j] >> 4);
|
| 3541 |
+
for (int k = 0; k < 16; k++) {
|
| 3542 |
+
int idx = 16 * j + k;
|
| 3543 |
+
float x = block_x[idx];
|
| 3544 |
+
float w = (imat_importance) ? imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3545 |
+
float qf = (float)L[idx];
|
| 3546 |
+
float dc = dm * ls_f * qf - mm * lm_f;
|
| 3547 |
+
float dt = dm_try * ls_f * qf - mm_try * lm_f;
|
| 3548 |
+
e_rc[idx] = x - dc;
|
| 3549 |
+
e_rt[idx] = x - dt;
|
| 3550 |
+
err_cur += e_rc[idx] * e_rc[idx] * w;
|
| 3551 |
+
err_try += e_rt[idx] * e_rt[idx] * w;
|
| 3552 |
+
}
|
| 3553 |
+
}
|
| 3554 |
+
err_cur += hex_spectral_penalty(e_rc, QK_K);
|
| 3555 |
+
err_try += hex_spectral_penalty(e_rt, QK_K);
|
| 3556 |
+
if (err_try < err_cur) { dm = dm_try; mm = mm_try; }
|
| 3557 |
+
}
|
| 3558 |
+
}
|
| 3559 |
+
output[blk].d = gguf_fp32_to_fp16(dm);
|
| 3560 |
+
output[blk].dmin = gguf_fp32_to_fp16(mm);
|
| 3561 |
+
}
|
| 3562 |
+
|
| 3563 |
+
/* ββ PHASE 4.6: MONOTONE COORDINATE-DESCENT POLISH (RMSE-guaranteed) ββ
|
| 3564 |
+
*
|
| 3565 |
+
* Objective-function mismatch fix: the final passes that commit the
|
| 3566 |
+
* 2-bit codes β the 16Γ16 (ls, lm) sub-block search, the Β±8 ULP
|
| 3567 |
+
* (d, dmin) neighborhood search, and the greedy-descent error shaping
|
| 3568 |
+
* β all minimise error against the DC-ADJUSTED target adj_block_x.
|
| 3569 |
+
* The reported RMSE, however, is measured against the ORIGINAL
|
| 3570 |
+
* weights. The codes are therefore stranded at the optimum of a
|
| 3571 |
+
* SHIFTED objective, while only the scalar (d, dmin) refit above
|
| 3572 |
+
* targets the true one (and it holds all codes frozen).
|
| 3573 |
+
*
|
| 3574 |
+
* This polish runs alternating coordinate descent on the TRUE
|
| 3575 |
+
* objective (importance-weighted SSE vs the original weights):
|
| 3576 |
+
*
|
| 3577 |
+
* (1) For each 16-weight sub-block, an exact joint re-search of
|
| 3578 |
+
* (ls, lm) over the full 16Γ16 grid with per-weight optimal
|
| 3579 |
+
* q β {0..3}, committed only on strict improvement of the
|
| 3580 |
+
* extended objective E. With Ξ»_dc = Ξ»_vw = 0 sub-blocks are
|
| 3581 |
+
* independent given (d, dmin); with spectral terms active the
|
| 3582 |
+
* coupling (DC: all subs; fold: sub j β sub jβ8) is handled
|
| 3583 |
+
* exactly via live residual bookkeeping.
|
| 3584 |
+
* (2) Closed-form weighted LS refit of the two fp16 scalars
|
| 3585 |
+
* (d, dmin) with all codes held fixed, committed only on
|
| 3586 |
+
* strict improvement (same guard as the refit above).
|
| 3587 |
+
*
|
| 3588 |
+
* All moves are accept-only-if-better on E β the extended block
|
| 3589 |
+
* objective is monotonically non-increasing; at Ξ» = 0 this reduces
|
| 3590 |
+
* to RMSE-monotone (final RMSE can only go DOWN relative to the
|
| 3591 |
+
* unpatched pipeline), at Ξ» > 0 small RMSE giveback is permitted
|
| 3592 |
+
* exactly where it buys dot-product error cancellation. The state space is finite
|
| 3593 |
+
* (4-bit codes, fp16 scalars), so the loop terminates; in practice
|
| 3594 |
+
* it converges in 2β3 sweeps. The vesica/DC spectral shaping baked
|
| 3595 |
+
* into L survives wherever it is SSE-neutral, and is overridden
|
| 3596 |
+
* only where it was costing true reconstruction error. */
|
| 3597 |
+
{
|
| 3598 |
+
uint8_t pl_Ls[16], pl_Lm[16];
|
| 3599 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3600 |
+
pl_Ls[j] = output[blk].scales[j] & 0xF;
|
| 3601 |
+
pl_Lm[j] = output[blk].scales[j] >> 4;
|
| 3602 |
+
}
|
| 3603 |
+
|
| 3604 |
+
for (int pol_iter = 0; pol_iter < 6; pol_iter++) {
|
| 3605 |
+
int pol_improved = 0;
|
| 3606 |
+
|
| 3607 |
+
/* ββ (1) Exact per-sub-block (ls, lm, q) re-search on the
|
| 3608 |
+
* EXTENDED objective. Under the spectral terms sub-blocks
|
| 3609 |
+
* are no longer independent: every sub couples to all others
|
| 3610 |
+
* through the DC term and to its fold partner (sub j β 8,
|
| 3611 |
+
* i.e. weights i β i+128) through vesicaΒ² β waveΒ². The
|
| 3612 |
+
* search therefore keeps live residuals pe[] and scores each
|
| 3613 |
+
* candidate against the whole-block penalty with the partner
|
| 3614 |
+
* residuals held fixed β exact coordinate descent on E. */
|
| 3615 |
+
float pe[QK_K];
|
| 3616 |
+
float sub_sse[16], sub_dc[16], pair_cross[8];
|
| 3617 |
+
float dc_tot = 0.0f, cross_tot = 0.0f;
|
| 3618 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3619 |
+
float d_sub = dm * (float)pl_Ls[j];
|
| 3620 |
+
float m_sub = mm * (float)pl_Lm[j];
|
| 3621 |
+
sub_sse[j] = 0.0f;
|
| 3622 |
+
sub_dc[j] = 0.0f;
|
| 3623 |
+
for (int k = 0; k < 16; k++) {
|
| 3624 |
+
int idx = 16 * j + k;
|
| 3625 |
+
float w = (imat_importance) ?
|
| 3626 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3627 |
+
                         /* deq = d·ls·q − dmin·lm; equals −m_sub at ls==0 */
|
| 3628 |
+
float e = block_x[idx] - (d_sub * (float)L[idx] - m_sub);
|
| 3629 |
+
pe[idx] = e;
|
| 3630 |
+
sub_sse[j] += e * e * w;
|
| 3631 |
+
sub_dc[j] += e;
|
| 3632 |
+
}
|
| 3633 |
+
dc_tot += sub_dc[j];
|
| 3634 |
+
}
|
| 3635 |
+
for (int p = 0; p < 8; p++) {
|
| 3636 |
+
pair_cross[p] = 0.0f;
|
| 3637 |
+
for (int k = 0; k < 16; k++)
|
| 3638 |
+
pair_cross[p] += pe[16*p + k] * pe[16*(p+8) + k];
|
| 3639 |
+
cross_tot += pair_cross[p];
|
| 3640 |
+
}
|
| 3641 |
+
|
| 3642 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3643 |
+
const float *sx = block_x + 16 * j;
|
| 3644 |
+
int pi = j & 7; /* fold-pair index */
|
| 3645 |
+
int pj = j ^ 8; /* partner sub-block */
|
| 3646 |
+
const float *ppe = pe + 16 * pj; /* partner residuals */
|
| 3647 |
+
float dc_rest = dc_tot - sub_dc[j];
|
| 3648 |
+
float cross_rest = cross_tot - pair_cross[pi];
|
| 3649 |
+
|
| 3650 |
+
/* Extended score of the CURRENT committed state */
|
| 3651 |
+
float best_sub = sub_sse[j]
|
| 3652 |
+
+ (HEX_DC_LAMBDA / (float)QK_K) * dc_tot * dc_tot
|
| 3653 |
+
+ (HEX_VW_LAMBDA / (float)QK_K) * 4.0f * cross_tot;
|
| 3654 |
+
int best_ls = -1, best_lm = 0;
|
| 3655 |
+
uint8_t best_q[16];
|
| 3656 |
+
float best_e[16];
|
| 3657 |
+
float best_sse = 0.0f, best_dcc = 0.0f, best_cxc = 0.0f;
|
| 3658 |
+
|
| 3659 |
+
for (int try_ls = 0; try_ls <= 15; try_ls++) {
|
| 3660 |
+
float d_sub = dm * (float)try_ls;
|
| 3661 |
+
for (int try_lm = 0; try_lm <= 15; try_lm++) {
|
| 3662 |
+
float m_sub = mm * (float)try_lm;
|
| 3663 |
+
float sub_err = 0.0f, dcc = 0.0f, cxc = 0.0f;
|
| 3664 |
+
uint8_t q_loc[16];
|
| 3665 |
+
float e_loc[16];
|
| 3666 |
+
int aborted = 0;
|
| 3667 |
+
for (int k = 0; k < 16; k++) {
|
| 3668 |
+
float x = sx[k];
|
| 3669 |
+
float w = (imat_importance) ?
|
| 3670 |
+
imat_importance[blk * QK_K + 16*j + k] : 1.0f;
|
| 3671 |
+
int q = 0;
|
| 3672 |
+
if (d_sub >= 1e-15f) {
|
| 3673 |
+
q = gguf_nearest_int((x + m_sub) / d_sub);
|
| 3674 |
+
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 3675 |
+
}
|
| 3676 |
+
q_loc[k] = (uint8_t)q;
|
| 3677 |
+
                                 /* deq = d·ls·q − dmin·lm; −m_sub at ls==0 */
|
| 3678 |
+
float e = x - (d_sub * (float)q - m_sub);
|
| 3679 |
+
e_loc[k] = e;
|
| 3680 |
+
sub_err += e * e * w;
|
| 3681 |
+
dcc += e;
|
| 3682 |
+
cxc += e * ppe[k];
|
| 3683 |
+
/* SSE-partial prune is a valid lower bound
|
| 3684 |
+
                                  * only while the spectral terms are ≥ 0,
|
| 3685 |
+
* i.e. when the (signable) vw credit is off */
|
| 3686 |
+
if (HEX_VW_LAMBDA == 0.0f &&
|
| 3687 |
+
sub_err >= best_sub) { aborted = 1; break; }
|
| 3688 |
+
}
|
| 3689 |
+
if (aborted) continue;
|
| 3690 |
+
float score = sub_err
|
| 3691 |
+
+ (HEX_DC_LAMBDA / (float)QK_K)
|
| 3692 |
+
* (dc_rest + dcc) * (dc_rest + dcc)
|
| 3693 |
+
+ (HEX_VW_LAMBDA / (float)QK_K) * 4.0f
|
| 3694 |
+
* (cross_rest + cxc);
|
| 3695 |
+
if (score < best_sub) {
|
| 3696 |
+
best_sub = score;
|
| 3697 |
+
best_ls = try_ls;
|
| 3698 |
+
best_lm = try_lm;
|
| 3699 |
+
memcpy(best_q, q_loc, 16);
|
| 3700 |
+
memcpy(best_e, e_loc, sizeof(e_loc));
|
| 3701 |
+
best_sse = sub_err;
|
| 3702 |
+
best_dcc = dcc;
|
| 3703 |
+
best_cxc = cxc;
|
| 3704 |
+
}
|
| 3705 |
+
}
|
| 3706 |
+
}
|
| 3707 |
+
|
| 3708 |
+
if (best_ls >= 0) { /* strict improvement in E found */
|
| 3709 |
+
pl_Ls[j] = (uint8_t)best_ls;
|
| 3710 |
+
pl_Lm[j] = (uint8_t)best_lm;
|
| 3711 |
+
memcpy(L + 16 * j, best_q, 16);
|
| 3712 |
+
memcpy(pe + 16 * j, best_e, sizeof(best_e));
|
| 3713 |
+
sub_sse[j] = best_sse;
|
| 3714 |
+
sub_dc[j] = best_dcc;
|
| 3715 |
+
pair_cross[pi] = best_cxc;
|
| 3716 |
+
dc_tot = dc_rest + best_dcc;
|
| 3717 |
+
cross_tot = cross_rest + best_cxc;
|
| 3718 |
+
pol_improved = 1;
|
| 3719 |
+
}
|
| 3720 |
+
}
|
| 3721 |
+
|
| 3722 |
+
                 /* ── (2) Closed-form (d, dmin) refit vs ORIGINAL, codes fixed ── */
|
| 3723 |
+
{
|
| 3724 |
+
double pSaa = 0, pSab = 0, pSbb = 0, pSxa = 0, pSxb = 0;
|
| 3725 |
+
double pA = 0, pB = 0, pS = 0; /* DC rank-1 augmentation */
|
| 3726 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3727 |
+
float ls_f = (float)pl_Ls[j];
|
| 3728 |
+
float lm_f = (float)pl_Lm[j];
|
| 3729 |
+
for (int k = 0; k < 16; k++) {
|
| 3730 |
+
int idx = 16 * j + k;
|
| 3731 |
+
float x = block_x[idx];
|
| 3732 |
+
float w = (imat_importance) ?
|
| 3733 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3734 |
+
float a = ls_f * (float)L[idx];
|
| 3735 |
+
float b = lm_f;
|
| 3736 |
+
pSaa += (double)w * a * a;
|
| 3737 |
+
pSab += (double)w * a * b;
|
| 3738 |
+
pSbb += (double)w * b * b;
|
| 3739 |
+
pSxa += (double)w * x * a;
|
| 3740 |
+
pSxb += (double)w * x * b;
|
| 3741 |
+
pA += a; pB += b; pS += x;
|
| 3742 |
+
}
|
| 3743 |
+
}
|
| 3744 |
+
{
|
| 3745 |
+
double pw = (double)HEX_DC_LAMBDA / (double)QK_K;
|
| 3746 |
+
pSaa += pw * pA * pA; pSab += pw * pA * pB;
|
| 3747 |
+
pSbb += pw * pB * pB; pSxa += pw * pS * pA;
|
| 3748 |
+
pSxb += pw * pS * pB;
|
| 3749 |
+
}
|
| 3750 |
+
double pdet = pSaa * pSbb - pSab * pSab;
|
| 3751 |
+
if (fabs(pdet) > 1e-30) {
|
| 3752 |
+
double d_ref = (pSbb * pSxa - pSab * pSxb) / pdet;
|
| 3753 |
+
double m_ref = (pSab * pSxa - pSaa * pSxb) / pdet;
|
| 3754 |
+
if (d_ref > 0.0) {
|
| 3755 |
+
float dm_try = gguf_fp16_to_fp32(
|
| 3756 |
+
gguf_fp32_to_fp16((float)d_ref));
|
| 3757 |
+
float mm_try = (m_ref > 0.0)
|
| 3758 |
+
? gguf_fp16_to_fp32(
|
| 3759 |
+
gguf_fp32_to_fp16((float)m_ref))
|
| 3760 |
+
: mm;
|
| 3761 |
+
float err_cur = 0.0f, err_try = 0.0f;
|
| 3762 |
+
float e_pc[QK_K], e_pt[QK_K];
|
| 3763 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3764 |
+
float ls_f = (float)pl_Ls[j];
|
| 3765 |
+
float lm_f = (float)pl_Lm[j];
|
| 3766 |
+
for (int k = 0; k < 16; k++) {
|
| 3767 |
+
int idx = 16 * j + k;
|
| 3768 |
+
float x = block_x[idx];
|
| 3769 |
+
float w = (imat_importance) ?
|
| 3770 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3771 |
+
float qf = (float)L[idx];
|
| 3772 |
+
float dc = dm * ls_f * qf - mm * lm_f;
|
| 3773 |
+
float dt = dm_try * ls_f * qf - mm_try * lm_f;
|
| 3774 |
+
e_pc[idx] = x - dc;
|
| 3775 |
+
e_pt[idx] = x - dt;
|
| 3776 |
+
err_cur += e_pc[idx] * e_pc[idx] * w;
|
| 3777 |
+
err_try += e_pt[idx] * e_pt[idx] * w;
|
| 3778 |
+
}
|
| 3779 |
+
}
|
| 3780 |
+
err_cur += hex_spectral_penalty(e_pc, QK_K);
|
| 3781 |
+
err_try += hex_spectral_penalty(e_pt, QK_K);
|
| 3782 |
+
if (err_try < err_cur) {
|
| 3783 |
+
dm = dm_try;
|
| 3784 |
+
mm = mm_try;
|
| 3785 |
+
pol_improved = 1;
|
| 3786 |
+
}
|
| 3787 |
+
}
|
| 3788 |
+
}
|
| 3789 |
+
}
|
| 3790 |
+
|
| 3791 |
+
if (!pol_improved) {
|
| 3792 |
+
                     /* ── (3) ±2 ULP joint (d, dmin) micro-search vs ORIGINAL ──
|
| 3793 |
+
* The closed-form refit rounds its real-valued optimum to
|
| 3794 |
+
                      * fp16, which can land 1–2 ULP away from the best
|
| 3795 |
+
                      * representable pair (and the earlier ±8 ULP search ran
|
| 3796 |
+
* against the DC-shifted objective). With codes fixed,
|
| 3797 |
+
                      * scan the (2·HEX_POLISH_ULP+1)² fp16 neighborhood on the
|
| 3798 |
+
* true objective;
|
| 3799 |
+
* accept only strict improvement, then loop once more so
|
| 3800 |
+
* move (1) can re-optimise codes for the new scalars.
|
| 3801 |
+
                      * Monotone — final RMSE can only decrease. */
|
| 3802 |
+
uint16_t base_d16 = gguf_fp32_to_fp16(dm);
|
| 3803 |
+
uint16_t base_m16 = gguf_fp32_to_fp16(mm);
|
| 3804 |
+
|
| 3805 |
+
float cur_err = 0.0f;
|
| 3806 |
+
float e_u[QK_K];
|
| 3807 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3808 |
+
float d_sub = dm * (float)pl_Ls[j];
|
| 3809 |
+
float m_sub = mm * (float)pl_Lm[j];
|
| 3810 |
+
for (int k = 0; k < 16; k++) {
|
| 3811 |
+
int idx = 16 * j + k;
|
| 3812 |
+
float w = (imat_importance) ?
|
| 3813 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3814 |
+
e_u[idx] = block_x[idx] -
|
| 3815 |
+
(d_sub * (float)L[idx] - m_sub);
|
| 3816 |
+
cur_err += e_u[idx] * e_u[idx] * w;
|
| 3817 |
+
}
|
| 3818 |
+
}
|
| 3819 |
+
cur_err += hex_spectral_penalty(e_u, QK_K);
|
| 3820 |
+
|
| 3821 |
+
float best_err = cur_err;
|
| 3822 |
+
uint16_t best_d16 = base_d16, best_m16 = base_m16;
|
| 3823 |
+
for (int dd = -HEX_POLISH_ULP; dd <= HEX_POLISH_ULP; dd++) {
|
| 3824 |
+
int cd16 = (int)base_d16 + dd;
|
| 3825 |
+
if (cd16 < 0 || cd16 > 0x7BFF) continue;
|
| 3826 |
+
float t_dm = gguf_fp16_to_fp32((uint16_t)cd16);
|
| 3827 |
+
for (int dmm = -HEX_POLISH_ULP; dmm <= HEX_POLISH_ULP; dmm++) {
|
| 3828 |
+
if (dd == 0 && dmm == 0) continue;
|
| 3829 |
+
int cm16 = (int)base_m16 + dmm;
|
| 3830 |
+
if (cm16 < 0 || cm16 > 0x7BFF) continue;
|
| 3831 |
+
float t_mm = gguf_fp16_to_fp32((uint16_t)cm16);
|
| 3832 |
+
|
| 3833 |
+
float err = 0.0f;
|
| 3834 |
+
/* SSE-partial prune valid only without the
|
| 3835 |
+
* signable vesica/wave credit */
|
| 3836 |
+
for (int j = 0;
|
| 3837 |
+
j < N_SUB && (HEX_VW_LAMBDA != 0.0f ||
|
| 3838 |
+
err < best_err); j++) {
|
| 3839 |
+
float d_sub = t_dm * (float)pl_Ls[j];
|
| 3840 |
+
float m_sub = t_mm * (float)pl_Lm[j];
|
| 3841 |
+
for (int k = 0; k < 16; k++) {
|
| 3842 |
+
int idx = 16 * j + k;
|
| 3843 |
+
float w = (imat_importance) ?
|
| 3844 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3845 |
+
e_u[idx] = block_x[idx] -
|
| 3846 |
+
(d_sub * (float)L[idx] - m_sub);
|
| 3847 |
+
err += e_u[idx] * e_u[idx] * w;
|
| 3848 |
+
}
|
| 3849 |
+
}
|
| 3850 |
+
if (HEX_DC_LAMBDA != 0.0f || HEX_VW_LAMBDA != 0.0f)
|
| 3851 |
+
err = (err < best_err || HEX_VW_LAMBDA != 0.0f)
|
| 3852 |
+
? err + hex_spectral_penalty(e_u, QK_K)
|
| 3853 |
+
: err;
|
| 3854 |
+
if (err < best_err) {
|
| 3855 |
+
best_err = err;
|
| 3856 |
+
best_d16 = (uint16_t)cd16;
|
| 3857 |
+
best_m16 = (uint16_t)cm16;
|
| 3858 |
+
}
|
| 3859 |
+
}
|
| 3860 |
+
}
|
| 3861 |
+
if (best_d16 != base_d16 || best_m16 != base_m16) {
|
| 3862 |
+
dm = gguf_fp16_to_fp32(best_d16);
|
| 3863 |
+
mm = gguf_fp16_to_fp32(best_m16);
|
| 3864 |
+
pol_improved = 1;
|
| 3865 |
+
}
|
| 3866 |
+
}
|
| 3867 |
+
|
| 3868 |
+
if (!pol_improved) break; /* converged on true objective */
|
| 3869 |
+
}
|
| 3870 |
+
|
| 3871 |
+
/* Write back polished codes and scalars */
|
| 3872 |
+
for (int j = 0; j < N_SUB; j++)
|
| 3873 |
+
output[blk].scales[j] = pl_Ls[j] | (pl_Lm[j] << 4);
|
| 3874 |
+
output[blk].d = gguf_fp32_to_fp16(dm);
|
| 3875 |
+
output[blk].dmin = gguf_fp32_to_fp16(mm);
|
| 3876 |
+
}
|
| 3877 |
+
|
| 3878 |
+
         /* ── PHASE 4.7: CANDIDATE FLOOR (worst-case bound) ──
|
| 3879 |
+
*
|
| 3880 |
+
* candidate_errors[blk][c] is the EXACT weighted SSE of a directly
|
| 3881 |
+
* encodable configuration (fp16 d/dmin + derived Ls/Lm + nearest
|
| 3882 |
+
* rounding vs the original weights). The multi-stage assembly
|
| 3883 |
+
* (DC-shifted WLS, shaping, diffusion, polish) usually improves on
|
| 3884 |
+
* its seed, but each stage optimises a slightly different objective
|
| 3885 |
+
* and coordinate descent can land in a worse basin. Compare the
|
| 3886 |
+
* finished block against the best raw candidate and fall back when
|
| 3887 |
+
          * the pipeline ended up worse — guaranteeing
|
| 3888 |
+
          * final weighted SSE ≤ min_c candidate_errors[blk][c]. */
|
| 3889 |
+
{
|
| 3890 |
+
float fin_err = 0.0f;
|
| 3891 |
+
float e_f[QK_K];
|
| 3892 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3893 |
+
float d_sub = dm * (float)(output[blk].scales[j] & 0xF);
|
| 3894 |
+
float m_sub = mm * (float)(output[blk].scales[j] >> 4);
|
| 3895 |
+
for (int k = 0; k < 16; k++) {
|
| 3896 |
+
int idx = 16 * j + k;
|
| 3897 |
+
float w = (imat_importance) ?
|
| 3898 |
+
imat_importance[blk * QK_K + idx] : 1.0f;
|
| 3899 |
+
e_f[idx] = block_x[idx] -
|
| 3900 |
+
(d_sub * (float)L[idx] - m_sub);
|
| 3901 |
+
fin_err += e_f[idx] * e_f[idx] * w;
|
| 3902 |
+
}
|
| 3903 |
+
}
|
| 3904 |
+
fin_err += hex_spectral_penalty(e_f, QK_K);
|
| 3905 |
+
|
| 3906 |
+
float g_best = candidate_errors[blk][0];
|
| 3907 |
+
int g_cand = 0;
|
| 3908 |
+
for (int c = 1; c < TOTAL_SCALE_CANDIDATES; c++) {
|
| 3909 |
+
if (candidate_errors[blk][c] < g_best) {
|
| 3910 |
+
g_best = candidate_errors[blk][c];
|
| 3911 |
+
g_cand = c;
|
| 3912 |
+
}
|
| 3913 |
+
}
|
| 3914 |
+
|
| 3915 |
+
if (g_best < fin_err) {
|
| 3916 |
+
/* Rebuild the block exactly as the candidate was scored */
|
| 3917 |
+
float c_dm = gguf_fp16_to_fp32(candidate_d [blk][g_cand]);
|
| 3918 |
+
float c_mm = gguf_fp16_to_fp32(candidate_dmin[blk][g_cand]);
|
| 3919 |
+
uint8_t c_Ls[16], c_Lm[16];
|
| 3920 |
+
hex_derive_subscales(seeds[blk].scales, seeds[blk].mins,
|
| 3921 |
+
c_dm, c_mm, c_Ls, c_Lm);
|
| 3922 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3923 |
+
float d_sub = c_dm * (float)c_Ls[j];
|
| 3924 |
+
float m_sub = c_mm * (float)c_Lm[j];
|
| 3925 |
+
for (int k = 0; k < 16; k++) {
|
| 3926 |
+
int idx = 16 * j + k;
|
| 3927 |
+
int q = 0;
|
| 3928 |
+
if (d_sub >= 1e-15f) {
|
| 3929 |
+
q = gguf_nearest_int((block_x[idx] + m_sub) / d_sub);
|
| 3930 |
+
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 3931 |
+
}
|
| 3932 |
+
L[idx] = (uint8_t)q;
|
| 3933 |
+
}
|
| 3934 |
+
output[blk].scales[j] = c_Ls[j] | (c_Lm[j] << 4);
|
| 3935 |
+
}
|
| 3936 |
+
dm = c_dm; mm = c_mm;
|
| 3937 |
+
output[blk].d = candidate_d [blk][g_cand];
|
| 3938 |
+
output[blk].dmin = candidate_dmin[blk][g_cand];
|
| 3939 |
+
}
|
| 3940 |
+
}
|
| 3941 |
+
|
| 3942 |
for (int j = 0; j < QK_K; j += 128) {
|
| 3943 |
for (int l = 0; l < 32; l++) {
|
| 3944 |
output[blk].qs[j / 4 + l] = L[j + l]
|
|
|
|
| 3966 |
free(candidate_errors);
|
| 3967 |
free(candidate_d);
|
| 3968 |
free(candidate_dmin);
|
|
|
|
|
|
|
| 3969 |
free(best_candidate);
|
| 3970 |
if (out_total_error) *out_total_error = total_err;
|
| 3971 |
|
|
|
|
| 4015 |
* βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 4016 |
|
| 4017 |
static void print_progress_bar(int current, int total, const char *label,
|
| 4018 |
+
time_t start_time)
|
| 4019 |
{
|
| 4020 |
if (total <= 0) return;
|
| 4021 |
float pct = (float)current / (float)total;
|
| 4022 |
int bar_width = 40;
|
| 4023 |
int filled = (int)(pct * bar_width);
|
| 4024 |
|
| 4025 |
+
/* Wall-clock elapsed: clock() sums CPU time over all OpenMP threads,
|
| 4026 |
+
* which inflated elapsed/ETA by ~the thread count on multicore. */
|
| 4027 |
+
double elapsed = difftime(time(NULL), start_time);
|
| 4028 |
double eta = (pct > 0.01f) ? elapsed / pct * (1.0 - pct) : 0.0;
|
| 4029 |
|
| 4030 |
printf("\r [");
|
|
|
|
| 4247 |
int64_t total_elements_quantized = 0;
|
| 4248 |
int64_t total_bytes_quantized = 0;
|
| 4249 |
int64_t total_bytes_unquantized = 0;
|
| 4250 |
+
time_t quant_start = time(NULL);
|
| 4251 |
|
| 4252 |
for (int i = 0; i < total_tensors; i++) {
|
| 4253 |
int src = tensor_src_idx[i];
|
|
|
|
| 4268 |
|
| 4269 |
int64_t padded = (n_elements + QK_K - 1) / QK_K * QK_K;
|
| 4270 |
if (padded > n_elements) {
|
| 4271 |
+
float *grown = realloc(f32_data, padded * sizeof(float));
|
| 4272 |
+
if (!grown) {
|
| 4273 |
+
fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
|
| 4274 |
+
ti->name);
|
| 4275 |
+
free(f32_data);
|
| 4276 |
+
continue;
|
| 4277 |
+
}
|
| 4278 |
+
f32_data = grown;
|
| 4279 |
for (int64_t j = n_elements; j < padded; j++)
|
| 4280 |
f32_data[j] = 0.0f;
|
| 4281 |
n_elements = padded;
|
|
|
|
| 4342 |
|
| 4343 |
int64_t padded = (n_elements + QK4_0 - 1) / QK4_0 * QK4_0;
|
| 4344 |
if (padded > n_elements) {
|
| 4345 |
+
float *grown = realloc(f32_data, padded * sizeof(float));
|
| 4346 |
+
if (!grown) {
|
| 4347 |
+
fprintf(stderr, "\n ERROR: Out of memory padding '%s'\n",
|
| 4348 |
+
ti->name);
|
| 4349 |
+
free(f32_data);
|
| 4350 |
+
continue;
|
| 4351 |
+
}
|
| 4352 |
+
f32_data = grown;
|
| 4353 |
for (int64_t j = n_elements; j < padded; j++)
|
| 4354 |
f32_data[j] = 0.0f;
|
| 4355 |
n_elements = padded;
|
|
|
|
| 4705 |
|
| 4706 |
     /* ── Phase 1: Load model ── */
|
| 4707 |
printf(" Phase 1: Loading model...\n");
|
| 4708 |
+
time_t t_start = time(NULL);
|
| 4709 |
|
| 4710 |
/* Determine if input is a file or directory */
|
| 4711 |
struct stat st;
|
|
|
|
| 4721 |
         /* Input is a directory — open all shards */
|
| 4722 |
mf = st_open_dir(input_path);
|
| 4723 |
strncpy(input_dir, input_path, sizeof(input_dir) - 2);
|
| 4724 |
+
input_dir[sizeof(input_dir) - 2] = '\0';
|
| 4725 |
int dlen = strlen(input_dir);
|
| 4726 |
if (dlen > 0 && input_dir[dlen - 1] != '/') {
|
| 4727 |
input_dir[dlen] = '/';
|
|
|
|
| 4747 |
|
| 4748 |
/* Extract directory from file path */
|
| 4749 |
strncpy(input_dir, input_path, sizeof(input_dir) - 1);
|
| 4750 |
+
input_dir[sizeof(input_dir) - 1] = '\0';
|
| 4751 |
char *last_slash = strrchr(input_dir, '/');
|
| 4752 |
if (last_slash) {
|
| 4753 |
*(last_slash + 1) = '\0';
|
|
|
|
| 4763 |
|
| 4764 |
st_multi_print_summary(mf);
|
| 4765 |
|
| 4766 |
+
time_t t_load = time(NULL);
|
| 4767 |
+
printf(" Loaded in %.0f seconds\n\n", difftime(t_load, t_start));
|
|
|
|
| 4768 |
|
| 4769 |
     /* ── Phase 2: Detect architecture ── */
|
| 4770 |
printf(" Phase 2: Detecting model architecture...\n");
|
|
|
|
| 4839 |
|
| 4840 |
     /* ── Phase 3-5: Quantize and write GGUF ── */
|
| 4841 |
printf(" Phase 3: HPC-Optimized Q2_K Quantization + GGUF Output...\n");
|
|
|
|
|
|
|
| 4842 |
int result = write_gguf(output_path, mf, &arch, tokenizer,
|
| 4843 |
opt_mode, imatrix, verbose);
|
| 4844 |
|
| 4845 |
+
/* Wall-clock total: clock() sums CPU time over all OpenMP threads */
|
| 4846 |
+
time_t t_end = time(NULL);
|
| 4847 |
+
printf(" Total time: %.0f seconds\n\n", difftime(t_end, t_start));
|
| 4848 |
|
| 4849 |
if (imatrix) imatrix_free(imatrix);
|
| 4850 |
if (tokenizer) tok_free(tokenizer);
|