ALPHA
Browse files- hexstate_quantize.c +154 -1
hexstate_quantize.c
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
/* ═══════════════════════════════════════════════════════════════════════════
|
| 2 |
-
* hexstate_quantize.c —
|
| 3 |
*
|
| 4 |
* ╔═══════════════════════════════════════════════════════════════╗
|
| 5 |
* ║ HPC-Optimized GGUF Quantization Engine ║
|
|
@@ -3229,6 +3229,159 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
|
|
| 3229 |
Lm_blk[j] = best_lm;
|
| 3230 |
}
|
| 3231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3232 |
output[blk].d = gguf_fp32_to_fp16(dm);
|
| 3233 |
output[blk].dmin = gguf_fp32_to_fp16(mm);
|
| 3234 |
|
|
|
|
| 1 |
/* ═══════════════════════════════════════════════════════════════════════════
|
| 2 |
+
* hexstate_quantize.c — HexState GGUF Quantizer
|
| 3 |
*
|
| 4 |
* ╔═══════════════════════════════════════════════════════════════╗
|
| 5 |
* ║ HPC-Optimized GGUF Quantization Engine ║
|
|
|
|
| 3229 |
Lm_blk[j] = best_lm;
|
| 3230 |
}
|
| 3231 |
|
| 3232 |
+
/* ──────────────────────────────────────────────────────────────
|
| 3233 |
+
* PHASE 4.5 — LLOYD-MAX CENTROID REFINEMENT (per sub-block)
|
| 3234 |
+
*
|
| 3235 |
+
* Standard WLS and grid search both assume uniform spacing between
|
| 3236 |
+
* the 4 representable values is MSE-optimal. For non-uniform weight
|
| 3237 |
+
* distributions (the typical case — transformer weights are heavy-
|
| 3238 |
+
* tailed near zero, sparse in the tails), the WLS-optimal (d, m)
|
| 3239 |
+
* does not align with the empirical centroids of the code partitions.
|
| 3240 |
+
*
|
| 3241 |
+
* Lloyd-Max iterates the assignment-then-centroid loop:
|
| 3242 |
+
*
|
| 3243 |
+
* 1. Assign: each weight → nearest representable value, code v∈{0..3}
|
| 3244 |
+
* 2. Centroid: c_v = empirical mean of weights assigned to v
|
| 3245 |
+
* 3. Project: c_v are 4 real numbers; find the arithmetic progression
|
| 3246 |
+
* {d·v − m : v∈{0,1,2,3}} that best fits c_v in MSE.
|
| 3247 |
+
* Closed-form solution from normal equations:
|
| 3248 |
+
*
|
| 3249 |
+
* d_new = (3·c_3 + c_2 − c_1 − 3·c_0) / 10
|
| 3250 |
+
* m_new = (−7·c_0 − 4·c_1 − c_2 + 2·c_3) / 10
|
| 3251 |
+
*
|
| 3252 |
+
* (Constants derived from Σq=6, Σq²=14, 4 codes total.)
|
| 3253 |
+
* 4. Re-quantise; repeat until (d, m) stop changing.
|
| 3254 |
+
*
|
| 3255 |
+
* The arithmetic-progression projection is the key constraint that
|
| 3256 |
+
* keeps the output in valid Q2_K format. In unconstrained Lloyd-Max,
|
| 3257 |
+
* the 4 centroids could be placed freely; here they must sit on an
|
| 3258 |
+
* AP determined by (d, m), which is exactly what Q2_K stores.
|
| 3259 |
+
*
|
| 3260 |
+
* Operating per sub-block: we refine (d_sub_j, m_sub_j) = (d·Ls_j,
|
| 3261 |
+
* m·Lm_j), then re-project onto integer (Ls, Lm) ∈ [0,15]. The
|
| 3262 |
+
* integer rounding can hurt, so we only accept the refined values
|
| 3263 |
+
* if they reduce the sub-block's weighted MSE.
|
| 3264 |
+
*
|
| 3265 |
+
* This is a genuine refinement on top of the grid search: the grid
|
| 3266 |
+
* search minimises element-wise MSE assuming uniform spacing is
|
| 3267 |
+
* locked in; Lloyd-Max iterates toward distribution-optimal spacing
|
| 3268 |
+
* given the actual empirical centroids.
|
| 3269 |
+
* ────────────────────────────────────────────────────────────── */
|
| 3270 |
+
for (int j = 0; j < N_SUB; j++) {
|
| 3271 |
+
const float *sx = adj_block_x + 16 * j;
|
| 3272 |
+
uint8_t Ls_cur = Ls_blk[j];
|
| 3273 |
+
uint8_t Lm_cur = Lm_blk[j];
|
| 3274 |
+
|
| 3275 |
+
/* Baseline MSE for current (Ls, Lm) — only accept if we beat this */
|
| 3276 |
+
float baseline_err = 0.0f;
|
| 3277 |
+
{
|
| 3278 |
+
float d_sub = dm * (float)Ls_cur;
|
| 3279 |
+
float m_sub = mm * (float)Lm_cur;
|
| 3280 |
+
for (int k = 0; k < 16; k++) {
|
| 3281 |
+
float w_imp = (imat_importance)
|
| 3282 |
+
? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
|
| 3283 |
+
int q;
|
| 3284 |
+
if (d_sub < 1e-15f) { q = 0; }
|
| 3285 |
+
else {
|
| 3286 |
+
q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
|
| 3287 |
+
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 3288 |
+
}
|
| 3289 |
+
float deq = d_sub * (float)q - m_sub;
|
| 3290 |
+
float diff = sx[k] - deq;
|
| 3291 |
+
baseline_err += diff * diff * w_imp;
|
| 3292 |
+
}
|
| 3293 |
+
}
|
| 3294 |
+
|
| 3295 |
+
/* Lloyd-Max iteration on (d_sub, m_sub) */
|
| 3296 |
+
float d_sub = dm * (float)Ls_cur;
|
| 3297 |
+
float m_sub = mm * (float)Lm_cur;
|
| 3298 |
+
float d_sub_best = d_sub, m_sub_best = m_sub;
|
| 3299 |
+
float lloyd_err = baseline_err;
|
| 3300 |
+
|
| 3301 |
+
const int MAX_LLOYD_ITERS = 6;
|
| 3302 |
+
for (int it = 0; it < MAX_LLOYD_ITERS; it++) {
|
| 3303 |
+
if (d_sub < 1e-15f) break;
|
| 3304 |
+
|
| 3305 |
+
/* Step 1+2: assign and accumulate weighted centroids */
|
| 3306 |
+
double sum_v[4] = {0.0, 0.0, 0.0, 0.0};
|
| 3307 |
+
double cnt_v[4] = {0.0, 0.0, 0.0, 0.0};
|
| 3308 |
+
for (int k = 0; k < 16; k++) {
|
| 3309 |
+
float w_imp = (imat_importance)
|
| 3310 |
+
? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
|
| 3311 |
+
int q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
|
| 3312 |
+
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 3313 |
+
sum_v[q] += (double)sx[k] * (double)w_imp;
|
| 3314 |
+
cnt_v[q] += (double)w_imp;
|
| 3315 |
+
}
|
| 3316 |
+
|
| 3317 |
+
/* Fill empty bins with the current AP value (d_sub*v - m_sub) to avoid
|
| 3318 |
+
* degenerate centroids when a code is unused */
|
| 3319 |
+
double c[4];
|
| 3320 |
+
int n_empty = 0;
|
| 3321 |
+
for (int v = 0; v < 4; v++) {
|
| 3322 |
+
if (cnt_v[v] > 1e-15) {
|
| 3323 |
+
c[v] = sum_v[v] / cnt_v[v];
|
| 3324 |
+
} else {
|
| 3325 |
+
c[v] = (double)(d_sub * (float)v - m_sub); /* fallback to current AP */
|
| 3326 |
+
n_empty++;
|
| 3327 |
+
}
|
| 3328 |
+
}
|
| 3329 |
+
if (n_empty >= 3) break; /* distribution too sparse β give up */
|
| 3330 |
+
|
| 3331 |
+
/* Step 3: AP projection — closed form for arithmetic progression
|
| 3332 |
+
* minimising Σ_v (c_v − (d·v − m))² */
|
| 3333 |
+
float d_new = (float)((3.0*c[3] + c[2] - c[1] - 3.0*c[0]) / 10.0);
|
| 3334 |
+
float m_new = (float)((-7.0*c[0] - 4.0*c[1] - c[2] + 2.0*c[3]) / 10.0);
|
| 3335 |
+
if (d_new <= 1e-15f) break;
|
| 3336 |
+
if (m_new < 0.0f) m_new = 0.0f; /* keep m non-negative */
|
| 3337 |
+
|
| 3338 |
+
/* Step 4: project onto integer (Ls, Lm) and evaluate */
|
| 3339 |
+
int Ls_try = (dm > 1e-15f) ? gguf_nearest_int(d_new / dm) : Ls_cur;
|
| 3340 |
+
int Lm_try = (mm > 1e-15f) ? gguf_nearest_int(m_new / mm) : Lm_cur;
|
| 3341 |
+
if (Ls_try < 1) Ls_try = 1;
|
| 3342 |
+
if (Ls_try > 15) Ls_try = 15;
|
| 3343 |
+
if (Lm_try < 0) Lm_try = 0;
|
| 3344 |
+
if (Lm_try > 15) Lm_try = 15;
|
| 3345 |
+
|
| 3346 |
+
float d_sub_try = dm * (float)Ls_try;
|
| 3347 |
+
float m_sub_try = mm * (float)Lm_try;
|
| 3348 |
+
|
| 3349 |
+
float try_err = 0.0f;
|
| 3350 |
+
for (int k = 0; k < 16; k++) {
|
| 3351 |
+
float w_imp = (imat_importance)
|
| 3352 |
+
? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
|
| 3353 |
+
int q;
|
| 3354 |
+
if (d_sub_try < 1e-15f) { q = 0; }
|
| 3355 |
+
else {
|
| 3356 |
+
q = gguf_nearest_int((sx[k] + m_sub_try) / d_sub_try);
|
| 3357 |
+
if (q < 0) q = 0; if (q > 3) q = 3;
|
| 3358 |
+
}
|
| 3359 |
+
float deq = d_sub_try * (float)q - m_sub_try;
|
| 3360 |
+
float diff = sx[k] - deq;
|
| 3361 |
+
try_err += diff * diff * w_imp;
|
| 3362 |
+
}
|
| 3363 |
+
|
| 3364 |
+
/* Only accept if strictly improves; this is our safety net */
|
| 3365 |
+
if (try_err < lloyd_err) {
|
| 3366 |
+
lloyd_err = try_err;
|
| 3367 |
+
d_sub_best = d_sub_try;
|
| 3368 |
+
m_sub_best = m_sub_try;
|
| 3369 |
+
Ls_cur = (uint8_t)Ls_try;
|
| 3370 |
+
Lm_cur = (uint8_t)Lm_try;
|
| 3371 |
+
d_sub = d_sub_try;
|
| 3372 |
+
m_sub = m_sub_try;
|
| 3373 |
+
} else {
|
| 3374 |
+
/* Converged or projection rounding hurt — stop */
|
| 3375 |
+
break;
|
| 3376 |
+
}
|
| 3377 |
+
}
|
| 3378 |
+
|
| 3379 |
+
if (lloyd_err < baseline_err) {
|
| 3380 |
+
Ls_blk[j] = Ls_cur;
|
| 3381 |
+
Lm_blk[j] = Lm_cur;
|
| 3382 |
+
}
|
| 3383 |
+
}
|
| 3384 |
+
|
| 3385 |
output[blk].d = gguf_fp32_to_fp16(dm);
|
| 3386 |
output[blk].dmin = gguf_fp32_to_fp16(mm);
|
| 3387 |
|