CompressedGemma committed on
Commit
73e9225
·
verified ·
1 Parent(s): 2432d03
Files changed (1) hide show
  1. hexstate_quantize.c +154 -1
hexstate_quantize.c CHANGED
@@ -1,5 +1,5 @@
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
- * hexstate_quantize.c — HExState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
@@ -3229,6 +3229,159 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3229
  Lm_blk[j] = best_lm;
3230
  }
3231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3232
  output[blk].d = gguf_fp32_to_fp16(dm);
3233
  output[blk].dmin = gguf_fp32_to_fp16(mm);
3234
 
 
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
+ * hexstate_quantize.c — HexState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
 
3229
  Lm_blk[j] = best_lm;
3230
  }
3231
 
3232
+ /* ══════════════════════════════════════════════════════════════
3233
+ * PHASE 4.5 — LLOYD-MAX CENTROID REFINEMENT (per sub-block)
3234
+ *
3235
+ * Standard WLS and grid search both assume uniform spacing between
3236
+ * the 4 representable values is MSE-optimal. For non-uniform weight
3237
+ * distributions (the typical case — transformer weights are heavy-
3238
+ * tailed near zero, sparse in the tails), the WLS-optimal (d, m)
3239
+ * does not align with the empirical centroids of the code partitions.
3240
+ *
3241
+ * Lloyd-Max iterates the assignment-then-centroid loop:
3242
+ *
3243
+ * 1. Assign: each weight → nearest representable value, code v∈{0..3}
3244
+ * 2. Centroid: c_v = empirical mean of weights assigned to v
3245
+ * 3. Project: c_v are 4 real numbers; find the arithmetic progression
3246
+ * {d·v − m : v∈{0,1,2,3}} that best fits c_v in MSE.
3247
+ * Closed-form solution from normal equations:
3248
+ *
3249
+ * d_new = (3·c_3 + c_2 − c_1 − 3·c_0) / 10
3250
+ * m_new = (−7·c_0 − 4·c_1 − c_2 + 2·c_3) / 10
3251
+ *
3252
+ * (Constants derived from Σq=6, Σq²=14, 4 codes total.)
3253
+ * 4. Re-quantise; repeat until (d, m) stop changing.
3254
+ *
3255
+ * The arithmetic-progression projection is the key constraint that
3256
+ * keeps the output in valid Q2_K format. In unconstrained Lloyd-Max,
3257
+ * the 4 centroids could be placed freely; here they must sit on an
3258
+ * AP determined by (d, m), which is exactly what Q2_K stores.
3259
+ *
3260
+ * Operating per sub-block: we refine (d_sub_j, m_sub_j) = (d·Ls_j,
3261
+ * m·Lm_j), then re-project onto integer (Ls, Lm) ∈ [0,15]. The
3262
+ * integer rounding can hurt, so we only accept the refined values
3263
+ * if they reduce the sub-block's weighted MSE.
3264
+ *
3265
+ * This is a genuine refinement on top of the grid search: the grid
3266
+ * search minimises element-wise MSE assuming uniform spacing is
3267
+ * locked in; Lloyd-Max iterates toward distribution-optimal spacing
3268
+ * given the actual empirical centroids.
3269
+ * ══════════════════════════════════════════════════════════════ */
3270
+ for (int j = 0; j < N_SUB; j++) {
3271
+ const float *sx = adj_block_x + 16 * j;
3272
+ uint8_t Ls_cur = Ls_blk[j];
3273
+ uint8_t Lm_cur = Lm_blk[j];
3274
+
3275
+ /* Baseline MSE for current (Ls, Lm) — only accept if we beat this */
3276
+ float baseline_err = 0.0f;
3277
+ {
3278
+ float d_sub = dm * (float)Ls_cur;
3279
+ float m_sub = mm * (float)Lm_cur;
3280
+ for (int k = 0; k < 16; k++) {
3281
+ float w_imp = (imat_importance)
3282
+ ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3283
+ int q;
3284
+ if (d_sub < 1e-15f) { q = 0; }
3285
+ else {
3286
+ q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
3287
+ if (q < 0) q = 0; if (q > 3) q = 3;
3288
+ }
3289
+ float deq = d_sub * (float)q - m_sub;
3290
+ float diff = sx[k] - deq;
3291
+ baseline_err += diff * diff * w_imp;
3292
+ }
3293
+ }
3294
+
3295
+ /* Lloyd-Max iteration on (d_sub, m_sub) */
3296
+ float d_sub = dm * (float)Ls_cur;
3297
+ float m_sub = mm * (float)Lm_cur;
3298
+ float d_sub_best = d_sub, m_sub_best = m_sub;
3299
+ float lloyd_err = baseline_err;
3300
+
3301
+ const int MAX_LLOYD_ITERS = 6;
3302
+ for (int it = 0; it < MAX_LLOYD_ITERS; it++) {
3303
+ if (d_sub < 1e-15f) break;
3304
+
3305
+ /* Step 1+2: assign and accumulate weighted centroids */
3306
+ double sum_v[4] = {0.0, 0.0, 0.0, 0.0};
3307
+ double cnt_v[4] = {0.0, 0.0, 0.0, 0.0};
3308
+ for (int k = 0; k < 16; k++) {
3309
+ float w_imp = (imat_importance)
3310
+ ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3311
+ int q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
3312
+ if (q < 0) q = 0; if (q > 3) q = 3;
3313
+ sum_v[q] += (double)sx[k] * (double)w_imp;
3314
+ cnt_v[q] += (double)w_imp;
3315
+ }
3316
+
3317
+ /* Fill empty bins with the value the current (d_sub, m_sub) progression
3318
+ * assigns to that code, to avoid degenerate centroids when a code is unused */
3319
+ double c[4];
3320
+ int n_empty = 0;
3321
+ for (int v = 0; v < 4; v++) {
3322
+ if (cnt_v[v] > 1e-15) {
3323
+ c[v] = sum_v[v] / cnt_v[v];
3324
+ } else {
3325
+ c[v] = (double)(d_sub * (float)v - m_sub); /* fallback to current AP */
3326
+ n_empty++;
3327
+ }
3328
+ }
3329
+ if (n_empty >= 3) break; /* distribution too sparse — give up */
3330
+
3331
+ /* Step 3: AP projection — closed form for arithmetic progression
3332
+ * minimising Σ_v (c_v − (d·v − m))² */
3333
+ float d_new = (float)((3.0*c[3] + c[2] - c[1] - 3.0*c[0]) / 10.0);
3334
+ float m_new = (float)((-7.0*c[0] - 4.0*c[1] - c[2] + 2.0*c[3]) / 10.0);
3335
+ if (d_new <= 1e-15f) break;
3336
+ if (m_new < 0.0f) m_new = 0.0f; /* keep m non-negative */
3337
+
3338
+ /* Step 4: project onto integer (Ls, Lm) and evaluate */
3339
+ int Ls_try = (dm > 1e-15f) ? gguf_nearest_int(d_new / dm) : Ls_cur;
3340
+ int Lm_try = (mm > 1e-15f) ? gguf_nearest_int(m_new / mm) : Lm_cur;
3341
+ if (Ls_try < 1) Ls_try = 1;
3342
+ if (Ls_try > 15) Ls_try = 15;
3343
+ if (Lm_try < 0) Lm_try = 0;
3344
+ if (Lm_try > 15) Lm_try = 15;
3345
+
3346
+ float d_sub_try = dm * (float)Ls_try;
3347
+ float m_sub_try = mm * (float)Lm_try;
3348
+
3349
+ float try_err = 0.0f;
3350
+ for (int k = 0; k < 16; k++) {
3351
+ float w_imp = (imat_importance)
3352
+ ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3353
+ int q;
3354
+ if (d_sub_try < 1e-15f) { q = 0; }
3355
+ else {
3356
+ q = gguf_nearest_int((sx[k] + m_sub_try) / d_sub_try);
3357
+ if (q < 0) q = 0; if (q > 3) q = 3;
3358
+ }
3359
+ float deq = d_sub_try * (float)q - m_sub_try;
3360
+ float diff = sx[k] - deq;
3361
+ try_err += diff * diff * w_imp;
3362
+ }
3363
+
3364
+ /* Only accept if strictly improves; this is our safety net */
3365
+ if (try_err < lloyd_err) {
3366
+ lloyd_err = try_err;
3367
+ d_sub_best = d_sub_try;
3368
+ m_sub_best = m_sub_try;
3369
+ Ls_cur = (uint8_t)Ls_try;
3370
+ Lm_cur = (uint8_t)Lm_try;
3371
+ d_sub = d_sub_try;
3372
+ m_sub = m_sub_try;
3373
+ } else {
3374
+ /* Converged or projection rounding hurt — stop */
3375
+ break;
3376
+ }
3377
+ }
3378
+
3379
+ if (lloyd_err < baseline_err) {
3380
+ Ls_blk[j] = Ls_cur;
3381
+ Lm_blk[j] = Lm_cur;
3382
+ }
3383
+ }
3384
+
3385
  output[blk].d = gguf_fp32_to_fp16(dm);
3386
  output[blk].dmin = gguf_fp32_to_fp16(mm);
3387