CompressedGemma committed on
Commit
e0ba36a
·
verified ·
1 Parent(s): 73e9225

Revert to Alpha 0.1

Browse files
Files changed (1) hide show
  1. hexstate_quantize.c +41 -344
hexstate_quantize.c CHANGED
@@ -2732,209 +2732,70 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2732
  }
2733
 
2734
  /* ══════════════════════════════════════════════════════════════════
2735
- * PHASE 3.9 — HOLOGRAPHIC BOUNDARY RECONSTRUCTION
2736
  *
2737
- * Proper implementation of:
 
2738
  *
2739
- * W = Σᵢ ( Proj_n̂(Grad(V)) · Quantize(I_boundary) ) ⊗ Trans(Δτ)
 
 
 
 
2740
  *
2741
- * W is NOT a correction to an existing reconstruction — it IS the
2742
- * reconstruction, expressed as a sum over block-boundary contributions.
2743
- * Each block-boundary i produces a scalar signal that is then lifted
2744
- * via ⊗ Trans(Δτ) into a vector that decays into the interior of the
2745
- * blocks on either side of that boundary.
2746
  *
2747
- * ── Term-by-term derivation ─────────────────────────────────────
2748
  *
2749
- * Proj_n̂(Grad(V)) at boundary i
2750
- * The gradient of the original weight tensor V projected onto
2751
- * n̂, the unit normal to the block boundary. In the 1-D block-
2752
- * sequence space n̂ points in the inter-block direction, so this
2753
- * equals the cross-boundary finite difference:
2754
  *
2755
- * g_i = w[i·QK_K] − w[i·QK_K − 1]
2756
  *
2757
- * (first weight of block i minus last weight of block i-1)
2758
  *
2759
- * Quantize(I_boundary) at boundary i
2760
- * The boundary information I_boundary = the weight value at the
2761
- * boundary element, expressed through the quantizer. Computed
2762
- * from Phase 1 seeds for the boundary sub-block:
2763
  *
2764
- * Q_i = d_sub · round((w_boundary + m_sub) / d_sub) − m_sub
2765
- *
2766
- * This is not a step size or ratio — it is the actual dequantized
2767
- * value of the boundary weight.
2768
- *
2769
- * Signal s_i = Proj_n̂(Grad(V)) · Quantize(I_boundary)
2770
- * Scalar product at boundary i. Captures the signed energy of
2771
- * the weight function at that boundary in the quantized domain.
2772
- * Units: weight². Normalised by d_sub² to become dimensionless.
2773
- *
2774
- * ⊗ Trans(Δτ)
2775
- * The tensor product lifts the scalar s_i into a vector over
2776
- * the 256-element interior of the adjacent blocks. Trans(Δτ)
2777
- * is parameterised by Δτ = d_{i-1}/d_i (the scale ratio between
2778
- * adjacent blocks), which re-projects s_i from block i-1's
2779
- * quantization space into block i's:
2780
- *
2781
- * right-propagation into block i at position k:
2782
- * s_i · (d_{i-1}/d_i) · exp(−k / τ)
2783
- *
2784
- * left-propagation into block i-1 at position k:
2785
- * s_i · (d_i/d_{i-1}) · exp(−(QK_K−1−k) / τ)
2786
- *
2787
- * The full reconstruction for block b at position k:
2788
- *
2789
- * W[b][k] = x[b][k] (original weights, not replaced)
2790
- * − dc_bias[b] (zeroth-moment error correction, unchanged)
2791
- * + s_left[b] · (d_{b-1}/d_b) · exp(−k/τ)
2792
- * + s_right[b] · (d_{b+1}/d_b) · exp(−(QK_K−1−k)/τ)
2793
- *
2794
- * where s_left[b] = signal from boundary b (left edge of block b)
2795
- * s_right[b] = signal from boundary b+1 (right edge of block b)
2796
- *
2797
- * The DC bias is subtracted (it is an error correction, not a signal).
2798
- * The boundary signals are added (they encode the holographic surface).
2799
- *
2800
- * ── Implementation ──────────────────────────────────────────────
2801
- *
2802
- * Pre-pass (sequential) computes per block:
2803
- * block_dc_bias[b] — scalar DC offset (existing, unchanged)
2804
- * block_s_left[b] — left boundary signal (normalised, pre-scaled)
2805
- * block_s_right[b] — right boundary signal (normalised, pre-scaled)
2806
- *
2807
- * Phase 4 (parallel) applies:
2808
- * adj_x[k] = x[k] − dc_bias + s_left·fwd_decay[k] + s_right·rev_decay[k]
2809
- *
2810
- * Two precomputed decay tables (initialised once, thread-safe):
2811
- * boundary_decay[k] = exp(−k / τ) forward (left → interior)
2812
- * boundary_decay_rev[k] = exp(−(255−k) / τ) reversed (right → interior)
2813
  * ══════════════════════════════════════════════════════════════════ */
2814
 
2815
- #define DC_DECAY 0.85f /* DC residual leak factor */
2816
- #define HOLO_TAU 32.0f /* Boundary signal decay length (elements) */
2817
- #define HOLO_ALPHA 0.20f /* Boundary signal weight (fraction of one step) */
2818
-
2819
- /* Precompute forward and reverse decay tables — read-only in Phase 4. */
2820
- static float boundary_decay [QK_K];
2821
- static float boundary_decay_rev[QK_K];
2822
- {
2823
- static int _decay_init = 0;
2824
- if (!_decay_init) {
2825
- int _dk;
2826
- for (_dk = 0; _dk < QK_K; _dk++) {
2827
- boundary_decay [_dk] = expf(-(float)_dk / HOLO_TAU);
2828
- boundary_decay_rev[_dk] = expf(-(float)(QK_K-1-_dk)/ HOLO_TAU);
2829
- }
2830
- _decay_init = 1;
2831
- }
2832
- }
2833
 
2834
- float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
2835
- float *block_s_left = (float *)calloc(n_blocks, sizeof(float));
2836
- float *block_s_right = (float *)calloc(n_blocks, sizeof(float));
2837
 
2838
- if (block_dc_bias && block_s_left && block_s_right) {
2839
  float rolling_dc = 0.0f;
2840
 
2841
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2842
- const float *bx = weights + blk * QK_K;
2843
  int cidx = best_candidate[blk];
2844
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2845
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2846
 
2847
- /* ── DC bias (zeroth moment, existing) ── */
2848
- float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2849
- block_dc_bias[blk] = dc_bias;
2850
 
2851
- /* ── Left boundary signal: boundary between block blk-1 and blk ──
2852
- *
2853
- * Proj_n̂(Grad(V)): cross-boundary finite difference g_left
2854
- * Quantize(I_boundary): dequantized value of bx[0] using Phase 1
2855
- * seeds for the first sub-block (j=0) of block blk.
2856
- * s_left = g_left × Q(bx[0]) / d_sub² (dimensionless)
2857
- * Pre-scaled by HOLO_ALPHA × d_sub × (d_{blk-1}/d_{blk}) */
2858
- {
2859
- float g_left = (blk > 0)
2860
- ? bx[0] - weights[(blk - 1) * QK_K + QK_K - 1]
2861
- : 0.0f;
2862
-
2863
- /* Quantize(I_boundary) for left edge: sub-block j=0 */
2864
- float d_sub_l = dm0 * (float)seeds[blk].Ls[0];
2865
- float m_sub_l = mm0 * (float)seeds[blk].Lm[0];
2866
- float q_val_l = 0.0f;
2867
- if (d_sub_l > 1e-15f) {
2868
- int qi = gguf_nearest_int((bx[0] + m_sub_l) / d_sub_l);
2869
- if (qi < 0) qi = 0; if (qi > 3) qi = 3;
2870
- q_val_l = d_sub_l * (float)qi - m_sub_l;
2871
- }
2872
-
2873
- /* Scale ratio Trans(Δτ): d_{blk-1} / d_{blk} */
2874
- float d_prev = (blk > 0 && seeds[blk-1].dm > 1e-15f)
2875
- ? seeds[blk-1].dm : dm0;
2876
- float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
2877
- float scale_ratio_l = d_prev / d_curr;
2878
- if (scale_ratio_l < 0.1f) scale_ratio_l = 0.1f;
2879
- if (scale_ratio_l > 10.f) scale_ratio_l = 10.f;
2880
-
2881
- /* Normalise s = (g × Q) / d² then re-scale to weight units */
2882
- float d2 = d_sub_l * d_sub_l;
2883
- float s = (d2 > 1e-30f) ? (g_left * q_val_l / d2) : 0.0f;
2884
- block_s_left[blk] = HOLO_ALPHA * s * d_sub_l * scale_ratio_l;
2885
- }
2886
-
2887
- /* ── Right boundary signal: boundary between block blk and blk+1 ──
2888
- *
2889
- * Same derivation but at the right edge (position QK_K-1,
2890
- * sub-block j = N_SUB-1) looking into block blk+1. */
2891
- {
2892
- float g_right = (blk + 1 < n_blocks)
2893
- ? weights[(blk + 1) * QK_K] - bx[QK_K - 1]
2894
- : 0.0f;
2895
-
2896
- /* Quantize(I_boundary) for right edge: sub-block j=N_SUB-1 */
2897
- float d_sub_r = dm0 * (float)seeds[blk].Ls[N_SUB - 1];
2898
- float m_sub_r = mm0 * (float)seeds[blk].Lm[N_SUB - 1];
2899
- float q_val_r = 0.0f;
2900
- if (d_sub_r > 1e-15f) {
2901
- int qi = gguf_nearest_int((bx[QK_K-1] + m_sub_r) / d_sub_r);
2902
- if (qi < 0) qi = 0; if (qi > 3) qi = 3;
2903
- q_val_r = d_sub_r * (float)qi - m_sub_r;
2904
- }
2905
-
2906
- /* Scale ratio Trans(Δτ): d_{blk+1} / d_{blk} */
2907
- float d_next = (blk + 1 < n_blocks && seeds[blk+1].dm > 1e-15f)
2908
- ? seeds[blk+1].dm : dm0;
2909
- float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
2910
- float scale_ratio_r = d_next / d_curr;
2911
- if (scale_ratio_r < 0.1f) scale_ratio_r = 0.1f;
2912
- if (scale_ratio_r > 10.f) scale_ratio_r = 10.f;
2913
-
2914
- float d2 = d_sub_r * d_sub_r;
2915
- float s = (d2 > 1e-30f) ? (g_right * q_val_r / d2) : 0.0f;
2916
- block_s_right[blk] = HOLO_ALPHA * s * d_sub_r * scale_ratio_r;
2917
- }
2918
-
2919
- /* ── DC residual for the next block's rolling_dc ── */
2920
  float dc_res = 0.0f;
2921
  int j, k;
2922
  for (j = 0; j < N_SUB; j++) {
2923
  float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
2924
  float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
2925
- int base = 16 * j;
2926
  for (k = 0; k < 16; k++) {
2927
- int elem = base + k;
2928
- float x_adj = bx[elem] - dc_bias
2929
- + block_s_left [blk] * boundary_decay [elem]
2930
- + block_s_right[blk] * boundary_decay_rev[elem];
2931
  int q = 0;
2932
  if (d_sub >= 1e-15f) {
2933
  q = gguf_nearest_int((x_adj + m_sub) / d_sub);
2934
- if (q < 0) q = 0; if (q > 3) q = 3;
 
2935
  }
2936
- float deq = d_sub * (float)q - m_sub;
2937
- dc_res += bx[elem] - deq; /* residual vs ORIGINAL weight */
 
2938
  }
2939
  }
2940
  rolling_dc = dc_res;
@@ -2959,28 +2820,19 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2959
  int cidx = best_candidate[blk];
2960
  uint8_t Ls_blk[16], Lm_blk[16];
2961
 
2962
- /* ── Holographic boundary reconstruction (Phase 3.9 → Phase 4) ──
2963
- *
2964
- * W[b][k] = x[b][k]
2965
- * − dc_bias[b] (DC error correction)
2966
- * + s_left[b] · exp(−k/τ) (left boundary signal)
2967
- * + s_right[b] · exp(−(QK_K−1−k)/τ) (right boundary signal)
2968
- *
2969
- * The two boundary signals decay inward from opposite edges and meet
2970
- * in the middle. Together they enforce C¹ continuity across every
2971
- * block boundary in the quantized domain. */
2972
- float dc_adj = (block_dc_bias) ? block_dc_bias [blk] : 0.0f;
2973
- float s_left = (block_s_left) ? block_s_left [blk] : 0.0f;
2974
- float s_right = (block_s_right) ? block_s_right [blk] : 0.0f;
2975
 
 
 
2976
  float adj_block_x[QK_K];
2977
  {
2978
  int _i;
2979
  for (_i = 0; _i < QK_K; _i++)
2980
- adj_block_x[_i] = block_x[_i]
2981
- - dc_adj
2982
- + s_left * boundary_decay [_i]
2983
- + s_right * boundary_decay_rev[_i];
2984
  }
2985
 
2986
  memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
@@ -3229,159 +3081,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3229
  Lm_blk[j] = best_lm;
3230
  }
3231
 
3232
- /* ══════════════════════════════════════════════════════════════
3233
- * PHASE 4.5 — LLOYD-MAX CENTROID REFINEMENT (per sub-block)
3234
- *
3235
- * Standard WLS and grid search both assume uniform spacing between
3236
- * the 4 representable values is MSE-optimal. For non-uniform weight
3237
- * distributions (the typical case — transformer weights are heavy-
3238
- * tailed near zero, sparse in the tails), the WLS-optimal (d, m)
3239
- * does not align with the empirical centroids of the code partitions.
3240
- *
3241
- * Lloyd-Max iterates the assignment-then-centroid loop:
3242
- *
3243
- * 1. Assign: each weight → nearest representable value, code v∈{0..3}
3244
- * 2. Centroid: c_v = empirical mean of weights assigned to v
3245
- * 3. Project: c_v are 4 real numbers; find the arithmetic progression
3246
- * {d·v − m : v∈{0,1,2,3}} that best fits c_v in MSE.
3247
- * Closed-form solution from normal equations:
3248
- *
3249
- * d_new = (3·c_3 + c_2 − c_1 − 3·c_0) / 10
3250
- * m_new = (−7·c_0 − 4·c_1 − c_2 + 2·c_3) / 10
3251
- *
3252
- * (Constants derived from Σq=6, Σq²=14, 4 codes total.)
3253
- * 4. Re-quantise; repeat until (d, m) stop changing.
3254
- *
3255
- * The arithmetic-progression projection is the key constraint that
3256
- * keeps the output in valid Q2_K format. In unconstrained Lloyd-Max,
3257
- * the 4 centroids could be placed freely; here they must sit on an
3258
- * AP determined by (d, m), which is exactly what Q2_K stores.
3259
- *
3260
- * Operating per sub-block: we refine (d_sub_j, m_sub_j) = (d·Ls_j,
3261
- * m·Lm_j), then re-project onto integer (Ls, Lm) ∈ [0,15]. The
3262
- * integer rounding can hurt, so we only accept the refined values
3263
- * if they reduce the sub-block's weighted MSE.
3264
- *
3265
- * This is a genuine refinement on top of the grid search: the grid
3266
- * search minimises element-wise MSE assuming uniform spacing is
3267
- * locked in; Lloyd-Max iterates toward distribution-optimal spacing
3268
- * given the actual empirical centroids.
3269
- * ══════════════════════════════════════════════════════════════ */
3270
- for (int j = 0; j < N_SUB; j++) {
3271
- const float *sx = adj_block_x + 16 * j;
3272
- uint8_t Ls_cur = Ls_blk[j];
3273
- uint8_t Lm_cur = Lm_blk[j];
3274
-
3275
- /* Baseline MSE for current (Ls, Lm) — only accept if we beat this */
3276
- float baseline_err = 0.0f;
3277
- {
3278
- float d_sub = dm * (float)Ls_cur;
3279
- float m_sub = mm * (float)Lm_cur;
3280
- for (int k = 0; k < 16; k++) {
3281
- float w_imp = (imat_importance)
3282
- ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3283
- int q;
3284
- if (d_sub < 1e-15f) { q = 0; }
3285
- else {
3286
- q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
3287
- if (q < 0) q = 0; if (q > 3) q = 3;
3288
- }
3289
- float deq = d_sub * (float)q - m_sub;
3290
- float diff = sx[k] - deq;
3291
- baseline_err += diff * diff * w_imp;
3292
- }
3293
- }
3294
-
3295
- /* Lloyd-Max iteration on (d_sub, m_sub) */
3296
- float d_sub = dm * (float)Ls_cur;
3297
- float m_sub = mm * (float)Lm_cur;
3298
- float d_sub_best = d_sub, m_sub_best = m_sub;
3299
- float lloyd_err = baseline_err;
3300
-
3301
- const int MAX_LLOYD_ITERS = 6;
3302
- for (int it = 0; it < MAX_LLOYD_ITERS; it++) {
3303
- if (d_sub < 1e-15f) break;
3304
-
3305
- /* Step 1+2: assign and accumulate weighted centroids */
3306
- double sum_v[4] = {0.0, 0.0, 0.0, 0.0};
3307
- double cnt_v[4] = {0.0, 0.0, 0.0, 0.0};
3308
- for (int k = 0; k < 16; k++) {
3309
- float w_imp = (imat_importance)
3310
- ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3311
- int q = gguf_nearest_int((sx[k] + m_sub) / d_sub);
3312
- if (q < 0) q = 0; if (q > 3) q = 3;
3313
- sum_v[q] += (double)sx[k] * (double)w_imp;
3314
- cnt_v[q] += (double)w_imp;
3315
- }
3316
-
3317
- /* Fill empty bins with extrapolation from neighbours to avoid
3318
- * degenerate centroids when a code is unused */
3319
- double c[4];
3320
- int n_empty = 0;
3321
- for (int v = 0; v < 4; v++) {
3322
- if (cnt_v[v] > 1e-15) {
3323
- c[v] = sum_v[v] / cnt_v[v];
3324
- } else {
3325
- c[v] = (double)(d_sub * (float)v - m_sub); /* fallback to current AP */
3326
- n_empty++;
3327
- }
3328
- }
3329
- if (n_empty >= 3) break; /* distribution too sparse — give up */
3330
-
3331
- /* Step 3: AP projection — closed form for arithmetic progression
3332
- * minimising Σ_v (c_v − (d·v − m))² */
3333
- float d_new = (float)((3.0*c[3] + c[2] - c[1] - 3.0*c[0]) / 10.0);
3334
- float m_new = (float)((-7.0*c[0] - 4.0*c[1] - c[2] + 2.0*c[3]) / 10.0);
3335
- if (d_new <= 1e-15f) break;
3336
- if (m_new < 0.0f) m_new = 0.0f; /* keep m non-negative */
3337
-
3338
- /* Step 4: project onto integer (Ls, Lm) and evaluate */
3339
- int Ls_try = (dm > 1e-15f) ? gguf_nearest_int(d_new / dm) : Ls_cur;
3340
- int Lm_try = (mm > 1e-15f) ? gguf_nearest_int(m_new / mm) : Lm_cur;
3341
- if (Ls_try < 1) Ls_try = 1;
3342
- if (Ls_try > 15) Ls_try = 15;
3343
- if (Lm_try < 0) Lm_try = 0;
3344
- if (Lm_try > 15) Lm_try = 15;
3345
-
3346
- float d_sub_try = dm * (float)Ls_try;
3347
- float m_sub_try = mm * (float)Lm_try;
3348
-
3349
- float try_err = 0.0f;
3350
- for (int k = 0; k < 16; k++) {
3351
- float w_imp = (imat_importance)
3352
- ? imat_importance[blk * QK_K + 16*j + k] : 1.0f;
3353
- int q;
3354
- if (d_sub_try < 1e-15f) { q = 0; }
3355
- else {
3356
- q = gguf_nearest_int((sx[k] + m_sub_try) / d_sub_try);
3357
- if (q < 0) q = 0; if (q > 3) q = 3;
3358
- }
3359
- float deq = d_sub_try * (float)q - m_sub_try;
3360
- float diff = sx[k] - deq;
3361
- try_err += diff * diff * w_imp;
3362
- }
3363
-
3364
- /* Only accept if strictly improves; this is our safety net */
3365
- if (try_err < lloyd_err) {
3366
- lloyd_err = try_err;
3367
- d_sub_best = d_sub_try;
3368
- m_sub_best = m_sub_try;
3369
- Ls_cur = (uint8_t)Ls_try;
3370
- Lm_cur = (uint8_t)Lm_try;
3371
- d_sub = d_sub_try;
3372
- m_sub = m_sub_try;
3373
- } else {
3374
- /* Converged or projection rounding hurt — stop */
3375
- break;
3376
- }
3377
- }
3378
-
3379
- if (lloyd_err < baseline_err) {
3380
- Ls_blk[j] = Ls_cur;
3381
- Lm_blk[j] = Lm_cur;
3382
- }
3383
- }
3384
-
3385
  output[blk].d = gguf_fp32_to_fp16(dm);
3386
  output[blk].dmin = gguf_fp32_to_fp16(mm);
3387
 
@@ -3602,8 +3301,6 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3602
  free(_tl_graphs);
3603
 
3604
  free(block_dc_bias);
3605
- free(block_s_left);
3606
- free(block_s_right);
3607
  free(seeds);
3608
  free(candidate_errors);
3609
  free(candidate_d);
 
2732
  }
2733
 
2734
  /* ══════════════════════════════════════════════════════════════════
2735
+ * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
2736
  *
2737
+ * Transforms the tensor from a collection of isolated 256-element
2738
+ * Q2_K superblocks into a single, continuous error-cancelling waveform.
2739
  *
2740
+ * After Phase 3 has selected the optimal (d, dmin) candidate for every
2741
+ * block, this sequential pass computes the net DC residual left by each
2742
+ * block using a cheap round-nearest forward quantization, then feeds the
2743
+ * negated, exponentially-decayed residual as a correction bias into the
2744
+ * WLS solver of the immediately following block.
2745
  *
2746
+ * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
 
 
 
 
2747
  *
2748
+ * dc_bias[N+1] = DC_DECAY × R_N / QK_K (per-element offset)
2749
  *
2750
+ * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
2751
+ * quantizer toward codes whose reconstruction deq ≈ x′, so that
 
 
 
2752
  *
2753
+ * Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = DC_DECAY × R_N
2754
  *
2755
+ * The accumulated cross-block DC collapses geometrically:
2756
  *
2757
+ * R₀, DC_DECAY·R₀, DC_DECAY²·R₀, … → 0
 
 
 
2758
  *
2759
+ * The result is written into block_dc_bias[n_blocks]. Phase 4 reads
2760
+ * this array (safe: written sequentially before the parallel loop).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2761
  * ══════════════════════════════════════════════════════════════════ */
2762
 
2763
+ #define DC_DECAY 0.85f /* Boundary-condition leak factor (0 = isolated, 1 = full) */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2764
 
2765
+ float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
 
 
2766
 
2767
+ if (block_dc_bias) {
2768
  float rolling_dc = 0.0f;
2769
 
2770
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2771
+ const float *bx = weights + blk * QK_K;
2772
  int cidx = best_candidate[blk];
2773
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2774
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2775
 
2776
+ /* Bias applied to THIS block's WLS targets */
2777
+ float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2778
+ block_dc_bias[blk] = dc_bias;
2779
 
2780
+ /* Quick round-nearest quant to estimate DC residual for NEXT block.
2781
+ * We quantize the adjusted target x′ = x − dc_bias, then measure
2782
+ * the residual of the ORIGINAL weight against the chosen code. */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2783
  float dc_res = 0.0f;
2784
  int j, k;
2785
  for (j = 0; j < N_SUB; j++) {
2786
  float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
2787
  float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
 
2788
  for (k = 0; k < 16; k++) {
2789
+ float x_adj = bx[16*j + k] - dc_bias;
 
 
 
2790
  int q = 0;
2791
  if (d_sub >= 1e-15f) {
2792
  q = gguf_nearest_int((x_adj + m_sub) / d_sub);
2793
+ if (q < 0) q = 0;
2794
+ if (q > 3) q = 3;
2795
  }
2796
+ float deq = d_sub * (float)q - m_sub;
2797
+ /* Residual against ORIGINAL weight (not adjusted) */
2798
+ dc_res += bx[16*j + k] - deq;
2799
  }
2800
  }
2801
  rolling_dc = dc_res;
 
2820
  int cidx = best_candidate[blk];
2821
  uint8_t Ls_blk[16], Lm_blk[16];
2822
 
2823
+ /* ── Rolling DC boundary condition ──────────────────────────────
2824
+ * dc_adj shifts every WLS target in this block so that the net
2825
+ * quantisation error steers toward cancelling the previous block's
2826
+ * DC residual (written by the sequential Phase 3.9 pre-pass). */
2827
+ float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
 
 
 
 
 
 
 
 
2828
 
2829
+ /* Adjusted weight view — WLS and Shor work on this array;
2830
+ * the final error is always reported against the original block_x. */
2831
  float adj_block_x[QK_K];
2832
  {
2833
  int _i;
2834
  for (_i = 0; _i < QK_K; _i++)
2835
+ adj_block_x[_i] = block_x[_i] - dc_adj;
 
 
 
2836
  }
2837
 
2838
  memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
 
3081
  Lm_blk[j] = best_lm;
3082
  }
3083
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3084
  output[blk].d = gguf_fp32_to_fp16(dm);
3085
  output[blk].dmin = gguf_fp32_to_fp16(mm);
3086
 
 
3301
  free(_tl_graphs);
3302
 
3303
  free(block_dc_bias);
 
 
3304
  free(seeds);
3305
  free(candidate_errors);
3306
  free(candidate_d);