CompressedGemma committed on
Commit
2432d03
·
verified ·
1 Parent(s): 28f242e
Files changed (1) hide show
  1. hexstate_quantize.c +192 -42
hexstate_quantize.c CHANGED
@@ -1,5 +1,5 @@
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
- * hexstate_quantize.c — HexState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
@@ -2732,70 +2732,209 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2732
  }
2733
 
2734
  /* ══════════════════════════════════════════════════════════════════
2735
- * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
2736
  *
2737
- * Transforms the tensor from a collection of isolated 256-element
2738
- * Q2_K superblocks into a single, continuous error-cancelling waveform.
2739
  *
2740
- * After Phase 3 has selected the optimal (d, dmin) candidate for every
2741
- * block, this sequential pass computes the net DC residual left by each
2742
- * block using a cheap round-nearest forward quantization, then feeds the
2743
- * negated, exponentially-decayed residual as a correction bias into the
2744
- * WLS solver of the immediately following block.
2745
  *
2746
- * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
 
 
 
 
2747
  *
2748
- * dc_bias[N+1] = −DC_DECAY × R_N / QK_K (per-element offset)
2749
  *
2750
- * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
2751
- * quantizer toward codes whose reconstruction deq ≈ x′, so that
 
 
 
2752
  *
2753
- * Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = DC_DECAY × R_N
2754
  *
2755
- * The accumulated cross-block DC collapses geometrically:
2756
  *
2757
- * R₀, DC_DECAY·R₀, DC_DECAY²·R₀, … → 0
 
 
 
2758
  *
2759
- * The result is written into block_dc_bias[n_blocks]. Phase 4 reads
2760
- * this array (safe: written sequentially before the parallel loop).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2761
  * ══════════════════════════════════════════════════════════════════ */
2762
 
2763
- #define DC_DECAY 0.85f /* Boundary-condition leak factor (0 = isolated, 1 = full) */
 
 
2764
 
2765
- float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
 
 
 
 
 
 
 
 
 
 
 
 
 
2766
 
2767
- if (block_dc_bias) {
 
 
 
 
2768
  float rolling_dc = 0.0f;
2769
 
2770
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2771
- const float *bx = weights + blk * QK_K;
2772
  int cidx = best_candidate[blk];
2773
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2774
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2775
 
2776
- /* Bias applied to THIS block's WLS targets */
2777
- float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2778
- block_dc_bias[blk] = dc_bias;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2779
 
2780
- /* Quick round-nearest quant to estimate DC residual for NEXT block.
2781
- * We quantize the adjusted target x′ = x − dc_bias, then measure
2782
- * the residual of the ORIGINAL weight against the chosen code. */
2783
  float dc_res = 0.0f;
2784
  int j, k;
2785
  for (j = 0; j < N_SUB; j++) {
2786
  float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
2787
  float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
 
2788
  for (k = 0; k < 16; k++) {
2789
- float x_adj = bx[16*j + k] - dc_bias;
 
 
 
2790
  int q = 0;
2791
  if (d_sub >= 1e-15f) {
2792
  q = gguf_nearest_int((x_adj + m_sub) / d_sub);
2793
- if (q < 0) q = 0;
2794
- if (q > 3) q = 3;
2795
  }
2796
- float deq = d_sub * (float)q - m_sub;
2797
- /* Residual against ORIGINAL weight (not adjusted) */
2798
- dc_res += bx[16*j + k] - deq;
2799
  }
2800
  }
2801
  rolling_dc = dc_res;
@@ -2820,19 +2959,28 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
2820
  int cidx = best_candidate[blk];
2821
  uint8_t Ls_blk[16], Lm_blk[16];
2822
 
2823
- /* ── Rolling DC boundary condition ──────────────────────────────
2824
- * dc_adj shifts every WLS target in this block so that the net
2825
- * quantisation error steers toward cancelling the previous block's
2826
- * DC residual (written by the sequential Phase 3.9 pre-pass). */
2827
- float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
 
 
 
 
 
 
 
 
2828
 
2829
- /* Adjusted weight view — WLS and Shor work on this array;
2830
- * the final error is always reported against the original block_x. */
2831
  float adj_block_x[QK_K];
2832
  {
2833
  int _i;
2834
  for (_i = 0; _i < QK_K; _i++)
2835
- adj_block_x[_i] = block_x[_i] - dc_adj;
 
 
 
2836
  }
2837
 
2838
  memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
@@ -3301,6 +3449,8 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
3301
  free(_tl_graphs);
3302
 
3303
  free(block_dc_bias);
 
 
3304
  free(seeds);
3305
  free(candidate_errors);
3306
  free(candidate_d);
 
1
  /* ═══════════════════════════════════════════════════════════════════════════
2
+ * hexstate_quantize.c — HexState GGUF Quantizer
3
  *
4
  * ╔═══════════════════════════════════════════════════════════════╗
5
  * ║ HPC-Optimized GGUF Quantization Engine ║
 
2732
  }
2733
 
2734
  /* ══════════════════════════════════════════════════════════════════
2735
+ * PHASE 3.9 — HOLOGRAPHIC BOUNDARY RECONSTRUCTION
2736
  *
2737
+ * Proper implementation of:
 
2738
  *
2739
+ * W = Σᵢ ( Proj_n̂(Grad(V)) · Quantize(I_boundary) ) ⊗ Trans(Δτ)
 
 
 
 
2740
  *
2741
+ * W is NOT a correction to an existing reconstruction — it IS the
2742
+ * reconstruction, expressed as a sum over block-boundary contributions.
2743
+ * Each block-boundary i produces a scalar signal that is then lifted
2744
+ * via ⊗ Trans(Δτ) into a vector that decays into the interior of the
2745
+ * blocks on either side of that boundary.
2746
  *
2747
+ * ── Term-by-term derivation ─────────────────────────────────────
2748
  *
2749
+ * Proj_n̂(Grad(V)) at boundary i
2750
+ * The gradient of the original weight tensor V projected onto
2751
+ * n̂, the unit normal to the block boundary. In the 1-D block-
2752
+ * sequence space n̂ points in the inter-block direction, so this
2753
+ * equals the cross-boundary finite difference:
2754
  *
2755
+ * g_i = w[i·QK_K] − w[i·QK_K − 1]
2756
  *
2757
+ * (first weight of block i minus last weight of block i-1)
2758
  *
2759
+ * Quantize(I_boundary) at boundary i
2760
+ * The boundary information I_boundary = the weight value at the
2761
+ * boundary element, expressed through the quantizer. Computed
2762
+ * from Phase 1 seeds for the boundary sub-block:
2763
  *
2764
+ * Q_i = d_sub · round((w_boundary + m_sub) / d_sub) − m_sub
2765
+ *
2766
+ * This is not a step size or ratio — it is the actual dequantized
2767
+ * value of the boundary weight.
2768
+ *
2769
+ * Signal s_i = Proj_n̂(Grad(V)) · Quantize(I_boundary)
2770
+ * Scalar product at boundary i. Captures the signed energy of
2771
+ * the weight function at that boundary in the quantized domain.
2772
+ * Units: weight². Normalised by d_sub² to become dimensionless.
2773
+ *
2774
+ * ⊗ Trans(Δτ)
2775
+ * The tensor product lifts the scalar s_i into a vector over
2776
+ * the 256-element interior of the adjacent blocks. Trans(Δτ)
2777
+ * is parameterised by Δτ = d_{i-1}/d_i (the scale ratio between
2778
+ * adjacent blocks), which re-projects s_i from block i-1's
2779
+ * quantization space into block i's:
2780
+ *
2781
+ * right-propagation into block i at position k:
2782
+ * s_i · (d_{i-1}/d_i) · exp(−k / τ)
2783
+ *
2784
+ * left-propagation into block i-1 at position k:
2785
+ * s_i · (d_i/d_{i-1}) · exp(−(QK_K−1−k) / τ)
2786
+ *
2787
+ * The full reconstruction for block b at position k:
2788
+ *
2789
+ * W[b][k] = x[b][k] (original weights, not replaced)
2790
+ * − dc_bias[b] (zeroth-moment error correction, unchanged)
2791
+ * + s_left[b] · (d_{b-1}/d_b) · exp(−k/τ)
2792
+ * + s_right[b] · (d_{b+1}/d_b) · exp(−(QK_K−1−k)/τ)
2793
+ *
2794
+ * where s_left[b] = signal from boundary b (left edge of block b)
2795
+ * s_right[b] = signal from boundary b+1 (right edge of block b)
2796
+ *
2797
+ * The DC bias is subtracted (it is an error correction, not a signal).
2798
+ * The boundary signals are added (they encode the holographic surface).
2799
+ *
2800
+ * ── Implementation ──────────────────────────────────────────────
2801
+ *
2802
+ * Pre-pass (sequential) computes per block:
2803
+ * block_dc_bias[b] — scalar DC offset (existing, unchanged)
2804
+ * block_s_left[b] — left boundary signal (normalised, pre-scaled)
2805
+ * block_s_right[b] — right boundary signal (normalised, pre-scaled)
2806
+ *
2807
+ * Phase 4 (parallel) applies:
2808
+ * adj_x[k] = x[k] − dc_bias + s_left·fwd_decay[k] + s_right·rev_decay[k]
2809
+ *
2810
+ * Two precomputed decay tables (filled once before the block loop; only read in Phase 4):
2811
+ * boundary_decay[k] = exp(−k / τ) forward (left → interior)
2812
+ * boundary_decay_rev[k] = exp(−(255−k) / τ) reversed (right → interior)
2813
  * ══════════════════════════════════════════════════════════════════ */
2814
 
2815
+ #define DC_DECAY 0.85f /* DC residual leak factor */
2816
+ #define HOLO_TAU 32.0f /* Boundary signal decay length (elements) */
2817
+ #define HOLO_ALPHA 0.20f /* Boundary signal weight (fraction of one step) */
2818
 
2819
+ /* Precompute forward and reverse decay tables — read-only in Phase 4. */
2820
+ static float boundary_decay [QK_K];
2821
+ static float boundary_decay_rev[QK_K];
2822
+ {
2823
+ static int _decay_init = 0;
2824
+ if (!_decay_init) {
2825
+ int _dk;
2826
+ for (_dk = 0; _dk < QK_K; _dk++) {
2827
+ boundary_decay [_dk] = expf(-(float)_dk / HOLO_TAU);
2828
+ boundary_decay_rev[_dk] = expf(-(float)(QK_K-1-_dk)/ HOLO_TAU);
2829
+ }
2830
+ _decay_init = 1;
2831
+ }
2832
+ }
2833
 
2834
+ float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
2835
+ float *block_s_left = (float *)calloc(n_blocks, sizeof(float));
2836
+ float *block_s_right = (float *)calloc(n_blocks, sizeof(float));
2837
+
2838
+ if (block_dc_bias && block_s_left && block_s_right) {
2839
  float rolling_dc = 0.0f;
2840
 
2841
  for (int64_t blk = 0; blk < n_blocks; blk++) {
2842
+ const float *bx = weights + blk * QK_K;
2843
  int cidx = best_candidate[blk];
2844
  float dm0 = gguf_fp16_to_fp32(candidate_d [blk][cidx]);
2845
  float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
2846
 
2847
+ /* ── DC bias (zeroth moment, existing) ── */
2848
+ float dc_bias = (DC_DECAY * rolling_dc) / (float)QK_K;
2849
+ block_dc_bias[blk] = dc_bias;
2850
+
2851
+ /* ── Left boundary signal: boundary between block blk-1 and blk ──
2852
+ *
2853
+ * Proj_n̂(Grad(V)): cross-boundary finite difference g_left
2854
+ * Quantize(I_boundary): dequantized value of bx[0] using Phase 1
2855
+ * seeds for the first sub-block (j=0) of block blk.
2856
+ * s_left = g_left × Q(bx[0]) / d_sub² (dimensionless)
2857
+ * Pre-scaled by HOLO_ALPHA × d_sub × (d_{blk-1}/d_{blk}) */
2858
+ {
2859
+ float g_left = (blk > 0)
2860
+ ? bx[0] - weights[(blk - 1) * QK_K + QK_K - 1]
2861
+ : 0.0f;
2862
+
2863
+ /* Quantize(I_boundary) for left edge: sub-block j=0 */
2864
+ float d_sub_l = dm0 * (float)seeds[blk].Ls[0];
2865
+ float m_sub_l = mm0 * (float)seeds[blk].Lm[0];
2866
+ float q_val_l = 0.0f;
2867
+ if (d_sub_l > 1e-15f) {
2868
+ int qi = gguf_nearest_int((bx[0] + m_sub_l) / d_sub_l);
2869
+ if (qi < 0) qi = 0; if (qi > 3) qi = 3;
2870
+ q_val_l = d_sub_l * (float)qi - m_sub_l;
2871
+ }
2872
+
2873
+ /* Scale ratio Trans(Δτ): d_{blk-1} / d_{blk} */
2874
+ float d_prev = (blk > 0 && seeds[blk-1].dm > 1e-15f)
2875
+ ? seeds[blk-1].dm : dm0;
2876
+ float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
2877
+ float scale_ratio_l = d_prev / d_curr;
2878
+ if (scale_ratio_l < 0.1f) scale_ratio_l = 0.1f;
2879
+ if (scale_ratio_l > 10.f) scale_ratio_l = 10.f;
2880
+
2881
+ /* Normalise s = (g × Q) / d² then re-scale to weight units */
2882
+ float d2 = d_sub_l * d_sub_l;
2883
+ float s = (d2 > 1e-30f) ? (g_left * q_val_l / d2) : 0.0f;
2884
+ block_s_left[blk] = HOLO_ALPHA * s * d_sub_l * scale_ratio_l;
2885
+ }
2886
+
2887
+ /* ── Right boundary signal: boundary between block blk and blk+1 ──
2888
+ *
2889
+ * Same derivation but at the right edge (position QK_K-1,
2890
+ * sub-block j = N_SUB-1) looking into block blk+1. */
2891
+ {
2892
+ float g_right = (blk + 1 < n_blocks)
2893
+ ? weights[(blk + 1) * QK_K] - bx[QK_K - 1]
2894
+ : 0.0f;
2895
+
2896
+ /* Quantize(I_boundary) for right edge: sub-block j=N_SUB-1 */
2897
+ float d_sub_r = dm0 * (float)seeds[blk].Ls[N_SUB - 1];
2898
+ float m_sub_r = mm0 * (float)seeds[blk].Lm[N_SUB - 1];
2899
+ float q_val_r = 0.0f;
2900
+ if (d_sub_r > 1e-15f) {
2901
+ int qi = gguf_nearest_int((bx[QK_K-1] + m_sub_r) / d_sub_r);
2902
+ if (qi < 0) qi = 0; if (qi > 3) qi = 3;
2903
+ q_val_r = d_sub_r * (float)qi - m_sub_r;
2904
+ }
2905
+
2906
+ /* Scale ratio Trans(Δτ): d_{blk+1} / d_{blk} */
2907
+ float d_next = (blk + 1 < n_blocks && seeds[blk+1].dm > 1e-15f)
2908
+ ? seeds[blk+1].dm : dm0;
2909
+ float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
2910
+ float scale_ratio_r = d_next / d_curr;
2911
+ if (scale_ratio_r < 0.1f) scale_ratio_r = 0.1f;
2912
+ if (scale_ratio_r > 10.f) scale_ratio_r = 10.f;
2913
+
2914
+ float d2 = d_sub_r * d_sub_r;
2915
+ float s = (d2 > 1e-30f) ? (g_right * q_val_r / d2) : 0.0f;
2916
+ block_s_right[blk] = HOLO_ALPHA * s * d_sub_r * scale_ratio_r;
2917
+ }
2918
 
2919
+ /* ── DC residual for the next block's rolling_dc ── */
 
 
2920
  float dc_res = 0.0f;
2921
  int j, k;
2922
  for (j = 0; j < N_SUB; j++) {
2923
  float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
2924
  float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
2925
+ int base = 16 * j;
2926
  for (k = 0; k < 16; k++) {
2927
+ int elem = base + k;
2928
+ float x_adj = bx[elem] - dc_bias
2929
+ + block_s_left [blk] * boundary_decay [elem]
2930
+ + block_s_right[blk] * boundary_decay_rev[elem];
2931
  int q = 0;
2932
  if (d_sub >= 1e-15f) {
2933
  q = gguf_nearest_int((x_adj + m_sub) / d_sub);
2934
+ if (q < 0) q = 0; if (q > 3) q = 3;
 
2935
  }
2936
+ float deq = d_sub * (float)q - m_sub;
2937
+ dc_res += bx[elem] - deq; /* residual vs ORIGINAL weight */
 
2938
  }
2939
  }
2940
  rolling_dc = dc_res;
 
2959
  int cidx = best_candidate[blk];
2960
  uint8_t Ls_blk[16], Lm_blk[16];
2961
 
2962
+ /* ── Holographic boundary reconstruction (Phase 3.9 → Phase 4) ──
2963
+ *
2964
+ * W[b][k] = x[b][k]
2965
+ * − dc_bias[b] (DC error correction)
2966
+ * + s_left[b] · exp(−k/τ) (left boundary signal)
2967
+ * + s_right[b] · exp(−(QK_K−1−k)/τ) (right boundary signal)
2968
+ *
2969
+ * The two boundary signals decay inward from opposite edges and meet
2970
+ * in the middle. Together they enforce C¹ continuity across every
2971
+ * block boundary in the quantized domain. */
2972
+ float dc_adj = (block_dc_bias) ? block_dc_bias [blk] : 0.0f;
2973
+ float s_left = (block_s_left) ? block_s_left [blk] : 0.0f;
2974
+ float s_right = (block_s_right) ? block_s_right [blk] : 0.0f;
2975
 
 
 
2976
  float adj_block_x[QK_K];
2977
  {
2978
  int _i;
2979
  for (_i = 0; _i < QK_K; _i++)
2980
+ adj_block_x[_i] = block_x[_i]
2981
+ - dc_adj
2982
+ + s_left * boundary_decay [_i]
2983
+ + s_right * boundary_decay_rev[_i];
2984
  }
2985
 
2986
  memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
 
3449
  free(_tl_graphs);
3450
 
3451
  free(block_dc_bias);
3452
+ free(block_s_left);
3453
+ free(block_s_right);
3454
  free(seeds);
3455
  free(candidate_errors);
3456
  free(candidate_d);