CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed on 19 days ago

Commit

2432d03

verified ·

1 Parent(s): 28f242e

ALPHA

Browse files

Files changed (1) hide show

hexstate_quantize.c +192 -42

hexstate_quantize.c CHANGED Viewed

@@ -1,5 +1,5 @@
 /* ═══════════════════════════════════════════════════════════════════════════
- * hexstate_quantize.c — HexState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
@@ -2732,70 +2732,209 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     }
     /* ══════════════════════════════════════════════════════════════════
-     * PHASE 3.9 — ROLLING DC BOUNDARY CONDITION PRE-PASS
      *
-     * Transforms the tensor from a collection of isolated 256-element
-     * Q2_K superblocks into a single, continuous error-cancelling waveform.
      *
-     * After Phase 3 has selected the optimal (d, dmin) candidate for every
-     * block, this sequential pass computes the net DC residual left by each
-     * block using a cheap round-nearest forward quantization, then feeds the
-     * negated, exponentially-decayed residual as a correction bias into the
-     * WLS solver of the immediately following block.
      *
-     * Mathematically, for block N with final DC residual R_N = Σ εᵢ:
      *
-     *   dc_bias[N+1] = −DC_DECAY × R_N / QK_K      (per-element offset)
      *
-     * Block N+1's WLS targets become x′ᵢ = xᵢ − dc_bias[N+1], steering the
-     * quantizer toward codes whose reconstruction deq ≈ x′, so that
      *
-     *   Σ (xᵢ − deqᵢ) ≈ dc_bias[N+1] × QK_K = −DC_DECAY × R_N
      *
-     * The accumulated cross-block DC collapses geometrically:
      *
-     *   R₀, DC_DECAY·R₀, DC_DECAY²·R₀, …  → 0
      *
-     * The result is written into block_dc_bias[n_blocks].  Phase 4 reads
-     * this array (safe: written sequentially before the parallel loop).
      * ══════════════════════════════════════════════════════════════════ */
-    #define DC_DECAY 0.85f   /* Boundary-condition leak factor (0 = isolated, 1 = full) */
-    float *block_dc_bias = (float *)calloc(n_blocks, sizeof(float));
-    if (block_dc_bias) {
         float rolling_dc = 0.0f;
         for (int64_t blk = 0; blk < n_blocks; blk++) {
-            const float *bx  = weights + blk * QK_K;
             int          cidx = best_candidate[blk];
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
-            /* Bias applied to THIS block's WLS targets */
-            float dc_bias       = (DC_DECAY * rolling_dc) / (float)QK_K;
-            block_dc_bias[blk]  = dc_bias;
-            /* Quick round-nearest quant to estimate DC residual for NEXT block.
-             * We quantize the adjusted target x′ = x − dc_bias, then measure
-             * the residual of the ORIGINAL weight against the chosen code. */
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
                 float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
                 float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
                 for (k = 0; k < 16; k++) {
-                    float x_adj = bx[16*j + k] - dc_bias;
                     int q = 0;
                     if (d_sub >= 1e-15f) {
                         q = gguf_nearest_int((x_adj + m_sub) / d_sub);
-                        if (q < 0) q = 0;
-                        if (q > 3) q = 3;
                     }
-                    float deq = d_sub * (float)q - m_sub;
-                    /* Residual against ORIGINAL weight (not adjusted) */
-                    dc_res += bx[16*j + k] - deq;
                 }
             }
             rolling_dc = dc_res;
@@ -2820,19 +2959,28 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
         int cidx = best_candidate[blk];
         uint8_t Ls_blk[16], Lm_blk[16];
-        /* ── Rolling DC boundary condition ──────────────────────────────
-         * dc_adj shifts every WLS target in this block so that the net
-         * quantisation error steers toward cancelling the previous block's
-         * DC residual (written by the sequential Phase 3.9 pre-pass). */
-        float dc_adj = (block_dc_bias) ? block_dc_bias[blk] : 0.0f;
-        /* Adjusted weight view — WLS and Shor work on this array;
-         * the final error is always reported against the original block_x. */
         float adj_block_x[QK_K];
         {
             int _i;
             for (_i = 0; _i < QK_K; _i++)
-                adj_block_x[_i] = block_x[_i] - dc_adj;
         }
         memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
@@ -3301,6 +3449,8 @@ static void quantize_tensor_q2k_hpc(const float *weights, int64_t n_elements,
     free(_tl_graphs);
     free(block_dc_bias);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);

 /* ═══════════════════════════════════════════════════════════════════════════
+ * hexstate_quantize.c — HExState GGUF Quantizer
  *
  * ╔═══════════════════════════════════════════════════════════════╗
  * ║  HPC-Optimized GGUF Quantization Engine                      ║
     }
     /* ══════════════════════════════════════════════════════════════════
+     * PHASE 3.9 — HOLOGRAPHIC BOUNDARY RECONSTRUCTION
      *
+     * Proper implementation of:
      *
+     *   W = Σᵢ ( Proj_n̂(Grad(V)) · Quantize(I_boundary) ) ⊗ Trans(Δτ)
      *
+     * W is NOT a correction to an existing reconstruction — it IS the
+     * reconstruction, expressed as a sum over block-boundary contributions.
+     * Each block-boundary i produces a scalar signal that is then lifted
+     * via ⊗ Trans(Δτ) into a vector that decays into the interior of the
+     * blocks on either side of that boundary.
      *
+     * ── Term-by-term derivation ─────────────────────────────────────
      *
+     *  Proj_n̂(Grad(V)) at boundary i
+     *      The gradient of the original weight tensor V projected onto
+     *      n̂, the unit normal to the block boundary.  In the 1-D block-
+     *      sequence space n̂ points in the inter-block direction, so this
+     *      equals the cross-boundary finite difference:
      *
+     *        g_i = w[i·QK_K] − w[i·QK_K − 1]
      *
+     *      (first weight of block i minus last weight of block i-1)
      *
+     *  Quantize(I_boundary) at boundary i
+     *      The boundary information I_boundary = the weight value at the
+     *      boundary element, expressed through the quantizer.  Computed
+     *      from Phase 1 seeds for the boundary sub-block:
      *
+     *        Q_i = d_sub · round((w_boundary + m_sub) / d_sub) − m_sub
+     *
+     *      This is not a step size or ratio — it is the actual dequantized
+     *      value of the boundary weight.
+     *
+     *  Signal s_i = Proj_n̂(Grad(V)) · Quantize(I_boundary)
+     *      Scalar product at boundary i.  Captures the signed energy of
+     *      the weight function at that boundary in the quantized domain.
+     *      Units: weight².  Normalised by d_sub² to become dimensionless.
+     *
+     *  ⊗ Trans(Δτ)
+     *      The tensor product lifts the scalar s_i into a vector over
+     *      the 256-element interior of the adjacent blocks.  Trans(Δτ)
+     *      is parameterised by Δτ = d_{i-1}/d_i (the scale ratio between
+     *      adjacent blocks), which re-projects s_i from block i-1's
+     *      quantization space into block i's:
+     *
+     *        right-propagation into block i at position k:
+     *          s_i · (d_{i-1}/d_i) · exp(−k / τ)
+     *
+     *        left-propagation into block i-1 at position k:
+     *          s_i · (d_i/d_{i-1}) · exp(−(QK_K−1−k) / τ)
+     *
+     *  The full reconstruction for block b at position k:
+     *
+     *    W[b][k] = x[b][k]           (original weights, not replaced)
+     *            − dc_bias[b]        (zeroth-moment error correction, unchanged)
+     *            + s_left[b]  · (d_{b-1}/d_b) · exp(−k/τ)
+     *            + s_right[b] · (d_{b+1}/d_b) · exp(−(QK_K−1−k)/τ)
+     *
+     *  where s_left[b]  = signal from boundary b   (left  edge of block b)
+     *        s_right[b] = signal from boundary b+1 (right edge of block b)
+     *
+     *  The DC bias is subtracted (it is an error correction, not a signal).
+     *  The boundary signals are added (they encode the holographic surface).
+     *
+     * ── Implementation ──────────────────────────────────────────────
+     *
+     *  Pre-pass (sequential) computes per block:
+     *    block_dc_bias[b]  — scalar DC offset (existing, unchanged)
+     *    block_s_left[b]   — left  boundary signal (normalised, pre-scaled)
+     *    block_s_right[b]  — right boundary signal (normalised, pre-scaled)
+     *
+     *  Phase 4 (parallel) applies:
+     *    adj_x[k] = x[k] − dc_bias + s_left·fwd_decay[k] + s_right·rev_decay[k]
+     *
+     *  Two precomputed decay tables (initialised once on first call behind an unsynchronised static flag; read-only afterwards):
+     *    boundary_decay[k]     = exp(−k         / τ)  forward  (left → interior)
+     *    boundary_decay_rev[k] = exp(−(QK_K−1−k) / τ)  reversed (right → interior)
      * ══════════════════════════════════════════════════════════════════ */
+    #define DC_DECAY   0.85f  /* DC residual leak factor                       */
+    #define HOLO_TAU   32.0f  /* Boundary signal decay length (elements)       */
+    #define HOLO_ALPHA 0.20f  /* Boundary signal weight (fraction of one step) */
+    /* Precompute forward and reverse decay tables — read-only in Phase 4. */
+    static float boundary_decay    [QK_K];
+    static float boundary_decay_rev[QK_K];
+    {
+        static int _decay_init = 0;
+        if (!_decay_init) {
+            int _dk;
+            for (_dk = 0; _dk < QK_K; _dk++) {
+                boundary_decay    [_dk]         = expf(-(float)_dk         / HOLO_TAU);
+                boundary_decay_rev[_dk]         = expf(-(float)(QK_K-1-_dk)/ HOLO_TAU);
+            }
+            _decay_init = 1;
+        }
+    }
+    float *block_dc_bias  = (float *)calloc(n_blocks, sizeof(float));
+    float *block_s_left   = (float *)calloc(n_blocks, sizeof(float));
+    float *block_s_right  = (float *)calloc(n_blocks, sizeof(float));
+    if (block_dc_bias && block_s_left && block_s_right) {
         float rolling_dc = 0.0f;
         for (int64_t blk = 0; blk < n_blocks; blk++) {
+            const float *bx   = weights + blk * QK_K;
             int          cidx = best_candidate[blk];
             float dm0 = gguf_fp16_to_fp32(candidate_d   [blk][cidx]);
             float mm0 = gguf_fp16_to_fp32(candidate_dmin[blk][cidx]);
+            /* ── DC bias (zeroth moment, existing) ── */
+            float dc_bias      = (DC_DECAY * rolling_dc) / (float)QK_K;
+            block_dc_bias[blk] = dc_bias;
+            /* ── Left boundary signal: boundary between block blk-1 and blk ──
+             *
+             * Proj_n̂(Grad(V)): cross-boundary finite difference g_left
+             * Quantize(I_boundary): dequantized value of bx[0] using Phase 1
+             *   seeds for the first sub-block (j=0) of block blk.
+             * s_left = g_left × Q(bx[0]) / d_sub² (dimensionless)
+             * Pre-scaled by HOLO_ALPHA × d_sub × (d_{blk-1}/d_{blk}) */
+            {
+                float g_left = (blk > 0)
+                    ? bx[0] - weights[(blk - 1) * QK_K + QK_K - 1]
+                    : 0.0f;
+                /* Quantize(I_boundary) for left edge: sub-block j=0 */
+                float d_sub_l = dm0 * (float)seeds[blk].Ls[0];
+                float m_sub_l = mm0 * (float)seeds[blk].Lm[0];
+                float q_val_l = 0.0f;
+                if (d_sub_l > 1e-15f) {
+                    int qi = gguf_nearest_int((bx[0] + m_sub_l) / d_sub_l);
+                    if (qi < 0) qi = 0; if (qi > 3) qi = 3;
+                    q_val_l = d_sub_l * (float)qi - m_sub_l;
+                }
+                /* Scale ratio Trans(Δτ): d_{blk-1} / d_{blk} */
+                float d_prev = (blk > 0 && seeds[blk-1].dm > 1e-15f)
+                               ? seeds[blk-1].dm : dm0;
+                float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
+                float scale_ratio_l = d_prev / d_curr;
+                if (scale_ratio_l < 0.1f) scale_ratio_l = 0.1f;
+                if (scale_ratio_l > 10.f) scale_ratio_l = 10.f;
+                /* Normalise s = (g × Q) / d² then re-scale to weight units */
+                float d2 = d_sub_l * d_sub_l;
+                float s  = (d2 > 1e-30f) ? (g_left * q_val_l / d2) : 0.0f;
+                block_s_left[blk] = HOLO_ALPHA * s * d_sub_l * scale_ratio_l;
+            }
+            /* ── Right boundary signal: boundary between block blk and blk+1 ──
+             *
+             * Same derivation but at the right edge (position QK_K-1,
+             * sub-block j = N_SUB-1) looking into block blk+1. */
+            {
+                float g_right = (blk + 1 < n_blocks)
+                    ? weights[(blk + 1) * QK_K] - bx[QK_K - 1]
+                    : 0.0f;
+                /* Quantize(I_boundary) for right edge: sub-block j=N_SUB-1 */
+                float d_sub_r = dm0 * (float)seeds[blk].Ls[N_SUB - 1];
+                float m_sub_r = mm0 * (float)seeds[blk].Lm[N_SUB - 1];
+                float q_val_r = 0.0f;
+                if (d_sub_r > 1e-15f) {
+                    int qi = gguf_nearest_int((bx[QK_K-1] + m_sub_r) / d_sub_r);
+                    if (qi < 0) qi = 0; if (qi > 3) qi = 3;
+                    q_val_r = d_sub_r * (float)qi - m_sub_r;
+                }
+                /* Scale ratio Trans(Δτ): d_{blk+1} / d_{blk} */
+                float d_next = (blk + 1 < n_blocks && seeds[blk+1].dm > 1e-15f)
+                               ? seeds[blk+1].dm : dm0;
+                float d_curr = (dm0 > 1e-15f) ? dm0 : 1.0f;
+                float scale_ratio_r = d_next / d_curr;
+                if (scale_ratio_r < 0.1f) scale_ratio_r = 0.1f;
+                if (scale_ratio_r > 10.f) scale_ratio_r = 10.f;
+                float d2 = d_sub_r * d_sub_r;
+                float s  = (d2 > 1e-30f) ? (g_right * q_val_r / d2) : 0.0f;
+                block_s_right[blk] = HOLO_ALPHA * s * d_sub_r * scale_ratio_r;
+            }
+            /* ── DC residual for the next block's rolling_dc ── */
             float dc_res = 0.0f;
             int   j, k;
             for (j = 0; j < N_SUB; j++) {
                 float d_sub = dm0 * (float)candidate_Ls[blk][cidx][j];
                 float m_sub = mm0 * (float)candidate_Lm[blk][cidx][j];
+                int   base  = 16 * j;
                 for (k = 0; k < 16; k++) {
+                    int   elem  = base + k;
+                    float x_adj = bx[elem] - dc_bias
+                                + block_s_left [blk] * boundary_decay    [elem]
+                                + block_s_right[blk] * boundary_decay_rev[elem];
                     int q = 0;
                     if (d_sub >= 1e-15f) {
                         q = gguf_nearest_int((x_adj + m_sub) / d_sub);
+                        if (q < 0) q = 0; if (q > 3) q = 3;
                     }
+                    float deq  = d_sub * (float)q - m_sub;
+                    dc_res    += bx[elem] - deq;   /* residual vs ORIGINAL weight */
                 }
             }
             rolling_dc = dc_res;
         int cidx = best_candidate[blk];
         uint8_t Ls_blk[16], Lm_blk[16];
+        /* ── Holographic boundary reconstruction (Phase 3.9 → Phase 4) ──
+         *
+         * W[b][k] = x[b][k]
+         *         − dc_bias[b]                             (DC error correction)
+         *         + s_left[b]  · exp(−k/τ)                (left  boundary signal)
+         *         + s_right[b] · exp(−(QK_K−1−k)/τ)       (right boundary signal)
+         *
+         * The two boundary signals decay inward from opposite edges and meet
+         * in the middle.  Together they are intended to encourage C¹ continuity
+         * across every block boundary in the quantized domain; they only bias
+         * the targets and do not impose a hard constraint. */
+        float dc_adj   = (block_dc_bias)  ? block_dc_bias [blk] : 0.0f;
+        float s_left   = (block_s_left)   ? block_s_left  [blk] : 0.0f;
+        float s_right  = (block_s_right)  ? block_s_right [blk] : 0.0f;
         float adj_block_x[QK_K];
         {
             int _i;
             for (_i = 0; _i < QK_K; _i++)
+                adj_block_x[_i] = block_x[_i]
+                                 - dc_adj
+                                 + s_left  * boundary_decay    [_i]
+                                 + s_right * boundary_decay_rev[_i];
         }
         memcpy(Ls_blk, candidate_Ls[blk][cidx], 16);
     free(_tl_graphs);
     free(block_dc_bias);
+    free(block_s_left);
+    free(block_s_right);
     free(seeds);
     free(candidate_errors);
     free(candidate_d);