CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed 21 days ago

Commit

f32b3c6

verified ·

1 Parent(s): 63a70a0

Q8_0 tied embeddings

Browse files

Files changed (1) hide show

hexstate_requantize.py +147 -18

hexstate_requantize.py CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
 and re-quantizes eligible weight tensors to Q2_K using numpy.
@@ -70,6 +70,18 @@ def _load_hexstate_lib():
             ctypes.c_int,                     # verbose
         ]
         # Q4_0 HPC quantizer (for attention tensors)
         if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
             lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
@@ -255,6 +267,7 @@ QK_K = 256
 GGML_TYPE_F32   = 0
 GGML_TYPE_F16   = 1
 GGML_TYPE_Q4_0  = 2
 GGML_TYPE_Q2_K  = 10
 GGML_TYPE_BF16  = 30
@@ -355,6 +368,32 @@ def f32_to_bf16(f32_array):
 # - Joint scale+min least-squares solve
 # - 16-step grid search for initial iscale
 def quantize_tensor_q2k(f32_data):
     """Quantize an entire tensor to Q2_K format.
@@ -616,9 +655,12 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
       - ffn_gate_inp — MoE routing gate
       - layer_output_scale — per-layer scaling factor (scalar)
       - altup, laurel — small Gemma-specific tensors
-      - token_embd.weight / output.weight when embeddings are tied
-        (the same tensor serves as both embedding lookup AND LM head;
-         quantizing it to Q2_K destroys logit precision → garbage output)
     """
     n_elements = 1
     for d in dims:
@@ -669,7 +711,8 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
 def main():
     if len(sys.argv) < 3:
-        print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [--keep-metadata]")
         sys.exit(1)
     input_path = sys.argv[1]
@@ -677,6 +720,7 @@ def main():
     keep_metadata = '--keep-metadata' in sys.argv
     quantize_none = '--quantize-none' in sys.argv
     q2all = '--q2all' in sys.argv
     # Check for imatrix
     imatrix_data = None
@@ -697,6 +741,8 @@ def main():
     print("  ╔════════════════════════════════════════════════════════════════╗")
     print("  ║  HExState GGUF Re-Quantizer                                  ║")
     print("  ║  GGUF → Q2_K GGUF with metadata passthrough                  ║")
     if use_hpc and imatrix_data:
         print("  ║  Engine: HPC + iMatrix (calibrated sensitivity propagation)  ║")
     elif use_hpc:
@@ -770,7 +816,11 @@ def main():
         has_output_weight = 'output.weight' in tensor_names
         tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
         if tied_embeddings:
-            print("  ⚠ Tied embeddings detected — token_embd.weight promoted to Q4_0 (serves as LM head)")
             print()
         # ── Determine output types ──
@@ -778,15 +828,26 @@ def main():
         total_quant = 0
         total_attn = 0
         total_keep = 0
         for ti in tensor_infos:
             if quantize_none:
                 will_quant = False
             elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
-                if tied_embeddings and ti['name'] in ('token_embd.weight', 'output.weight'):
-                    will_quant = 'ATTN_Q4'  # Promote tied embedding to Q4_0
-                    total_attn += 1
-                elif q2all:
-                    will_quant = True  # --q2all: everything to Q2_K
                     total_quant += 1
                 elif is_attention_tensor(ti['name']):
                     will_quant = 'ATTN_Q4'  # Promote attention to Q4_0 HPC
@@ -799,9 +860,15 @@ def main():
                 total_keep += 1
             quant_plan.append(will_quant)
-        print(f"  Tensors to quantize (Q2_K):     {total_quant}")
-        print(f"  Tensors to promote (Q4_0·HPC):  {total_attn}")
-        print(f"  Tensors to keep as-is:          {total_keep}")
         print()
         # ── Compute output tensor sizes and offsets ──
@@ -813,17 +880,27 @@ def main():
                 out_dims = list(ti['dims'])
                 dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
-                if quant_plan[i] == 'ATTN_Q4':
                     # Attention tensor → Q4_0 HPC (4.5 bpw)
                     out_type = GGML_TYPE_Q4_0
                     n_blocks = (ti['n_elements'] + 31) // 32
                     out_size = n_blocks * 18
                     print(f"  [ATTN→Q4_0·HPC] {ti['name']} ({ti['n_elements']} elements)")
-                elif dim0 % QK_K == 0:
                     # Q2_K (2.6 bpw, block_size=256)
                     out_type = GGML_TYPE_Q2_K
                     n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
                     out_size = n_blocks * 84
                 elif dim0 % 32 == 0:
                     # Q4_0 fallback (4.5 bpw, block_size=32)
                     out_type = GGML_TYPE_Q4_0
@@ -999,7 +1076,59 @@ def main():
                 fin.seek(abs_offset)
                 raw_data = fin.read(ti['data_size'])
-                if quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
                     # ── Q4_0 quantization (fallback or attention HPC) ──
                     if ti['type'] == GGML_TYPE_BF16:
                         f32 = bf16_to_f32(raw_data, ti['n_elements'])
@@ -1206,4 +1335,4 @@ def main():
 if __name__ == '__main__':
-    main()

 #!/usr/bin/env python3
 """
+HexState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
 and re-quantizes eligible weight tensors to Q2_K using numpy.
             ctypes.c_int,                     # verbose
         ]
+        # Q8_0 HPC quantizer (Shor pipeline; tied embeddings / LM head)
+        if hasattr(lib, 'hexstate_quantize_tensor_q8_0_hpc'):
+            lib.hexstate_quantize_tensor_q8_0_hpc.restype = None
+            lib.hexstate_quantize_tensor_q8_0_hpc.argtypes = [
+                ctypes.POINTER(ctypes.c_float),  # weights
+                ctypes.c_int64,                   # n_elements
+                ctypes.c_void_p,                  # output
+                ctypes.POINTER(ctypes.c_float),   # out_error
+                ctypes.POINTER(ctypes.c_float),   # imat_importance (can be NULL)
+                ctypes.c_int,                     # verbose
+            ]
         # Q4_0 HPC quantizer (for attention tensors)
         if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
             lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
 GGML_TYPE_F32   = 0
 GGML_TYPE_F16   = 1
 GGML_TYPE_Q4_0  = 2
+GGML_TYPE_Q8_0  = 8
 GGML_TYPE_Q2_K  = 10
 GGML_TYPE_BF16  = 30
 # - Joint scale+min least-squares solve
 # - 16-step grid search for initial iscale
+def quantize_tensor_q8_0(f32_data):
+    """Quantize a float array to Q8_0 blocks with NumPy (fallback when the HPC lib is absent).
+
+    Block: 32 weights -> fp16 d + 32 x int8 = 34 bytes; y = q * d.
+    d = amax/127 (float), q = round(x/d), d stored as little-endian fp16.
+    Intended to mirror ggml quantize_row_q8_0_ref.
+    NOTE(review): np.rint rounds ties to even; if the ggml reference rounds
+    ties away from zero, exact .5 quotients could differ by one step -- confirm.
+
+    Args:
+        f32_data: array of weights (assumed 1-D: the padding path
+            concatenates a 1-D zero vector). Zero-padded up to the next
+            multiple of 32 when its length is not already one.
+
+    Returns:
+        (bytes, n_blocks, sse): the packed 34-byte blocks, the number of
+        blocks, and the summed squared error between the (padded) input
+        and its dequantized reconstruction.
+    """
+    n = len(f32_data)
+    if n % 32 != 0:
+        # Pad with zeros so the data splits into whole 32-weight blocks.
+        f32_data = np.concatenate(
+            [f32_data, np.zeros(32 - n % 32, dtype=np.float32)])
+        n = len(f32_data)
+    blocks = f32_data.reshape(-1, 32).astype(np.float32)
+    nb = blocks.shape[0]
+    # Per-block scale: largest magnitude maps to +/-127.
+    amax = np.max(np.abs(blocks), axis=1)
+    d = amax / 127.0
+    # Inverse scale; all-zero blocks (d == 0) get 0. The inner np.where
+    # substitutes 1.0 so the division never sees a zero denominator.
+    id_ = np.where(d > 0, 1.0 / np.where(d > 0, d, 1.0), 0.0)
+    qs = np.clip(np.rint(blocks * id_[:, None]), -127, 127).astype(np.int8)
+    d16 = d.astype('<f2')
+    # Pack each block as [2 bytes fp16 scale][32 bytes int8 quants].
+    out = np.zeros((nb, 34), dtype=np.uint8)
+    out[:, 0:2] = d16.view(np.uint8).reshape(nb, 2)
+    out[:, 2:]  = qs.view(np.uint8)
+    # Dequantize with the fp16-rounded scale (the value actually stored)
+    # so the reported error includes the scale's rounding.
+    deq = qs.astype(np.float32) * d16.astype(np.float32)[:, None]
+    sse = float(np.sum((blocks - deq) ** 2))
+    return out.tobytes(), nb, sse
 def quantize_tensor_q2k(f32_data):
     """Quantize an entire tensor to Q2_K format.
       - ffn_gate_inp — MoE routing gate
       - layer_output_scale — per-layer scaling factor (scalar)
       - altup, laurel — small Gemma-specific tensors
+      - token_embd.weight / output.weight — always excluded here.
+        When embeddings are TIED, main() routes token_embd.weight to
+        Q8_0 (HPC Shor pipeline) instead: the same tensor serves as both
+        embedding lookup AND LM head, and Q2_K/Q4_0 there destroys logit
+        precision → looping / repetitive generation. --keep-embd keeps
+        it at source precision instead.
     """
     n_elements = 1
     for d in dims:
 def main():
     if len(sys.argv) < 3:
+        print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf>"
+              " [--keep-metadata] [--imatrix FILE] [--keep-embd] [--q2all]")
         sys.exit(1)
     input_path = sys.argv[1]
     keep_metadata = '--keep-metadata' in sys.argv
     quantize_none = '--quantize-none' in sys.argv
     q2all = '--q2all' in sys.argv
+    keep_embd = '--keep-embd' in sys.argv   # keep tied embedding at source precision instead of Q8_0
     # Check for imatrix
     imatrix_data = None
     print("  ╔════════════════════════════════════════════════════════════════╗")
     print("  ║  HExState GGUF Re-Quantizer                                  ║")
     print("  ║  GGUF → Q2_K GGUF with metadata passthrough                  ║")
+    if q2all:
+        print("  ║  Mode: --q2all  ALL eligible tensors → Q2_K (test mode)      ║")
     if use_hpc and imatrix_data:
         print("  ║  Engine: HPC + iMatrix (calibrated sensitivity propagation)  ║")
     elif use_hpc:
         has_output_weight = 'output.weight' in tensor_names
         tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
         if tied_embeddings:
+            if keep_embd:
+                print("  ⚠ Tied embeddings detected — token_embd.weight kept at source precision (--keep-embd)")
+            else:
+                print("  ⚠ Tied embeddings detected — token_embd.weight → Q8_0 via Shor pipeline (serves as LM head;")
+                print("    Q2_K/Q4_0 here destroys logit precision — classic looping-output symptom)")
             print()
         # ── Determine output types ──
         total_quant = 0
         total_attn = 0
         total_keep = 0
+        total_embd = 0
         for ti in tensor_infos:
             if quantize_none:
                 will_quant = False
+            elif (tied_embeddings and ti['name'] == 'token_embd.weight'
+                  and not keep_embd and ti['n_elements'] % 32 == 0):
+                # Tied embedding doubles as the LM head. NOTE: the old
+                # 'promote to Q4_0' branch below should_quantize() was dead
+                # code (should_quantize always returned False for
+                # token_embd), so the tensor was silently kept at F16/BF16.
+                # Now: Q8_0 (8.5 bpw, ~2x smaller than F16) via the HPC
+                # Shor pipeline — transparent for both embedding lookup
+                # and logit projection.
+                will_quant = 'EMBD_Q8'
+                total_embd += 1
             elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
+                if q2all:
+                    # --q2all: ALL eligible tensors → Q2_K, no exceptions
+                    # (tied embedding stays on the Q8_0 route above).
+                    will_quant = True
                     total_quant += 1
                 elif is_attention_tensor(ti['name']):
                     will_quant = 'ATTN_Q4'  # Promote attention to Q4_0 HPC
                 total_keep += 1
             quant_plan.append(will_quant)
+        if q2all:
+            print(f"  Mode: --q2all — all eligible tensors forced to Q2_K")
+            print(f"  Tensors to quantize (Q2_K):     {total_quant}")
+            print(f"  Tensors to keep as-is:          {total_keep}")
+        else:
+            print(f"  Tensors to quantize (Q2_K):     {total_quant}")
+            print(f"  Tensors to promote (Q4_0·HPC):  {total_attn}")
+            print(f"  Tied embd → Q8_0 (Shor·HPC):    {total_embd}")
+            print(f"  Tensors to keep as-is:          {total_keep}")
         print()
         # ── Compute output tensor sizes and offsets ──
                 out_dims = list(ti['dims'])
                 dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
+                if quant_plan[i] == 'EMBD_Q8':
+                    # Tied embedding / LM head → Q8_0 (8.5 bpw, 34 B / 32 w)
+                    out_type = GGML_TYPE_Q8_0
+                    n_blocks = ti['n_elements'] // 32
+                    out_size = n_blocks * 34
+                    print(f"  [EMBD→Q8_0·Shor] {ti['name']} ({ti['n_elements']:,} elements)")
+                elif quant_plan[i] == 'ATTN_Q4':
                     # Attention tensor → Q4_0 HPC (4.5 bpw)
                     out_type = GGML_TYPE_Q4_0
                     n_blocks = (ti['n_elements'] + 31) // 32
                     out_size = n_blocks * 18
                     print(f"  [ATTN→Q4_0·HPC] {ti['name']} ({ti['n_elements']} elements)")
+                elif dim0 % QK_K == 0 or q2all:
                     # Q2_K (2.6 bpw, block_size=256)
+                    # --q2all forces Q2_K even when dim0 isn't a clean multiple;
+                    # the quantizer pads internally to the next QK_K boundary.
                     out_type = GGML_TYPE_Q2_K
                     n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
                     out_size = n_blocks * 84
+                    if q2all and dim0 % QK_K != 0:
+                        print(f"  [Q2_K·PADDED] {ti['name']} (dim0={dim0}, padded to QK_K boundary)")
                 elif dim0 % 32 == 0:
                     # Q4_0 fallback (4.5 bpw, block_size=32)
                     out_type = GGML_TYPE_Q4_0
                 fin.seek(abs_offset)
                 raw_data = fin.read(ti['data_size'])
+                if quant_plan[i] == 'EMBD_Q8':
+                    # ── Tied embedding → Q8_0 via the HPC Shor pipeline ──
+                    if ti['type'] == GGML_TYPE_BF16:
+                        f32 = bf16_to_f32(raw_data, ti['n_elements'])
+                    elif ti['type'] == GGML_TYPE_F16:
+                        f32 = f16_to_f32(raw_data, ti['n_elements'])
+                    elif ti['type'] == GGML_TYPE_F32:
+                        f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
+                    else:
+                        # Can't re-quantize from quantized source — keep
+                        fout.write(raw_data)
+                        pad = align_offset(fout.tell()) - fout.tell()
+                        if pad > 0: fout.write(b'\x00' * pad)
+                        continue
+                    n_el = ti['n_elements']
+                    n_blocks_q8 = n_el // 32
+                    if use_hpc and hasattr(_HEXSTATE_LIB, 'hexstate_quantize_tensor_q8_0_hpc'):
+                        output_buf = np.zeros(n_blocks_q8 * 34, dtype=np.uint8)
+                        error = ctypes.c_float(0.0)
+                        f32_c = np.ascontiguousarray(f32, dtype=np.float32)
+                        imat_ptr = None
+                        if imatrix_data and ti['name'] in imatrix_data:
+                            iw = imatrix_data[ti['name']]
+                            n_cols = iw.shape[0]
+                            n_rows = n_el // n_cols if n_cols > 0 else 1
+                            imat_full = np.tile(iw, n_rows)[:n_el].astype(np.float32)
+                            imat_c = np.ascontiguousarray(imat_full)
+                            imat_ptr = imat_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
+                        _HEXSTATE_LIB.hexstate_quantize_tensor_q8_0_hpc(
+                            f32_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
+                            ctypes.c_int64(n_el),
+                            output_buf.ctypes.data_as(ctypes.c_void_p),
+                            ctypes.byref(error),
+                            imat_ptr,
+                            ctypes.c_int(0),
+                        )
+                        fout.write(output_buf.tobytes())
+                        rmse8 = float(np.sqrt(error.value / max(n_el, 1)))
+                        print(f"\n  [Q8_0·Shor] {ti['name']} RMSE={rmse8:.6e}")
+                    else:
+                        q8_bytes, n_blocks_q8, sse8 = quantize_tensor_q8_0(f32)
+                        fout.write(q8_bytes)
+                        rmse8 = float(np.sqrt(sse8 / max(n_el, 1)))
+                        print(f"\n  [Q8_0] {ti['name']} RMSE={rmse8:.6e} (numpy fallback)")
+                    quant_count += 1
+                    total_quant_bytes += n_blocks_q8 * 34
+                elif quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
                     # ── Q4_0 quantization (fallback or attention HPC) ──
                     if ti['type'] == GGML_TYPE_BF16:
                         f32 = bf16_to_f32(raw_data, ti['n_elements'])
 if __name__ == '__main__':
+    main()