CompressedGemma committed on
Commit
f32b3c6
·
verified ·
1 Parent(s): 63a70a0

Q8_0 tied embeddings

Browse files
Files changed (1) hide show
  1. hexstate_requantize.py +147 -18
hexstate_requantize.py CHANGED
@@ -1,6 +1,6 @@
1
  #!/usr/bin/env python3
2
  """
3
- HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
4
 
5
  Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
6
  and re-quantizes eligible weight tensors to Q2_K using numpy.
@@ -70,6 +70,18 @@ def _load_hexstate_lib():
70
  ctypes.c_int, # verbose
71
  ]
72
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  # Q4_0 HPC quantizer (for attention tensors)
74
  if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
75
  lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
@@ -255,6 +267,7 @@ QK_K = 256
255
  GGML_TYPE_F32 = 0
256
  GGML_TYPE_F16 = 1
257
  GGML_TYPE_Q4_0 = 2
 
258
  GGML_TYPE_Q2_K = 10
259
  GGML_TYPE_BF16 = 30
260
 
@@ -355,6 +368,32 @@ def f32_to_bf16(f32_array):
355
  # - Joint scale+min least-squares solve
356
  # - 16-step grid search for initial iscale
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  def quantize_tensor_q2k(f32_data):
359
  """Quantize an entire tensor to Q2_K format.
360
 
@@ -616,9 +655,12 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
616
  - ffn_gate_inp — MoE routing gate
617
  - layer_output_scale — per-layer scaling factor (scalar)
618
  - altup, laurel — small Gemma-specific tensors
619
- - token_embd.weight / output.weight when embeddings are tied
620
- (the same tensor serves as both embedding lookup AND LM head;
621
- quantizing it to Q2_K destroys logit precision → garbage output)
 
 
 
622
  """
623
  n_elements = 1
624
  for d in dims:
@@ -669,7 +711,8 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
669
 
670
  def main():
671
  if len(sys.argv) < 3:
672
- print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf> [--keep-metadata]")
 
673
  sys.exit(1)
674
 
675
  input_path = sys.argv[1]
@@ -677,6 +720,7 @@ def main():
677
  keep_metadata = '--keep-metadata' in sys.argv
678
  quantize_none = '--quantize-none' in sys.argv
679
  q2all = '--q2all' in sys.argv
 
680
 
681
  # Check for imatrix
682
  imatrix_data = None
@@ -697,6 +741,8 @@ def main():
697
  print(" ╔════════════════════════════════════════════════════════════════╗")
698
  print(" ║ HExState GGUF Re-Quantizer ║")
699
  print(" ║ GGUF → Q2_K GGUF with metadata passthrough ║")
 
 
700
  if use_hpc and imatrix_data:
701
  print(" ║ Engine: HPC + iMatrix (calibrated sensitivity propagation) ║")
702
  elif use_hpc:
@@ -770,7 +816,11 @@ def main():
770
  has_output_weight = 'output.weight' in tensor_names
771
  tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
772
  if tied_embeddings:
773
- print(" ⚠ Tied embeddings detected — token_embd.weight promoted to Q4_0 (serves as LM head)")
 
 
 
 
774
  print()
775
 
776
  # ── Determine output types ──
@@ -778,15 +828,26 @@ def main():
778
  total_quant = 0
779
  total_attn = 0
780
  total_keep = 0
 
781
  for ti in tensor_infos:
782
  if quantize_none:
783
  will_quant = False
 
 
 
 
 
 
 
 
 
 
 
784
  elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
785
- if tied_embeddings and ti['name'] in ('token_embd.weight', 'output.weight'):
786
- will_quant = 'ATTN_Q4' # Promote tied embedding to Q4_0
787
- total_attn += 1
788
- elif q2all:
789
- will_quant = True # --q2all: everything to Q2_K
790
  total_quant += 1
791
  elif is_attention_tensor(ti['name']):
792
  will_quant = 'ATTN_Q4' # Promote attention to Q4_0 HPC
@@ -799,9 +860,15 @@ def main():
799
  total_keep += 1
800
  quant_plan.append(will_quant)
801
 
802
- print(f" Tensors to quantize (Q2_K): {total_quant}")
803
- print(f" Tensors to promote (Q4_0·HPC): {total_attn}")
804
- print(f" Tensors to keep as-is: {total_keep}")
 
 
 
 
 
 
805
  print()
806
 
807
  # ── Compute output tensor sizes and offsets ──
@@ -813,17 +880,27 @@ def main():
813
  out_dims = list(ti['dims'])
814
  dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
815
 
816
- if quant_plan[i] == 'ATTN_Q4':
 
 
 
 
 
 
817
  # Attention tensor → Q4_0 HPC (4.5 bpw)
818
  out_type = GGML_TYPE_Q4_0
819
  n_blocks = (ti['n_elements'] + 31) // 32
820
  out_size = n_blocks * 18
821
  print(f" [ATTN→Q4_0·HPC] {ti['name']} ({ti['n_elements']} elements)")
822
- elif dim0 % QK_K == 0:
823
  # Q2_K (2.6 bpw, block_size=256)
 
 
824
  out_type = GGML_TYPE_Q2_K
825
  n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
826
  out_size = n_blocks * 84
 
 
827
  elif dim0 % 32 == 0:
828
  # Q4_0 fallback (4.5 bpw, block_size=32)
829
  out_type = GGML_TYPE_Q4_0
@@ -999,7 +1076,59 @@ def main():
999
  fin.seek(abs_offset)
1000
  raw_data = fin.read(ti['data_size'])
1001
 
1002
- if quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1003
  # ── Q4_0 quantization (fallback or attention HPC) ──
1004
  if ti['type'] == GGML_TYPE_BF16:
1005
  f32 = bf16_to_f32(raw_data, ti['n_elements'])
@@ -1206,4 +1335,4 @@ def main():
1206
 
1207
 
1208
  if __name__ == '__main__':
1209
- main()
 
1
  #!/usr/bin/env python3
2
  """
3
+ HexState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
4
 
5
  Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
6
  and re-quantizes eligible weight tensors to Q2_K using numpy.
 
70
  ctypes.c_int, # verbose
71
  ]
72
 
73
+ # Q8_0 HPC quantizer (Shor pipeline; tied embeddings / LM head)
74
+ if hasattr(lib, 'hexstate_quantize_tensor_q8_0_hpc'):
75
+ lib.hexstate_quantize_tensor_q8_0_hpc.restype = None
76
+ lib.hexstate_quantize_tensor_q8_0_hpc.argtypes = [
77
+ ctypes.POINTER(ctypes.c_float), # weights
78
+ ctypes.c_int64, # n_elements
79
+ ctypes.c_void_p, # output
80
+ ctypes.POINTER(ctypes.c_float), # out_error
81
+ ctypes.POINTER(ctypes.c_float), # imat_importance (can be NULL)
82
+ ctypes.c_int, # verbose
83
+ ]
84
+
85
  # Q4_0 HPC quantizer (for attention tensors)
86
  if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
87
  lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
 
267
  GGML_TYPE_F32 = 0
268
  GGML_TYPE_F16 = 1
269
  GGML_TYPE_Q4_0 = 2
270
+ GGML_TYPE_Q8_0 = 8
271
  GGML_TYPE_Q2_K = 10
272
  GGML_TYPE_BF16 = 30
273
 
 
368
  # - Joint scale+min least-squares solve
369
  # - 16-step grid search for initial iscale
370
 
371
def quantize_tensor_q8_0(f32_data):
    """Quantize a tensor to Q8_0 blocks with numpy (fallback when the HPC lib is absent).

    Block layout: 32 weights -> one little-endian fp16 scale ``d`` followed by
    32 int8 quants = 34 bytes; dequantized value is ``q * d``.
    Per block, ``d = amax / 127`` and ``q = round(x / d)``; an all-zero block
    gets ``d = 0`` and all-zero quants.

    Args:
        f32_data: array-like of weights. Any shape is accepted; it is
            flattened in C order and converted to float32. If the element
            count is not a multiple of 32 the tail is zero-padded.

    Returns:
        Tuple ``(data, n_blocks, sse)``:
            data     -- ``bytes`` of length ``n_blocks * 34``
            n_blocks -- number of 32-weight blocks written (including padding)
            sse      -- sum of squared errors between the (padded) input and
                        its dequantized reconstruction, as a Python float
    """
    # Flatten first: len() on a multi-dimensional array only reports the
    # first axis, which made the padding logic wrong (and made
    # np.concatenate raise) for 2-D tensors.
    flat = np.asarray(f32_data, dtype=np.float32).ravel()

    remainder = flat.size % 32
    if remainder:
        flat = np.concatenate(
            [flat, np.zeros(32 - remainder, dtype=np.float32)])

    nb = flat.size // 32
    blocks = flat.reshape(nb, 32)

    # Per-block scale from the largest magnitude in the block.
    amax = np.max(np.abs(blocks), axis=1)
    d = amax / 127.0
    # Inverse scale, guarded so all-zero blocks do not divide by zero.
    id_ = np.where(d > 0, 1.0 / np.where(d > 0, d, 1.0), 0.0)

    # NOTE(review): np.rint rounds exact .5 ties to even; ggml's reference
    # quantizer is believed to use roundf (ties away from zero) -- confirm
    # if bit-exact parity with ggml matters.
    qs = np.clip(np.rint(blocks * id_[:, None]), -127, 127).astype(np.int8)

    # The scale is stored as little-endian fp16; the error below is measured
    # against that stored (rounded) scale, not the float32 one.
    d16 = d.astype('<f2')

    out = np.zeros((nb, 34), dtype=np.uint8)
    out[:, 0:2] = d16.view(np.uint8).reshape(nb, 2)
    out[:, 2:] = qs.view(np.uint8)

    deq = qs.astype(np.float32) * d16.astype(np.float32)[:, None]
    sse = float(np.sum((blocks - deq) ** 2))
    return out.tobytes(), nb, sse
395
+
396
+
397
  def quantize_tensor_q2k(f32_data):
398
  """Quantize an entire tensor to Q2_K format.
399
 
 
655
  - ffn_gate_inp — MoE routing gate
656
  - layer_output_scale — per-layer scaling factor (scalar)
657
  - altup, laurel — small Gemma-specific tensors
658
+ - token_embd.weight / output.weight — always excluded here.
659
+ When embeddings are TIED, main() routes token_embd.weight to
660
+ Q8_0 (HPC Shor pipeline) instead: the same tensor serves as both
661
+ embedding lookup AND LM head, and Q2_K/Q4_0 there destroys logit
662
+ precision → looping / repetitive generation. --keep-embd keeps
663
+ it at source precision instead.
664
  """
665
  n_elements = 1
666
  for d in dims:
 
711
 
712
  def main():
713
  if len(sys.argv) < 3:
714
+ print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf>"
715
+ " [--keep-metadata] [--imatrix FILE] [--keep-embd] [--q2all]")
716
  sys.exit(1)
717
 
718
  input_path = sys.argv[1]
 
720
  keep_metadata = '--keep-metadata' in sys.argv
721
  quantize_none = '--quantize-none' in sys.argv
722
  q2all = '--q2all' in sys.argv
723
+ keep_embd = '--keep-embd' in sys.argv # keep tied embedding at source precision instead of Q8_0
724
 
725
  # Check for imatrix
726
  imatrix_data = None
 
741
  print(" ╔════════════════════════════════════════════════════════════════╗")
742
  print(" ║ HExState GGUF Re-Quantizer ║")
743
  print(" ║ GGUF → Q2_K GGUF with metadata passthrough ║")
744
+ if q2all:
745
+ print(" ║ Mode: --q2all ALL eligible tensors → Q2_K (test mode) ║")
746
  if use_hpc and imatrix_data:
747
  print(" ║ Engine: HPC + iMatrix (calibrated sensitivity propagation) ║")
748
  elif use_hpc:
 
816
  has_output_weight = 'output.weight' in tensor_names
817
  tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
818
  if tied_embeddings:
819
+ if keep_embd:
820
+ print(" ⚠ Tied embeddings detected — token_embd.weight kept at source precision (--keep-embd)")
821
+ else:
822
+ print(" ⚠ Tied embeddings detected — token_embd.weight → Q8_0 via Shor pipeline (serves as LM head;")
823
+ print(" Q2_K/Q4_0 here destroys logit precision — classic looping-output symptom)")
824
  print()
825
 
826
  # ── Determine output types ──
 
828
  total_quant = 0
829
  total_attn = 0
830
  total_keep = 0
831
+ total_embd = 0
832
  for ti in tensor_infos:
833
  if quantize_none:
834
  will_quant = False
835
+ elif (tied_embeddings and ti['name'] == 'token_embd.weight'
836
+ and not keep_embd and ti['n_elements'] % 32 == 0):
837
+ # Tied embedding doubles as the LM head. NOTE: the old
838
+ # 'promote to Q4_0' branch below should_quantize() was dead
839
+ # code (should_quantize always returned False for
840
+ # token_embd), so the tensor was silently kept at F16/BF16.
841
+ # Now: Q8_0 (8.5 bpw, ~2x smaller than F16) via the HPC
842
+ # Shor pipeline β€” transparent for both embedding lookup
843
+ # and logit projection.
844
+ will_quant = 'EMBD_Q8'
845
+ total_embd += 1
846
  elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
847
+ if q2all:
848
+ # --q2all: ALL eligible tensors → Q2_K, no exceptions
849
+ # (tied embedding stays on the Q8_0 route above).
850
+ will_quant = True
 
851
  total_quant += 1
852
  elif is_attention_tensor(ti['name']):
853
  will_quant = 'ATTN_Q4' # Promote attention to Q4_0 HPC
 
860
  total_keep += 1
861
  quant_plan.append(will_quant)
862
 
863
+ if q2all:
864
+ print(f" Mode: --q2all — all eligible tensors forced to Q2_K")
865
+ print(f" Tensors to quantize (Q2_K): {total_quant}")
866
+ print(f" Tensors to keep as-is: {total_keep}")
867
+ else:
868
+ print(f" Tensors to quantize (Q2_K): {total_quant}")
869
+ print(f" Tensors to promote (Q4_0·HPC): {total_attn}")
870
+ print(f" Tied embd → Q8_0 (Shor·HPC): {total_embd}")
871
+ print(f" Tensors to keep as-is: {total_keep}")
872
  print()
873
 
874
  # ── Compute output tensor sizes and offsets ──
 
880
  out_dims = list(ti['dims'])
881
  dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
882
 
883
+ if quant_plan[i] == 'EMBD_Q8':
884
+ # Tied embedding / LM head → Q8_0 (8.5 bpw, 34 B / 32 w)
885
+ out_type = GGML_TYPE_Q8_0
886
+ n_blocks = ti['n_elements'] // 32
887
+ out_size = n_blocks * 34
888
+ print(f" [EMBD→Q8_0·Shor] {ti['name']} ({ti['n_elements']:,} elements)")
889
+ elif quant_plan[i] == 'ATTN_Q4':
890
  # Attention tensor → Q4_0 HPC (4.5 bpw)
891
  out_type = GGML_TYPE_Q4_0
892
  n_blocks = (ti['n_elements'] + 31) // 32
893
  out_size = n_blocks * 18
894
  print(f" [ATTN→Q4_0·HPC] {ti['name']} ({ti['n_elements']} elements)")
895
+ elif dim0 % QK_K == 0 or q2all:
896
  # Q2_K (2.6 bpw, block_size=256)
897
+ # --q2all forces Q2_K even when dim0 isn't a clean multiple;
898
+ # the quantizer pads internally to the next QK_K boundary.
899
  out_type = GGML_TYPE_Q2_K
900
  n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
901
  out_size = n_blocks * 84
902
+ if q2all and dim0 % QK_K != 0:
903
+ print(f" [Q2_K·PADDED] {ti['name']} (dim0={dim0}, padded to QK_K boundary)")
904
  elif dim0 % 32 == 0:
905
  # Q4_0 fallback (4.5 bpw, block_size=32)
906
  out_type = GGML_TYPE_Q4_0
 
1076
  fin.seek(abs_offset)
1077
  raw_data = fin.read(ti['data_size'])
1078
 
1079
+ if quant_plan[i] == 'EMBD_Q8':
1080
+ # ── Tied embedding → Q8_0 via the HPC Shor pipeline ──
1081
+ if ti['type'] == GGML_TYPE_BF16:
1082
+ f32 = bf16_to_f32(raw_data, ti['n_elements'])
1083
+ elif ti['type'] == GGML_TYPE_F16:
1084
+ f32 = f16_to_f32(raw_data, ti['n_elements'])
1085
+ elif ti['type'] == GGML_TYPE_F32:
1086
+ f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
1087
+ else:
1088
+ # Can't re-quantize from quantized source — keep
1089
+ fout.write(raw_data)
1090
+ pad = align_offset(fout.tell()) - fout.tell()
1091
+ if pad > 0: fout.write(b'\x00' * pad)
1092
+ continue
1093
+
1094
+ n_el = ti['n_elements']
1095
+ n_blocks_q8 = n_el // 32
1096
+
1097
+ if use_hpc and hasattr(_HEXSTATE_LIB, 'hexstate_quantize_tensor_q8_0_hpc'):
1098
+ output_buf = np.zeros(n_blocks_q8 * 34, dtype=np.uint8)
1099
+ error = ctypes.c_float(0.0)
1100
+ f32_c = np.ascontiguousarray(f32, dtype=np.float32)
1101
+
1102
+ imat_ptr = None
1103
+ if imatrix_data and ti['name'] in imatrix_data:
1104
+ iw = imatrix_data[ti['name']]
1105
+ n_cols = iw.shape[0]
1106
+ n_rows = n_el // n_cols if n_cols > 0 else 1
1107
+ imat_full = np.tile(iw, n_rows)[:n_el].astype(np.float32)
1108
+ imat_c = np.ascontiguousarray(imat_full)
1109
+ imat_ptr = imat_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
1110
+
1111
+ _HEXSTATE_LIB.hexstate_quantize_tensor_q8_0_hpc(
1112
+ f32_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
1113
+ ctypes.c_int64(n_el),
1114
+ output_buf.ctypes.data_as(ctypes.c_void_p),
1115
+ ctypes.byref(error),
1116
+ imat_ptr,
1117
+ ctypes.c_int(0),
1118
+ )
1119
+ fout.write(output_buf.tobytes())
1120
+ rmse8 = float(np.sqrt(error.value / max(n_el, 1)))
1121
+ print(f"\n [Q8_0·Shor] {ti['name']} RMSE={rmse8:.6e}")
1122
+ else:
1123
+ q8_bytes, n_blocks_q8, sse8 = quantize_tensor_q8_0(f32)
1124
+ fout.write(q8_bytes)
1125
+ rmse8 = float(np.sqrt(sse8 / max(n_el, 1)))
1126
+ print(f"\n [Q8_0] {ti['name']} RMSE={rmse8:.6e} (numpy fallback)")
1127
+
1128
+ quant_count += 1
1129
+ total_quant_bytes += n_blocks_q8 * 34
1130
+
1131
+ elif quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
1132
  # ── Q4_0 quantization (fallback or attention HPC) ──
1133
  if ti['type'] == GGML_TYPE_BF16:
1134
  f32 = bf16_to_f32(raw_data, ti['n_elements'])
 
1335
 
1336
 
1337
  if __name__ == '__main__':
1338
+ main()