Q8_0 tied embeddings
Browse files- hexstate_requantize.py +147 -18
hexstate_requantize.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
|
| 5 |
Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
|
| 6 |
and re-quantizes eligible weight tensors to Q2_K using numpy.
|
|
@@ -70,6 +70,18 @@ def _load_hexstate_lib():
|
|
| 70 |
ctypes.c_int, # verbose
|
| 71 |
]
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
# Q4_0 HPC quantizer (for attention tensors)
|
| 74 |
if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
|
| 75 |
lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
|
|
@@ -255,6 +267,7 @@ QK_K = 256
|
|
| 255 |
GGML_TYPE_F32 = 0
|
| 256 |
GGML_TYPE_F16 = 1
|
| 257 |
GGML_TYPE_Q4_0 = 2
|
|
|
|
| 258 |
GGML_TYPE_Q2_K = 10
|
| 259 |
GGML_TYPE_BF16 = 30
|
| 260 |
|
|
@@ -355,6 +368,32 @@ def f32_to_bf16(f32_array):
|
|
| 355 |
# - Joint scale+min least-squares solve
|
| 356 |
# - 16-step grid search for initial iscale
|
| 357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
def quantize_tensor_q2k(f32_data):
|
| 359 |
"""Quantize an entire tensor to Q2_K format.
|
| 360 |
|
|
@@ -616,9 +655,12 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
|
|
| 616 |
- ffn_gate_inp — MoE routing gate
|
| 617 |
- layer_output_scale — per-layer scaling factor (scalar)
|
| 618 |
- altup, laurel — small Gemma-specific tensors
|
| 619 |
-
- token_embd.weight / output.weight
|
| 620 |
-
|
| 621 |
-
|
|
|
|
|
|
|
|
|
|
| 622 |
"""
|
| 623 |
n_elements = 1
|
| 624 |
for d in dims:
|
|
@@ -669,7 +711,8 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
|
|
| 669 |
|
| 670 |
def main():
|
| 671 |
if len(sys.argv) < 3:
|
| 672 |
-
print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf>
|
|
|
|
| 673 |
sys.exit(1)
|
| 674 |
|
| 675 |
input_path = sys.argv[1]
|
|
@@ -677,6 +720,7 @@ def main():
|
|
| 677 |
keep_metadata = '--keep-metadata' in sys.argv
|
| 678 |
quantize_none = '--quantize-none' in sys.argv
|
| 679 |
q2all = '--q2all' in sys.argv
|
|
|
|
| 680 |
|
| 681 |
# Check for imatrix
|
| 682 |
imatrix_data = None
|
|
@@ -697,6 +741,8 @@ def main():
|
|
| 697 |
print(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
|
| 698 |
print(" β HExState GGUF Re-Quantizer β")
|
| 699 |
print(" β GGUF β Q2_K GGUF with metadata passthrough β")
|
|
|
|
|
|
|
| 700 |
if use_hpc and imatrix_data:
|
| 701 |
print(" β Engine: HPC + iMatrix (calibrated sensitivity propagation) β")
|
| 702 |
elif use_hpc:
|
|
@@ -770,7 +816,11 @@ def main():
|
|
| 770 |
has_output_weight = 'output.weight' in tensor_names
|
| 771 |
tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
|
| 772 |
if tied_embeddings:
|
| 773 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 774 |
print()
|
| 775 |
|
| 776 |
# ββ Determine output types ββ
|
|
@@ -778,15 +828,26 @@ def main():
|
|
| 778 |
total_quant = 0
|
| 779 |
total_attn = 0
|
| 780 |
total_keep = 0
|
|
|
|
| 781 |
for ti in tensor_infos:
|
| 782 |
if quantize_none:
|
| 783 |
will_quant = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 784 |
elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
|
| 785 |
-
if
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
will_quant = True # --q2all: everything to Q2_K
|
| 790 |
total_quant += 1
|
| 791 |
elif is_attention_tensor(ti['name']):
|
| 792 |
will_quant = 'ATTN_Q4' # Promote attention to Q4_0 HPC
|
|
@@ -799,9 +860,15 @@ def main():
|
|
| 799 |
total_keep += 1
|
| 800 |
quant_plan.append(will_quant)
|
| 801 |
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 805 |
print()
|
| 806 |
|
| 807 |
# ββ Compute output tensor sizes and offsets ββ
|
|
@@ -813,17 +880,27 @@ def main():
|
|
| 813 |
out_dims = list(ti['dims'])
|
| 814 |
dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
|
| 815 |
|
| 816 |
-
if quant_plan[i] == '
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
# Attention tensor β Q4_0 HPC (4.5 bpw)
|
| 818 |
out_type = GGML_TYPE_Q4_0
|
| 819 |
n_blocks = (ti['n_elements'] + 31) // 32
|
| 820 |
out_size = n_blocks * 18
|
| 821 |
print(f" [ATTNβQ4_0Β·HPC] {ti['name']} ({ti['n_elements']} elements)")
|
| 822 |
-
elif dim0 % QK_K == 0:
|
| 823 |
# Q2_K (2.6 bpw, block_size=256)
|
|
|
|
|
|
|
| 824 |
out_type = GGML_TYPE_Q2_K
|
| 825 |
n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
|
| 826 |
out_size = n_blocks * 84
|
|
|
|
|
|
|
| 827 |
elif dim0 % 32 == 0:
|
| 828 |
# Q4_0 fallback (4.5 bpw, block_size=32)
|
| 829 |
out_type = GGML_TYPE_Q4_0
|
|
@@ -999,7 +1076,59 @@ def main():
|
|
| 999 |
fin.seek(abs_offset)
|
| 1000 |
raw_data = fin.read(ti['data_size'])
|
| 1001 |
|
| 1002 |
-
if quant_plan[i]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1003 |
# ββ Q4_0 quantization (fallback or attention HPC) ββ
|
| 1004 |
if ti['type'] == GGML_TYPE_BF16:
|
| 1005 |
f32 = bf16_to_f32(raw_data, ti['n_elements'])
|
|
@@ -1206,4 +1335,4 @@ def main():
|
|
| 1206 |
|
| 1207 |
|
| 1208 |
if __name__ == '__main__':
|
| 1209 |
-
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
HexState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
|
| 4 |
|
| 5 |
Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
|
| 6 |
and re-quantizes eligible weight tensors to Q2_K using numpy.
|
|
|
|
| 70 |
ctypes.c_int, # verbose
|
| 71 |
]
|
| 72 |
|
| 73 |
+
# Q8_0 HPC quantizer (Shor pipeline; tied embeddings / LM head)
|
| 74 |
+
if hasattr(lib, 'hexstate_quantize_tensor_q8_0_hpc'):
|
| 75 |
+
lib.hexstate_quantize_tensor_q8_0_hpc.restype = None
|
| 76 |
+
lib.hexstate_quantize_tensor_q8_0_hpc.argtypes = [
|
| 77 |
+
ctypes.POINTER(ctypes.c_float), # weights
|
| 78 |
+
ctypes.c_int64, # n_elements
|
| 79 |
+
ctypes.c_void_p, # output
|
| 80 |
+
ctypes.POINTER(ctypes.c_float), # out_error
|
| 81 |
+
ctypes.POINTER(ctypes.c_float), # imat_importance (can be NULL)
|
| 82 |
+
ctypes.c_int, # verbose
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
# Q4_0 HPC quantizer (for attention tensors)
|
| 86 |
if hasattr(lib, 'hexstate_quantize_tensor_q4_0_hpc'):
|
| 87 |
lib.hexstate_quantize_tensor_q4_0_hpc.restype = None
|
|
|
|
| 267 |
GGML_TYPE_F32 = 0
|
| 268 |
GGML_TYPE_F16 = 1
|
| 269 |
GGML_TYPE_Q4_0 = 2
|
| 270 |
+
GGML_TYPE_Q8_0 = 8
|
| 271 |
GGML_TYPE_Q2_K = 10
|
| 272 |
GGML_TYPE_BF16 = 30
|
| 273 |
|
|
|
|
| 368 |
# - Joint scale+min least-squares solve
|
| 369 |
# - 16-step grid search for initial iscale
|
| 370 |
|
| 371 |
+
def quantize_tensor_q8_0(f32_data):
    """Quantize a float tensor to ggml Q8_0 blocks using vectorized numpy.

    Pure-numpy fallback for when the HPC library is absent.

    Block layout: 32 weights -> little-endian fp16 scale ``d`` followed by
    32 x int8 = 34 bytes, dequantized as ``y = q * d``.  Per block,
    ``d = amax / 127`` (float32) and ``q = round(x / d)`` with ties rounded
    away from zero (C ``roundf``), as in ggml ``quantize_row_q8_0_ref``.

    Args:
        f32_data: array-like of weights.  It is converted to float32 and
            flattened; a trailing partial block is zero-padded to 32 weights.

    Returns:
        Tuple ``(data, n_blocks, sse)``: the packed blocks as ``bytes``
        (``n_blocks * 34`` long), the number of blocks, and the sum of
        squared errors between the (padded) input and its dequantized
        value, accumulated in float64.
    """
    flat = np.asarray(f32_data, dtype=np.float32).ravel()
    tail = flat.size % 32
    if tail:
        flat = np.concatenate([flat, np.zeros(32 - tail, dtype=np.float32)])
    blocks = flat.reshape(-1, 32)
    nb = blocks.shape[0]

    # Per-block scale; all-zero blocks get d == 0 and an inverse scale of 0.
    amax = np.max(np.abs(blocks), axis=1)
    d = amax / np.float32(127.0)
    inv_d = np.zeros_like(d)
    np.divide(np.float32(1.0), d, out=inv_d, where=d > 0)

    scaled = blocks * inv_d[:, None]
    # Round half away from zero.  np.rint would round ties to even, and
    # floor(|x| + 0.5) is inexact in float32 just below 0.5, so work from the
    # truncated value: ``scaled - trunc(scaled)`` is exact in float32.
    q = np.trunc(scaled)
    np.add(q, np.sign(scaled), out=q, where=np.abs(scaled - q) >= 0.5)
    qs = np.clip(q, -127, 127).astype(np.int8)

    d16 = d.astype('<f2')
    out = np.zeros((nb, 34), dtype=np.uint8)
    out[:, 0:2] = d16.view(np.uint8).reshape(nb, 2)
    out[:, 2:] = qs.view(np.uint8)

    # Measure the error against the scale as actually stored (fp16).
    deq = qs.astype(np.float32) * d16.astype(np.float32)[:, None]
    sse = float(np.sum((blocks - deq) ** 2, dtype=np.float64))
    return out.tobytes(), nb, sse
|
| 395 |
+
|
| 396 |
+
|
| 397 |
def quantize_tensor_q2k(f32_data):
|
| 398 |
"""Quantize an entire tensor to Q2_K format.
|
| 399 |
|
|
|
|
| 655 |
- ffn_gate_inp — MoE routing gate
|
| 656 |
- layer_output_scale — per-layer scaling factor (scalar)
|
| 657 |
- altup, laurel — small Gemma-specific tensors
|
| 658 |
+
- token_embd.weight / output.weight — always excluded here.
|
| 659 |
+
When embeddings are TIED, main() routes token_embd.weight to
|
| 660 |
+
Q8_0 (HPC Shor pipeline) instead: the same tensor serves as both
|
| 661 |
+
embedding lookup AND LM head, and Q2_K/Q4_0 there destroys logit
|
| 662 |
+
precision → looping / repetitive generation. --keep-embd keeps
|
| 663 |
+
it at source precision instead.
|
| 664 |
"""
|
| 665 |
n_elements = 1
|
| 666 |
for d in dims:
|
|
|
|
| 711 |
|
| 712 |
def main():
|
| 713 |
if len(sys.argv) < 3:
|
| 714 |
+
print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf>"
|
| 715 |
+
" [--keep-metadata] [--imatrix FILE] [--keep-embd] [--q2all]")
|
| 716 |
sys.exit(1)
|
| 717 |
|
| 718 |
input_path = sys.argv[1]
|
|
|
|
| 720 |
keep_metadata = '--keep-metadata' in sys.argv
|
| 721 |
quantize_none = '--quantize-none' in sys.argv
|
| 722 |
q2all = '--q2all' in sys.argv
|
| 723 |
+
keep_embd = '--keep-embd' in sys.argv # keep tied embedding at source precision instead of Q8_0
|
| 724 |
|
| 725 |
# Check for imatrix
|
| 726 |
imatrix_data = None
|
|
|
|
| 741 |
print(" ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ")
|
| 742 |
print(" β HExState GGUF Re-Quantizer β")
|
| 743 |
print(" β GGUF β Q2_K GGUF with metadata passthrough β")
|
| 744 |
+
if q2all:
|
| 745 |
+
print(" β Mode: --q2all ALL eligible tensors β Q2_K (test mode) β")
|
| 746 |
if use_hpc and imatrix_data:
|
| 747 |
print(" β Engine: HPC + iMatrix (calibrated sensitivity propagation) β")
|
| 748 |
elif use_hpc:
|
|
|
|
| 816 |
has_output_weight = 'output.weight' in tensor_names
|
| 817 |
tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
|
| 818 |
if tied_embeddings:
|
| 819 |
+
if keep_embd:
|
| 820 |
+
print(" β Tied embeddings detected β token_embd.weight kept at source precision (--keep-embd)")
|
| 821 |
+
else:
|
| 822 |
+
print(" β Tied embeddings detected β token_embd.weight β Q8_0 via Shor pipeline (serves as LM head;")
|
| 823 |
+
print(" Q2_K/Q4_0 here destroys logit precision β classic looping-output symptom)")
|
| 824 |
print()
|
| 825 |
|
| 826 |
# ββ Determine output types ββ
|
|
|
|
| 828 |
total_quant = 0
|
| 829 |
total_attn = 0
|
| 830 |
total_keep = 0
|
| 831 |
+
total_embd = 0
|
| 832 |
for ti in tensor_infos:
|
| 833 |
if quantize_none:
|
| 834 |
will_quant = False
|
| 835 |
+
elif (tied_embeddings and ti['name'] == 'token_embd.weight'
|
| 836 |
+
and not keep_embd and ti['n_elements'] % 32 == 0):
|
| 837 |
+
# Tied embedding doubles as the LM head. NOTE: the old
|
| 838 |
+
# 'promote to Q4_0' branch below should_quantize() was dead
|
| 839 |
+
# code (should_quantize always returned False for
|
| 840 |
+
# token_embd), so the tensor was silently kept at F16/BF16.
|
| 841 |
+
# Now: Q8_0 (8.5 bpw, ~2x smaller than F16) via the HPC
|
| 842 |
+
# Shor pipeline β transparent for both embedding lookup
|
| 843 |
+
# and logit projection.
|
| 844 |
+
will_quant = 'EMBD_Q8'
|
| 845 |
+
total_embd += 1
|
| 846 |
elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
|
| 847 |
+
if q2all:
|
| 848 |
+
# --q2all: ALL eligible tensors β Q2_K, no exceptions
|
| 849 |
+
# (tied embedding stays on the Q8_0 route above).
|
| 850 |
+
will_quant = True
|
|
|
|
| 851 |
total_quant += 1
|
| 852 |
elif is_attention_tensor(ti['name']):
|
| 853 |
will_quant = 'ATTN_Q4' # Promote attention to Q4_0 HPC
|
|
|
|
| 860 |
total_keep += 1
|
| 861 |
quant_plan.append(will_quant)
|
| 862 |
|
| 863 |
+
if q2all:
|
| 864 |
+
print(f" Mode: --q2all β all eligible tensors forced to Q2_K")
|
| 865 |
+
print(f" Tensors to quantize (Q2_K): {total_quant}")
|
| 866 |
+
print(f" Tensors to keep as-is: {total_keep}")
|
| 867 |
+
else:
|
| 868 |
+
print(f" Tensors to quantize (Q2_K): {total_quant}")
|
| 869 |
+
print(f" Tensors to promote (Q4_0Β·HPC): {total_attn}")
|
| 870 |
+
print(f" Tied embd β Q8_0 (ShorΒ·HPC): {total_embd}")
|
| 871 |
+
print(f" Tensors to keep as-is: {total_keep}")
|
| 872 |
print()
|
| 873 |
|
| 874 |
# ββ Compute output tensor sizes and offsets ββ
|
|
|
|
| 880 |
out_dims = list(ti['dims'])
|
| 881 |
dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']
|
| 882 |
|
| 883 |
+
if quant_plan[i] == 'EMBD_Q8':
|
| 884 |
+
# Tied embedding / LM head β Q8_0 (8.5 bpw, 34 B / 32 w)
|
| 885 |
+
out_type = GGML_TYPE_Q8_0
|
| 886 |
+
n_blocks = ti['n_elements'] // 32
|
| 887 |
+
out_size = n_blocks * 34
|
| 888 |
+
print(f" [EMBDβQ8_0Β·Shor] {ti['name']} ({ti['n_elements']:,} elements)")
|
| 889 |
+
elif quant_plan[i] == 'ATTN_Q4':
|
| 890 |
# Attention tensor β Q4_0 HPC (4.5 bpw)
|
| 891 |
out_type = GGML_TYPE_Q4_0
|
| 892 |
n_blocks = (ti['n_elements'] + 31) // 32
|
| 893 |
out_size = n_blocks * 18
|
| 894 |
print(f" [ATTNβQ4_0Β·HPC] {ti['name']} ({ti['n_elements']} elements)")
|
| 895 |
+
elif dim0 % QK_K == 0 or q2all:
|
| 896 |
# Q2_K (2.6 bpw, block_size=256)
|
| 897 |
+
# --q2all forces Q2_K even when dim0 isn't a clean multiple;
|
| 898 |
+
# the quantizer pads internally to the next QK_K boundary.
|
| 899 |
out_type = GGML_TYPE_Q2_K
|
| 900 |
n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
|
| 901 |
out_size = n_blocks * 84
|
| 902 |
+
if q2all and dim0 % QK_K != 0:
|
| 903 |
+
print(f" [Q2_KΒ·PADDED] {ti['name']} (dim0={dim0}, padded to QK_K boundary)")
|
| 904 |
elif dim0 % 32 == 0:
|
| 905 |
# Q4_0 fallback (4.5 bpw, block_size=32)
|
| 906 |
out_type = GGML_TYPE_Q4_0
|
|
|
|
| 1076 |
fin.seek(abs_offset)
|
| 1077 |
raw_data = fin.read(ti['data_size'])
|
| 1078 |
|
| 1079 |
+
if quant_plan[i] == 'EMBD_Q8':
|
| 1080 |
+
# ββ Tied embedding β Q8_0 via the HPC Shor pipeline ββ
|
| 1081 |
+
if ti['type'] == GGML_TYPE_BF16:
|
| 1082 |
+
f32 = bf16_to_f32(raw_data, ti['n_elements'])
|
| 1083 |
+
elif ti['type'] == GGML_TYPE_F16:
|
| 1084 |
+
f32 = f16_to_f32(raw_data, ti['n_elements'])
|
| 1085 |
+
elif ti['type'] == GGML_TYPE_F32:
|
| 1086 |
+
f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
|
| 1087 |
+
else:
|
| 1088 |
+
# Can't re-quantize from quantized source β keep
|
| 1089 |
+
fout.write(raw_data)
|
| 1090 |
+
pad = align_offset(fout.tell()) - fout.tell()
|
| 1091 |
+
if pad > 0: fout.write(b'\x00' * pad)
|
| 1092 |
+
continue
|
| 1093 |
+
|
| 1094 |
+
n_el = ti['n_elements']
|
| 1095 |
+
n_blocks_q8 = n_el // 32
|
| 1096 |
+
|
| 1097 |
+
if use_hpc and hasattr(_HEXSTATE_LIB, 'hexstate_quantize_tensor_q8_0_hpc'):
|
| 1098 |
+
output_buf = np.zeros(n_blocks_q8 * 34, dtype=np.uint8)
|
| 1099 |
+
error = ctypes.c_float(0.0)
|
| 1100 |
+
f32_c = np.ascontiguousarray(f32, dtype=np.float32)
|
| 1101 |
+
|
| 1102 |
+
imat_ptr = None
|
| 1103 |
+
if imatrix_data and ti['name'] in imatrix_data:
|
| 1104 |
+
iw = imatrix_data[ti['name']]
|
| 1105 |
+
n_cols = iw.shape[0]
|
| 1106 |
+
n_rows = n_el // n_cols if n_cols > 0 else 1
|
| 1107 |
+
imat_full = np.tile(iw, n_rows)[:n_el].astype(np.float32)
|
| 1108 |
+
imat_c = np.ascontiguousarray(imat_full)
|
| 1109 |
+
imat_ptr = imat_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
|
| 1110 |
+
|
| 1111 |
+
_HEXSTATE_LIB.hexstate_quantize_tensor_q8_0_hpc(
|
| 1112 |
+
f32_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 1113 |
+
ctypes.c_int64(n_el),
|
| 1114 |
+
output_buf.ctypes.data_as(ctypes.c_void_p),
|
| 1115 |
+
ctypes.byref(error),
|
| 1116 |
+
imat_ptr,
|
| 1117 |
+
ctypes.c_int(0),
|
| 1118 |
+
)
|
| 1119 |
+
fout.write(output_buf.tobytes())
|
| 1120 |
+
rmse8 = float(np.sqrt(error.value / max(n_el, 1)))
|
| 1121 |
+
print(f"\n [Q8_0Β·Shor] {ti['name']} RMSE={rmse8:.6e}")
|
| 1122 |
+
else:
|
| 1123 |
+
q8_bytes, n_blocks_q8, sse8 = quantize_tensor_q8_0(f32)
|
| 1124 |
+
fout.write(q8_bytes)
|
| 1125 |
+
rmse8 = float(np.sqrt(sse8 / max(n_el, 1)))
|
| 1126 |
+
print(f"\n [Q8_0] {ti['name']} RMSE={rmse8:.6e} (numpy fallback)")
|
| 1127 |
+
|
| 1128 |
+
quant_count += 1
|
| 1129 |
+
total_quant_bytes += n_blocks_q8 * 34
|
| 1130 |
+
|
| 1131 |
+
elif quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
|
| 1132 |
# ββ Q4_0 quantization (fallback or attention HPC) ββ
|
| 1133 |
if ti['type'] == GGML_TYPE_BF16:
|
| 1134 |
f32 = bf16_to_f32(raw_data, ti['n_elements'])
|
|
|
|
| 1335 |
|
| 1336 |
|
| 1337 |
if __name__ == '__main__':
|
| 1338 |
+
main()
|