HPC-Quantize / hexstate_requantize.py

Q8_0 tied embeddings

f32b3c6 verified 14 days ago

60.2 kB

	#!/usr/bin/env python3
	"""
	HexState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.

	Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
	and re-quantizes eligible weight tensors to Q2_K using numpy.

	This bypasses the tokenizer parsing problem entirely — the source GGUF
	(from llama.cpp's convert_hf_to_gguf.py) has correct metadata.

	Usage:
	python3 hexstate_requantize.py input.gguf output.gguf
	"""

	import struct
	import sys
	import time
	import os
	import io
	import ctypes
	import numpy as np

	# ─── HExState C Library (HPC-optimized Q2_K quantization) ──────────────────
	_HEXSTATE_LIB = None


	def _load_hexstate_lib():
	    """Return the HExState ctypes handle, loading and caching it on first use.

	    The shared object ``libhexstate_q2k.so`` is looked up in the directory
	    that contains this script. On success the C signatures are declared,
	    ``hexstate_init()`` is called once, and the handle is stored in the
	    module-level ``_HEXSTATE_LIB`` so later calls return it directly.

	    Returns:
	        The ``ctypes.CDLL`` handle, or ``None`` when the file is missing or
	        loading/binding fails (a warning is printed in the latter case).
	        Failures are not cached, so a later call tries again.
	    """
	    global _HEXSTATE_LIB
	    if _HEXSTATE_LIB is not None:
	        return _HEXSTATE_LIB

	    script_dir = os.path.dirname(os.path.abspath(__file__))
	    so_path = os.path.join(script_dir, "libhexstate_q2k.so")
	    if not os.path.exists(so_path):
	        return None

	    float_ptr = ctypes.POINTER(ctypes.c_float)
	    # Shared signature of the optional Q8_0 / Q4_0 HPC quantizers:
	    # (weights, n_elements, output, out_error, imat_importance|NULL, verbose)
	    optional_signature = [
	        float_ptr,
	        ctypes.c_int64,
	        ctypes.c_void_p,
	        float_ptr,
	        float_ptr,
	        ctypes.c_int,
	    ]
	    optional_symbols = (
	        'hexstate_quantize_tensor_q8_0_hpc',  # tied embeddings / LM head
	        'hexstate_quantize_tensor_q4_0_hpc',  # attention tensors
	    )

	    try:
	        handle = ctypes.CDLL(so_path)

	        # void hexstate_init(void)
	        handle.hexstate_init.restype = None
	        handle.hexstate_init.argtypes = []

	        # void hexstate_quantize_tensor_q2k(const float*, int64_t, void*,
	        #                                   float*, int, int)
	        q2k = handle.hexstate_quantize_tensor_q2k
	        q2k.restype = None
	        q2k.argtypes = [
	            float_ptr,         # weights
	            ctypes.c_int64,    # n_elements
	            ctypes.c_void_p,   # output
	            float_ptr,         # out_error
	            ctypes.c_int,      # opt_mode (0=HPC, 1=MSE, 2=Hybrid)
	            ctypes.c_int,      # verbose
	        ]

	        # Block geometry queries, both returning int and taking no arguments.
	        handle.hexstate_q2k_block_bytes.restype = ctypes.c_int
	        handle.hexstate_q2k_block_bytes.argtypes = []
	        handle.hexstate_q2k_block_elements.restype = ctypes.c_int
	        handle.hexstate_q2k_block_elements.argtypes = []

	        # imatrix-aware Q2_K entry point.
	        q2k_imat = handle.hexstate_quantize_tensor_q2k_imat
	        q2k_imat.restype = None
	        q2k_imat.argtypes = [
	            float_ptr,         # weights
	            ctypes.c_int64,    # n_elements
	            ctypes.c_void_p,   # output
	            float_ptr,         # out_error
	            ctypes.c_int,      # opt_mode
	            float_ptr,         # imat_importance (can be NULL)
	            ctypes.c_int,      # verbose
	        ]

	        # Optional entry points are only bound when the library exports them.
	        for symbol in optional_symbols:
	            if hasattr(handle, symbol):
	                entry = getattr(handle, symbol)
	                entry.restype = None
	                entry.argtypes = list(optional_signature)

	        handle.hexstate_init()
	        _HEXSTATE_LIB = handle
	        return handle
	    except Exception as e:
	        print(f" WARNING: Failed to load HexState library: {e}")
	        return None


	def _skip_gguf_kv_value(f, vtype):
	    """Advance *f* past one GGUF metadata value of type *vtype*.

	    Args:
	        f: Binary file object positioned at the first byte of the value.
	        vtype: GGUF value type id (0-7 and 10-12 are fixed-width scalars,
	            8 is a length-prefixed string, 9 is an array).

	    Raises:
	        ValueError: If *vtype* (or an array's element type) is not a known
	            GGUF type. Guessing a width here would leave the stream
	            misaligned for everything that follows.
	        struct.error: If the stream ends inside a length or type field.
	    """
	    scalar_sizes = {0: 1, 1: 1, 2: 2, 3: 2, 4: 4, 5: 4, 6: 4, 7: 1,
	                    10: 8, 11: 8, 12: 8}

	    if vtype == 8:  # string: uint64 length + bytes
	        (slen,) = struct.unpack('<Q', f.read(8))
	        f.read(slen)
	    elif vtype == 9:  # array: uint32 element type + uint64 count + elements
	        arr_type, arr_len = struct.unpack('<IQ', f.read(12))
	        if arr_type in scalar_sizes:
	            f.read(arr_len * scalar_sizes[arr_type])
	        else:
	            # Strings and nested arrays are variable-width, so each element
	            # has to be walked individually.
	            for _ in range(arr_len):
	                _skip_gguf_kv_value(f, arr_type)
	    elif vtype in scalar_sizes:
	        f.read(scalar_sizes[vtype])
	    else:
	        raise ValueError(f"Unknown GGUF KV type {vtype}")


	def read_imatrix(path):
	    """Load a llama.cpp importance matrix and scale every entry to mean 1.

	    The first four bytes select the layout:

	    * GGUF magic (0x46554747): metadata KV pairs are skipped, then tensors
	      named ``<base>.in_sum2`` and ``<base>.counts`` are paired up. The
	      importance for ``<base>`` is ``sqrt(in_sum2 / counts[0])``; when no
	      counts tensor exists a count of 1.0 is used, and a non-positive count
	      yields all ones. Every tensor payload is read as float32.
	    * anything else: legacy layout, where those four bytes are the entry
	      count and each entry is (name length, name, n_values, n_samples,
	      float32 values). ``n_samples`` is read but not used.

	    In both layouts the vector is divided by its mean, or replaced by ones
	    when the mean is not above 1e-30.

	    Returns:
	        dict mapping tensor name -> normalised importance array.
	    """
	    def unit_mean(vec):
	        avg = vec.mean()
	        if avg > 1e-30:
	            return vec / avg
	        return np.ones_like(vec)

	    result = {}

	    with open(path, 'rb') as fh:
	        (magic,) = struct.unpack('<I', fh.read(4))

	        if magic != 0x46554747:
	            # Legacy format: the first 4 bytes were actually n_entries.
	            fh.seek(0)
	            (entry_count,) = struct.unpack('<i', fh.read(4))
	            for _ in range(entry_count):
	                (name_len,) = struct.unpack('<i', fh.read(4))
	                entry_name = fh.read(name_len).decode('utf-8')
	                (n_values,) = struct.unpack('<i', fh.read(4))
	                (_n_samples,) = struct.unpack('<i', fh.read(4))
	                raw_values = np.frombuffer(fh.read(n_values * 4), dtype=np.float32).copy()
	                result[entry_name] = unit_mean(raw_values)
	            return result

	        # ── GGUF container (modern llama.cpp) ──
	        (_version,) = struct.unpack('<I', fh.read(4))
	        (tensor_count,) = struct.unpack('<Q', fh.read(8))
	        (kv_count,) = struct.unpack('<Q', fh.read(8))

	        # Metadata is irrelevant here; walk past every key/value pair.
	        for _ in range(kv_count):
	            (key_len,) = struct.unpack('<Q', fh.read(8))
	            fh.read(key_len)
	            (value_type,) = struct.unpack('<I', fh.read(4))
	            _skip_gguf_kv_value(fh, value_type)

	        # Tensor directory: (name, element count, offset into data section).
	        directory = []
	        for _ in range(tensor_count):
	            (name_len,) = struct.unpack('<Q', fh.read(8))
	            tensor_name = fh.read(name_len).decode('utf-8', errors='replace')
	            (rank,) = struct.unpack('<I', fh.read(4))
	            shape = [struct.unpack('<Q', fh.read(8))[0] for _ in range(rank)]
	            (_tensor_type,) = struct.unpack('<I', fh.read(4))
	            (rel_offset,) = struct.unpack('<Q', fh.read(8))
	            count = 1
	            for extent in shape:
	                count *= extent
	            directory.append((tensor_name, count, rel_offset))

	        # The data section begins at the next 32-byte boundary.
	        data_origin = ((fh.tell() + 31) // 32) * 32

	        squared_sums = {}
	        call_counts = {}
	        sum_suffix = '.in_sum2'
	        count_suffix = '.counts'
	        for tensor_name, count, rel_offset in directory:
	            fh.seek(data_origin + rel_offset)
	            payload = np.frombuffer(fh.read(count * 4), dtype=np.float32).copy()
	            if tensor_name.endswith(sum_suffix):
	                squared_sums[tensor_name[:-len(sum_suffix)]] = payload
	            elif tensor_name.endswith(count_suffix):
	                call_counts[tensor_name[:-len(count_suffix)]] = payload

	        # importance = sqrt(in_sum2 / counts), then normalised to mean 1.
	        for base, in_sum2 in squared_sums.items():
	            first_count = call_counts.get(base, np.array([1.0]))[0]
	            if first_count > 0:
	                magnitude = np.sqrt(in_sum2 / first_count)
	            else:
	                magnitude = np.ones_like(in_sum2)
	            result[base] = unit_mean(magnitude)

	    return result


	def quantize_tensor_q2k_hpc(f32_data, opt_mode=2, importance=None):
	    """Quantize a flat tensor to Q2_K through the HExState C library.

	    The input is zero-padded up to a multiple of QK_K; when importance
	    weights are given they are padded with ones by the same amount. The
	    call always goes through ``hexstate_quantize_tensor_q2k_imat`` with
	    verbose=1, passing a NULL importance pointer when none is supplied.

	    Args:
	        f32_data: 1-D array of weights.
	        opt_mode: 0=HPC (BP only), 1=MSE (grid search), 2=Hybrid.
	        importance: optional importance weights, padded alongside
	            ``f32_data``. Presumably one weight per element; confirm
	            against the C library's contract.

	    Returns:
	        (bytes, n_blocks): the quantized blocks and their count. The error
	        value written by the C routine is not returned.

	    Raises:
	        RuntimeError: If the HExState library cannot be loaded.
	    """
	    hexlib = _load_hexstate_lib()
	    if hexlib is None:
	        raise RuntimeError("HexState library not available")

	    leftover = len(f32_data) % QK_K
	    if leftover != 0:
	        tail = QK_K - leftover
	        f32_data = np.concatenate([f32_data, np.zeros(tail, dtype=np.float32)])
	        if importance is not None:
	            importance = np.concatenate([importance, np.ones(tail, dtype=np.float32)])

	    total = len(f32_data)
	    n_blocks = total // QK_K

	    # Output buffer sized from the library's own per-block byte count.
	    bytes_per_block = hexlib.hexstate_q2k_block_bytes()
	    out_buf = np.zeros(n_blocks * bytes_per_block, dtype=np.uint8)
	    out_error = ctypes.c_float(0.0)

	    float_ptr = ctypes.POINTER(ctypes.c_float)
	    # Keep the contiguous arrays bound to locals so their memory stays
	    # alive for the duration of the C call.
	    weights_c = np.ascontiguousarray(f32_data, dtype=np.float32)
	    if importance is None:
	        importance_ptr = None
	    else:
	        importance_c = np.ascontiguousarray(importance, dtype=np.float32)
	        importance_ptr = importance_c.ctypes.data_as(float_ptr)

	    hexlib.hexstate_quantize_tensor_q2k_imat(
	        weights_c.ctypes.data_as(float_ptr),
	        ctypes.c_int64(total),
	        out_buf.ctypes.data_as(ctypes.c_void_p),
	        ctypes.byref(out_error),
	        ctypes.c_int(opt_mode),
	        importance_ptr,
	        ctypes.c_int(1),  # verbose
	    )

	    return out_buf.tobytes(), n_blocks


	# ─── Constants ──────────────────────────────────────────────────────────────
	GGUF_MAGIC = 0x46554747   # bytes "GGUF" read as a little-endian uint32
	GGUF_VERSION = 3
	ALIGNMENT = 32            # byte alignment used for the tensor data section
	QK_K = 256                # weights per K-quant super-block

	# ggml tensor type ids used by this script.
	GGML_TYPE_F32 = 0
	GGML_TYPE_F16 = 1
	GGML_TYPE_Q4_0 = 2
	GGML_TYPE_Q8_0 = 8
	GGML_TYPE_Q2_K = 10
	GGML_TYPE_BF16 = 30

	# Human-readable name for each known tensor type id.
	TYPE_NAME = {
	    0: "F32", 1: "F16", 2: "Q4_0", 3: "Q4_1", 6: "Q5_0", 7: "Q5_1",
	    8: "Q8_0", 9: "Q8_1", 10: "Q2_K", 11: "Q3_K", 12: "Q4_K",
	    13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
	}

	# Weights per block for each type.
	TYPE_BLOCK_SIZE = {
	    0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
	    8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
	    13: 256, 14: 256, 15: 256, 30: 1,
	}
	# Bytes per block for each type. Q5_0 is fp16 d + 4 high-bit bytes + 16
	# nibble bytes = 22; Q5_1 adds an fp16 min on top of that = 24.
	TYPE_BLOCK_BYTES = {
	    0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
	    8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
	    13: 176, 14: 210, 15: 292, 30: 2,
	}


	def align_offset(offset, alignment=ALIGNMENT):
	    """Round *offset* up to the next multiple of *alignment*.

	    Uses a bit mask, so the result is only a true multiple when
	    *alignment* is a power of two (the default, ALIGNMENT, is 32).
	    """
	    mask = alignment - 1
	    return (offset + mask) & ~mask


	def read_string(f):
	    """Read one length-prefixed string from *f*.

	    The prefix is a little-endian uint64 byte count; the payload is decoded
	    as UTF-8 with invalid sequences replaced rather than raising.
	    """
	    (byte_count,) = struct.unpack('<Q', f.read(8))
	    payload = f.read(byte_count)
	    return payload.decode('utf-8', errors='replace')


	def write_string(f, s):
	    """Write *s* to *f* as a little-endian uint64 byte count plus UTF-8 bytes."""
	    encoded = s.encode('utf-8')
	    length_prefix = struct.pack('<Q', len(encoded))
	    f.write(length_prefix)
	    f.write(encoded)


	# Byte width of every fixed-size GGUF metadata value type, keyed by type id.
	_KV_SCALAR_SIZES = {
	    0: 1, 1: 1,    # UINT8, INT8
	    2: 2, 3: 2,    # UINT16, INT16
	    4: 4, 5: 4,    # UINT32, INT32
	    6: 4,          # FLOAT32
	    7: 1,          # BOOL
	    10: 8, 11: 8,  # UINT64, INT64
	    12: 8,         # FLOAT64
	}


	def _advance_past_kv_value(f, vtype):
	    """Move *f* just past one KV value of type *vtype* without keeping it."""
	    width = _KV_SCALAR_SIZES.get(vtype)
	    if width is not None:
	        f.seek(width, os.SEEK_CUR)
	    elif vtype == 8:  # STRING: uint64 length + bytes
	        (slen,) = struct.unpack('<Q', f.read(8))
	        f.seek(slen, os.SEEK_CUR)
	    elif vtype == 9:  # ARRAY: uint32 element type + uint64 count + elements
	        arr_type, arr_len = struct.unpack('<IQ', f.read(12))
	        elem_width = _KV_SCALAR_SIZES.get(arr_type)
	        if elem_width is not None:
	            # Fixed-width elements: skip the whole payload in one step
	            # instead of visiting every element.
	            f.seek(arr_len * elem_width, os.SEEK_CUR)
	        else:
	            for _ in range(arr_len):
	                _advance_past_kv_value(f, arr_type)
	    else:
	        raise ValueError(f"Unknown KV type {vtype}")


	def read_kv_value(f, vtype):
	    """Return the raw bytes of one GGUF KV value, for verbatim passthrough.

	    The value is first walked to find where it ends, then the file is
	    rewound and the whole span is read back in a single call. For arrays
	    the returned bytes include the element-type and count header.

	    Args:
	        f: Seekable binary file positioned at the start of the value.
	        vtype: GGUF value type id (0-12).

	    Returns:
	        bytes: the encoded value exactly as stored; *f* is left positioned
	        just after it.

	    Raises:
	        ValueError: If *vtype*, or the element type of a non-empty array,
	            is not a known GGUF type.
	    """
	    start = f.tell()
	    _advance_past_kv_value(f, vtype)
	    end = f.tell()
	    f.seek(start)
	    return f.read(end - start)


	# ─── BF16 ↔ F32 conversion ─────────────────────────────────────────────────
	def bf16_to_f32(data_bytes, n_elements):
	    """Decode raw BF16 bytes into a float32 numpy array.

	    Each 16-bit word becomes the upper half of a 32-bit float pattern, with
	    the lower 16 bits zero. ``n_elements`` is accepted but not used; the
	    length comes from ``data_bytes``.
	    """
	    words = np.frombuffer(data_bytes, dtype=np.uint16)
	    widened = np.left_shift(words.astype(np.uint32), 16)
	    return widened.view(np.float32)


	def f16_to_f32(data_bytes, n_elements):
	    """Decode raw F16 bytes into a float32 numpy array.

	    ``n_elements`` is accepted but not used; the length comes from
	    ``data_bytes``.
	    """
	    halves = np.frombuffer(data_bytes, dtype=np.float16)
	    return halves.astype(np.float32)


	def f32_to_f16(f32_array):
	    """Encode a float32 array as F16 and return the raw bytes."""
	    halves = f32_array.astype(np.float16)
	    return halves.tobytes()


	def f32_to_bf16(f32_array):
	    """Encode a float32 array as BF16 and return little-endian raw bytes.

	    Finite values and infinities are rounded to nearest, ties to even.
	    NaNs keep their sign and upper mantissa bits and get the quiet bit set,
	    so they stay NaN; a plain ``bits + 0x8000`` would wrap around in 32
	    bits for large NaN payloads and turn them into zero.

	    Args:
	        f32_array: array-like of float32 values (converted if necessary).

	    Returns:
	        bytes: two bytes per element.
	    """
	    bits32 = np.ascontiguousarray(f32_array, dtype=np.float32).view(np.uint32)
	    # Work in 64 bits so the rounding addition can never overflow.
	    bits = bits32.astype(np.uint64)
	    upper = bits >> np.uint64(16)

	    is_nan = (bits & np.uint64(0x7FFFFFFF)) > np.uint64(0x7F800000)
	    # Round to nearest even: add 0x7FFF plus the lowest kept bit.
	    rounded = (bits + np.uint64(0x7FFF) + (upper & np.uint64(1))) >> np.uint64(16)
	    quiet_nan = upper | np.uint64(0x40)

	    return np.where(is_nan, quiet_nan, rounded).astype('<u2').tobytes()


	# ─── Q2_K quantization — faithful port of ggml quantize_row_q2_K_ref ───────
	# Vectorized with numpy for performance. Uses make_qkx2_quants algorithm:
	# - Weighted MAD error with weights[i] = |x[i]|
	# - Joint scale+min least-squares solve
	# - 16-step grid search for initial iscale

	def quantize_tensor_q8_0(f32_data):
	    """Vectorized Q8_0 quantizer (fallback when the HPC lib is absent).

	    Block layout: 32 weights -> fp16 scale d followed by 32 int8 quants,
	    34 bytes in total, with y = q * d. Per block d = amax / 127 and
	    q = rint(x / d) clipped to [-127, 127]; an all-zero block gets d = 0
	    and zero quants. Input that is not a multiple of 32 is zero-padded.

	    Returns:
	        (bytes, n_blocks, sse), where sse is the summed squared error
	        between the padded input and its dequantized form using the
	        fp16-rounded scale.
	    """
	    block_len = 32
	    leftover = len(f32_data) % block_len
	    if leftover != 0:
	        padding = np.zeros(block_len - leftover, dtype=np.float32)
	        f32_data = np.concatenate([f32_data, padding])

	    rows = f32_data.reshape(-1, block_len).astype(np.float32)
	    n_rows = rows.shape[0]

	    # Per-block scale and its reciprocal (0 where the block is all zeros).
	    scale = np.max(np.abs(rows), axis=1) / 127.0
	    positive = scale > 0
	    inv_scale = np.where(positive, 1.0 / np.where(positive, scale, 1.0), 0.0)

	    quants = np.clip(np.rint(rows * inv_scale[:, None]), -127, 127).astype(np.int8)
	    scale_f16 = scale.astype('<f2')

	    packed = np.zeros((n_rows, 34), dtype=np.uint8)
	    packed[:, :2] = scale_f16.view(np.uint8).reshape(n_rows, 2)
	    packed[:, 2:] = quants.view(np.uint8)

	    # Error is measured against what a reader of the file would reconstruct.
	    restored = quants.astype(np.float32) * scale_f16.astype(np.float32)[:, None]
	    residual = rows - restored
	    squared_error = float(np.sum(residual ** 2))
	    return packed.tobytes(), n_rows, squared_error


	def quantize_tensor_q2k(f32_data):
	    """Quantize an entire tensor to Q2_K format.

	    Vectorized port modelled on ggml quantize_row_q2_K_ref with the
	    make_qkx2_quants sub-block search. Unlike a sequential search, every
	    grid step here quantizes against the sub-block's original minimum.

	    Q2_K block layout as written by this function (84 bytes):
	        scales[16] : bytes 0-15, 4-bit scale (low nibble) and 4-bit min
	                     (high nibble) per 16-weight sub-block
	        qs[64]     : bytes 16-79, 2-bit quants, 4 weights 32-apart per byte
	        d          : bytes 80-81, fp16 super-block scale
	        dmin       : bytes 82-83, fp16 super-block min-scale

	    Args:
	        f32_data: 1-D array of weights; zero-padded to a multiple of QK_K.

	    Returns:
	        (bytes, n_blocks): the packed blocks and their count.
	    """
	    n_elements = len(f32_data)
	    nmax = 3
	    q4scale = 15.0

	    # Pad to QK_K (256) multiple
	    if n_elements % QK_K != 0:
	        pad_len = QK_K - (n_elements % QK_K)
	        f32_data = np.concatenate([f32_data, np.zeros(pad_len, dtype=np.float32)])
	        n_elements = len(f32_data)

	    n_blocks = n_elements // QK_K

	    # Reshape: [n_blocks, 16 sub-blocks, 16 weights]
	    data = f32_data.reshape(n_blocks, 16, 16).astype(np.float64)

	    # ── make_qkx2_quants vectorized over all sub-blocks ──
	    # Shape key: S = [n_blocks, 16], V = [n_blocks, 16, 16]

	    weights = np.abs(data)                     # [n_blocks, 16, 16]

	    sb_min = data.min(axis=2)                  # [n_blocks, 16]
	    sb_max = data.max(axis=2)                  # [n_blocks, 16]
	    sb_min = np.minimum(sb_min, 0.0)           # min is never allowed above 0

	    # Weighted sums (needed for least-squares solve)
	    sum_w = weights.sum(axis=2)                # [n_blocks, 16]
	    sum_x = (weights * data).sum(axis=2)       # [n_blocks, 16]

	    sb_range = sb_max - sb_min
	    degenerate = sb_range < 1e-30              # [n_blocks, 16]
	    safe_range = np.maximum(sb_range, 1e-30)

	    # Initial quantization
	    iscale0 = nmax / safe_range
	    scale0 = 1.0 / np.maximum(iscale0, 1e-30)

	    # Offset from the original sub-block min; reused by every grid step.
	    shifted = data - sb_min[:, :, None]        # [n_blocks, 16, 16]
	    L0 = np.clip(np.round(iscale0[:, :, None] * shifted), 0, nmax).astype(np.float64)

	    # Initial error (MAD): sum(w * |scale*L + min - x|)
	    recon0 = scale0[:, :, None] * L0 + sb_min[:, :, None]
	    best_error = (weights * np.abs(recon0 - data)).sum(axis=2)  # [n_blocks, 16]

	    best_L = L0.copy()
	    best_scale = scale0.copy()
	    best_min = sb_min.copy()

	    # Grid search: 16 steps (nstep=15, rmin=-0.5, rdelta=0.1)
	    rmin, rdelta, nstep = -0.5, 0.1, 15
	    for ist in range(nstep + 1):
	        iscale_try = (rmin + rdelta * ist + nmax) / safe_range  # [n_blocks, 16]

	        Laux = np.clip(np.round(iscale_try[:, :, None] * shifted), 0, nmax).astype(np.float64)

	        # Weighted sums for least-squares solve
	        wL = weights * Laux                    # [n_blocks, 16, 16]
	        sum_l = wL.sum(axis=2)                 # [n_blocks, 16]
	        sum_l2 = (wL * Laux).sum(axis=2)       # [n_blocks, 16]
	        sum_xl = (wL * data).sum(axis=2)       # [n_blocks, 16]

	        # Solve 2-var system: x[i] ≈ this_scale * L[i] + this_min
	        D = sum_w * sum_l2 - sum_l * sum_l
	        valid_D = D > 0

	        this_scale = np.where(valid_D,
	                              (sum_w * sum_xl - sum_x * sum_l) / np.maximum(D, 1e-30),
	                              0.0)
	        this_min = np.where(valid_D,
	                            (sum_l2 * sum_x - sum_l * sum_xl) / np.maximum(D, 1e-30),
	                            0.0)

	        # If this_min > 0, clamp to 0 and recompute scale
	        pos_min = this_min > 0
	        this_min = np.where(pos_min, 0.0, this_min)
	        this_scale = np.where(pos_min & (sum_l2 > 0),
	                              sum_xl / np.maximum(sum_l2, 1e-30),
	                              this_scale)

	        # Compute error for this trial
	        recon = this_scale[:, :, None] * Laux + this_min[:, :, None]
	        cur_error = (weights * np.abs(recon - data)).sum(axis=2)

	        # Update where this trial is better
	        better = valid_D & (cur_error < best_error) & ~degenerate
	        if better.any():
	            # Expand mask to weight dimension for L update
	            better3d = better[:, :, None]
	            best_L = np.where(better3d, Laux, best_L)
	            best_error = np.where(better, cur_error, best_error)
	            best_scale = np.where(better, this_scale, best_scale)
	            best_min = np.where(better, this_min, best_min)

	    # the_min = -best_min (make positive)
	    sb_scale = np.maximum(best_scale, 0.0).astype(np.float32)    # [n_blocks, 16]
	    sb_the_min = np.maximum(-best_min, 0.0).astype(np.float32)   # [n_blocks, 16]

	    # Handle degenerate sub-blocks
	    sb_scale[degenerate] = 0.0
	    sb_the_min[degenerate] = np.maximum(-sb_min[degenerate], 0.0).astype(np.float32)

	    # ── Phase 2: quantize scales/mins to 4-bit ──
	    max_scale = sb_scale.max(axis=1)           # [n_blocks]
	    max_min = sb_the_min.max(axis=1)           # [n_blocks]

	    # Quantize sub-block scales to 4-bit
	    has_scale = max_scale > 0
	    iscale_s = np.where(has_scale, q4scale / np.maximum(max_scale, 1e-30), 0.0)
	    scales_q = np.where(has_scale[:, None],
	                        np.clip(np.round(iscale_s[:, None] * sb_scale), 0, 15),
	                        0.0).astype(np.uint8)

	    # Quantize sub-block mins to 4-bit
	    has_min = max_min > 0
	    iscale_m = np.where(has_min, q4scale / np.maximum(max_min, 1e-30), 0.0)
	    mins_q = np.where(has_min[:, None],
	                      np.clip(np.round(iscale_m[:, None] * sb_the_min), 0, 15),
	                      0.0).astype(np.uint8)

	    d_fp16 = np.where(has_scale, max_scale / q4scale, 0.0).astype(np.float16)
	    dmin_fp16 = np.where(has_min, max_min / q4scale, 0.0).astype(np.float16)

	    # ── Phase 3: requantize using fp16-truncated d/dmin ──
	    scales_packed = scales_q | (mins_q << 4)   # [n_blocks, 16]

	    d_f32 = d_fp16.astype(np.float32)
	    dmin_f32 = dmin_fp16.astype(np.float32)

	    d_sub = d_f32[:, None] * (scales_packed & 0xF).astype(np.float32)
	    dm_sub = dmin_f32[:, None] * (scales_packed >> 4).astype(np.float32)

	    # l = nearest_int((x + dm) / d), clamp [0,3]
	    valid_d = d_sub > 0
	    inv_d = np.where(valid_d, 1.0 / np.maximum(d_sub, 1e-30), 0.0)
	    q_vals = np.where(valid_d[:, :, None],
	                      np.clip(np.round(
	                          (f32_data.reshape(n_blocks, 16, 16) + dm_sub[:, :, None]) * inv_d[:, :, None]
	                      ), 0, 3),
	                      0).astype(np.uint8)

	    # ── Phase 4: pack ──
	    # Within each 128-weight half, byte l holds weights l, l+32, l+64, l+96.
	    q_flat = q_vals.reshape(n_blocks, QK_K)
	    q_groups = q_flat.reshape(n_blocks, 2, 4, 32)
	    qs_packed = (q_groups[:, :, 0, :] |
	                 (q_groups[:, :, 1, :] << 2) |
	                 (q_groups[:, :, 2, :] << 4) |
	                 (q_groups[:, :, 3, :] << 6)).astype(np.uint8)
	    qs_packed = qs_packed.reshape(n_blocks, 64)

	    # Build output: [n_blocks, 84] bytes
	    # Layout: scales[16] | qs[64] | d(fp16) | dmin(fp16)
	    result = np.zeros((n_blocks, 84), dtype=np.uint8)
	    result[:, 0:16] = scales_packed
	    result[:, 16:80] = qs_packed
	    result[:, 80:82] = d_fp16.view(np.uint8).reshape(n_blocks, 2)
	    result[:, 82:84] = dmin_fp16.view(np.uint8).reshape(n_blocks, 2)

	    return result.tobytes(), n_blocks


	def dequant_q2k_fast(q2k_bytes, n_blocks):
	    """Vectorized Q2_K dequantization for RMSE computation.

	    Expected block layout (84 bytes):
	        scales[16] (bytes 0-15) | qs[64] (bytes 16-79) |
	        d (fp16, bytes 80-81) | dmin (fp16, bytes 82-83)

	    Unpacking rule, per block:
	        for half in 0..1:            qs_half = qs[half*32 : half*32 + 32]
	          for j in 0..3:             q = (qs_half >> (2*j)) & 3
	            output[half*128 + j*32 + l] comes from q[l], l in 0..31
	    Consecutive runs of 16 outputs share one scale byte, so output index i
	    uses scales[i // 16]:
	        value = d * (scale & 0xF) * q - dmin * (scale >> 4)

	    Returns:
	        1-D float32 array of n_blocks * 256 values.
	    """
	    raw = np.frombuffer(q2k_bytes, dtype=np.uint8).reshape(n_blocks, 84)

	    # Split each block into its four fields.
	    nibbles = raw[:, 0:16]                     # [n_blocks, 16]
	    packed_q = raw[:, 16:80]                   # [n_blocks, 64]
	    super_d = raw[:, 80:82].copy().view(np.float16).astype(np.float32).reshape(n_blocks)
	    super_dmin = raw[:, 82:84].copy().view(np.float16).astype(np.float32).reshape(n_blocks)

	    # Effective scale and min of every 16-weight sub-block.
	    sub_scale = super_d[:, np.newaxis] * (nibbles & 0xF).astype(np.float32)
	    sub_min = super_dmin[:, np.newaxis] * (nibbles >> 4).astype(np.float32)

	    # Apply all four bit shifts at once: axes are [block, half, shift, byte].
	    # Flattening (half, shift, byte) gives index half*128 + shift*32 + byte,
	    # which regroups cleanly into 16 sub-blocks of 16 weights.
	    shifts = np.array([0, 2, 4, 6], dtype=np.uint8)
	    quants = (packed_q.reshape(n_blocks, 2, 1, 32) >> shifts[None, None, :, None]) & 3
	    quants = quants.astype(np.float32).reshape(n_blocks, 16, 16)

	    values = sub_scale[:, :, None] * quants - sub_min[:, :, None]
	    return values.reshape(-1)


	def is_attention_tensor(name):
	    """Return True when *name* contains one of the attention-class markers.

	    Matching is by substring. main() promotes tensors flagged here to Q4_0
	    instead of Q2_K (unless --q2all is given).
	    """
	    attention_markers = (
	        # llama.cpp-style names
	        'attn_q.weight', 'attn_k.weight', 'attn_v.weight', 'attn_output.weight',
	        'attn_qkv.weight', 'attn_gate.weight',
	        # HF-style names
	        'self_attn.q_proj.weight', 'self_attn.k_proj.weight',
	        'self_attn.v_proj.weight', 'self_attn.o_proj.weight',
	        # Qwen 3.6 DeltaNet SSM projections — treated as attention-class
	        'ssm_in_qkv.weight', 'ssm_in_z.weight', 'ssm_out.weight',
	        'linear_attn.in_proj_qkv.weight', 'linear_attn.in_proj_z.weight',
	        'linear_attn.out_proj.weight',
	    )
	    return any(marker in name for marker in attention_markers)


	def should_quantize(name, n_dims, dims, tied_embeddings=False):
	    """Decide whether a tensor is eligible for Q2_K quantization.

	    Returns False (tensor kept as-is) for:
	      - tensors with fewer than 2 dimensions (norms, biases)
	      - names containing 'norm' or '.bias'
	      - 'ffn_gate_inp' (MoE routing gate)
	      - 'altup' / 'laurel' (small Gemma-specific tensors)
	      - 'layer_output_scale' (per-layer scaling factor)
	      - 'token_embd' (embedding table) and the exact name 'output.weight'
	        (LM head), where precision matters most
	      - DeltaNet state-space parameters: 'ssm_a', 'A_log', 'ssm_dt',
	        'dt_bias', 'ssm_conv1d', 'conv1d.weight'
	      - vision / multimodal / audio tensors, i.e. names starting with
	        'v.', 'mm.' or 'a.'
	      - tensors whose element count is below QK_K or not a multiple of it

	    ``tied_embeddings`` is accepted but not consulted here: the special
	    routing of a tied token_embd.weight is done by main().
	    """
	    total = 1
	    for extent in dims:
	        total *= extent

	    if n_dims < 2:
	        return False

	    # Any of these substrings marks a tensor that must stay untouched.
	    keep_markers = (
	        'norm', '.bias',
	        'ffn_gate_inp',
	        'altup', 'laurel',
	        'layer_output_scale',
	        'token_embd',
	        'ssm_a', 'A_log',
	        'ssm_dt', 'dt_bias',
	        'ssm_conv1d', 'conv1d.weight',
	    )
	    if any(marker in name for marker in keep_markers):
	        return False

	    # LM head output projection — exact match only.
	    if name == 'output.weight':
	        return False

	    # Vision / multimodal / audio encoder tensors.
	    if name.startswith(('v.', 'mm.', 'a.')):
	        return False

	    # Too small, or not a whole number of QK_K-sized super-blocks.
	    if total < QK_K or total % QK_K != 0:
	        return False
	    return True


	def main():
	if len(sys.argv) < 3:
	print("Usage: python3 hexstate_requantize.py <input.gguf> <output.gguf>"
	" [--keep-metadata] [--imatrix FILE] [--keep-embd] [--q2all]")
	sys.exit(1)

	input_path = sys.argv[1]
	output_path = sys.argv[2]
	keep_metadata = '--keep-metadata' in sys.argv
	quantize_none = '--quantize-none' in sys.argv
	q2all = '--q2all' in sys.argv
	keep_embd = '--keep-embd' in sys.argv # keep tied embedding at source precision instead of Q8_0

	# Check for imatrix
	imatrix_data = None
	for i, arg in enumerate(sys.argv):
	if arg == '--imatrix' and i + 1 < len(sys.argv):
	imat_path = sys.argv[i + 1]
	if os.path.exists(imat_path):
	imatrix_data = read_imatrix(imat_path)
	print(f" Loaded imatrix: {len(imatrix_data)} tensors from {imat_path}")
	else:
	print(f" WARNING: imatrix file not found: {imat_path}")
	break

	# Check for HPC C library
	use_hpc = _load_hexstate_lib() is not None

	print()
	print(" ╔════════════════════════════════════════════════════════════════╗")
	print(" ║ HExState GGUF Re-Quantizer ║")
	print(" ║ GGUF → Q2_K GGUF with metadata passthrough ║")
	if q2all:
	print(" ║ Mode: --q2all ALL eligible tensors → Q2_K (test mode) ║")
	if use_hpc and imatrix_data:
	print(" ║ Engine: HPC + iMatrix (calibrated sensitivity propagation) ║")
	elif use_hpc:
	print(" ║ Engine: HPC (BP + MSE Grid + Sensitivity Propagation) ║")
	else:
	print(" ║ Engine: Python (numpy vectorized) ║")
	print(" ╚════════════════════════════════════════════════════════════════╝")
	print()

	start_time = time.time()
	file_size = os.path.getsize(input_path)
	print(f" Input: {input_path}")
	print(f" Size: {file_size / 1024**3:.2f} GB")
	print(f" Output: {output_path}")
	print()

	with open(input_path, 'rb') as fin:
	# ── Read Header ──
	magic = struct.unpack('<I', fin.read(4))[0]
	assert magic == GGUF_MAGIC, f"Bad GGUF magic: 0x{magic:08X}"
	version = struct.unpack('<I', fin.read(4))[0]
	n_tensors = struct.unpack('<Q', fin.read(8))[0]
	n_kv = struct.unpack('<Q', fin.read(8))[0]

	print(f" GGUF v{version}: {n_tensors} tensors, {n_kv} KV pairs")
	print()

	# ── Read KV pairs (store as raw bytes for passthrough) ──
	kv_pairs = []
	for i in range(n_kv):
	key = read_string(fin)
	vtype = struct.unpack('<I', fin.read(4))[0]
	raw_value = read_kv_value(fin, vtype)
	kv_pairs.append((key, vtype, raw_value))

	# ── Read Tensor Info ──
	tensor_infos = []
	for i in range(n_tensors):
	name = read_string(fin)
	n_dims = struct.unpack('<I', fin.read(4))[0]
	dims = [struct.unpack('<Q', fin.read(8))[0] for _ in range(n_dims)]
	ttype = struct.unpack('<I', fin.read(4))[0]
	offset = struct.unpack('<Q', fin.read(8))[0]

	n_elements = 1
	for d in dims:
	n_elements *= d

	blk_sz = TYPE_BLOCK_SIZE.get(ttype, 1)
	blk_bytes = TYPE_BLOCK_BYTES.get(ttype, 4)
	n_blocks = (n_elements + blk_sz - 1) // blk_sz
	data_size = n_blocks * blk_bytes

	tensor_infos.append({
	'name': name, 'n_dims': n_dims, 'dims': dims,
	'type': ttype, 'offset': offset,
	'n_elements': n_elements, 'data_size': data_size,
	})

	# Calculate data section start
	pos_after_info = fin.tell()
	data_section_start = align_offset(pos_after_info)

	print(f" Data section starts at: {data_section_start:,}")
	print()

	# ── Detect tied embeddings ──
	# If no separate output.weight tensor exists, token_embd.weight
	# doubles as the LM head. Must preserve it at full precision.
	tensor_names = {ti['name'] for ti in tensor_infos}
	has_output_weight = 'output.weight' in tensor_names
	tied_embeddings = not has_output_weight and 'token_embd.weight' in tensor_names
	if tied_embeddings:
	if keep_embd:
	print(" ⚠ Tied embeddings detected — token_embd.weight kept at source precision (--keep-embd)")
	else:
	print(" ⚠ Tied embeddings detected — token_embd.weight → Q8_0 via Shor pipeline (serves as LM head;")
	print(" Q2_K/Q4_0 here destroys logit precision — classic looping-output symptom)")
	print()

	# ── Determine output types ──
	quant_plan = []
	total_quant = 0
	total_attn = 0
	total_keep = 0
	total_embd = 0
	for ti in tensor_infos:
	if quantize_none:
	will_quant = False
	elif (tied_embeddings and ti['name'] == 'token_embd.weight'
	and not keep_embd and ti['n_elements'] % 32 == 0):
	# Tied embedding doubles as the LM head. NOTE: the old
	# 'promote to Q4_0' branch below should_quantize() was dead
	# code (should_quantize always returned False for
	# token_embd), so the tensor was silently kept at F16/BF16.
	# Now: Q8_0 (8.5 bpw, ~2x smaller than F16) via the HPC
	# Shor pipeline — transparent for both embedding lookup
	# and logit projection.
	will_quant = 'EMBD_Q8'
	total_embd += 1
	elif should_quantize(ti['name'], ti['n_dims'], ti['dims'], tied_embeddings):
	if q2all:
	# --q2all: ALL eligible tensors → Q2_K, no exceptions
	# (tied embedding stays on the Q8_0 route above).
	will_quant = True
	total_quant += 1
	elif is_attention_tensor(ti['name']):
	will_quant = 'ATTN_Q4' # Promote attention to Q4_0 HPC
	total_attn += 1
	else:
	will_quant = True
	total_quant += 1
	else:
	will_quant = False
	total_keep += 1
	quant_plan.append(will_quant)

	if q2all:
	print(f" Mode: --q2all — all eligible tensors forced to Q2_K")
	print(f" Tensors to quantize (Q2_K): {total_quant}")
	print(f" Tensors to keep as-is: {total_keep}")
	else:
	print(f" Tensors to quantize (Q2_K): {total_quant}")
	print(f" Tensors to promote (Q4_0·HPC): {total_attn}")
	print(f" Tied embd → Q8_0 (Shor·HPC): {total_embd}")
	print(f" Tensors to keep as-is: {total_keep}")
	print()

	# ── Compute output tensor sizes and offsets ──
	out_tensor_infos = []
	out_data_offset = 0

	for i, ti in enumerate(tensor_infos):
	if quant_plan[i]:
	out_dims = list(ti['dims'])
	dim0 = out_dims[0] if ti['n_dims'] >= 2 else ti['n_elements']

	if quant_plan[i] == 'EMBD_Q8':
	# Tied embedding / LM head → Q8_0 (8.5 bpw, 34 B / 32 w)
	out_type = GGML_TYPE_Q8_0
	n_blocks = ti['n_elements'] // 32
	out_size = n_blocks * 34
	print(f" [EMBD→Q8_0·Shor] {ti['name']} ({ti['n_elements']:,} elements)")
	elif quant_plan[i] == 'ATTN_Q4':
	# Attention tensor → Q4_0 HPC (4.5 bpw)
	out_type = GGML_TYPE_Q4_0
	n_blocks = (ti['n_elements'] + 31) // 32
	out_size = n_blocks * 18
	print(f" [ATTN→Q4_0·HPC] {ti['name']} ({ti['n_elements']} elements)")
	elif dim0 % QK_K == 0 or q2all:
	# Q2_K (2.6 bpw, block_size=256)
	# --q2all forces Q2_K even when dim0 isn't a clean multiple;
	# the quantizer pads internally to the next QK_K boundary.
	out_type = GGML_TYPE_Q2_K
	n_blocks = (ti['n_elements'] + QK_K - 1) // QK_K
	out_size = n_blocks * 84
	if q2all and dim0 % QK_K != 0:
	print(f" [Q2_K·PADDED] {ti['name']} (dim0={dim0}, padded to QK_K boundary)")
	elif dim0 % 32 == 0:
	# Q4_0 fallback (4.5 bpw, block_size=32)
	out_type = GGML_TYPE_Q4_0
	n_blocks = ti['n_elements'] // 32
	out_size = n_blocks * 18
	quant_plan[i] = 'Q4_0'
	print(f" Q4_0: {ti['name']} (dims[0]={dim0})")
	else:
	out_type = ti['type']
	out_size = ti['data_size']
	quant_plan[i] = False
	print(f" Keep: {ti['name']} (dims[0]={dim0})")
	else:
	out_type = ti['type']
	out_size = ti['data_size']
	out_dims = list(ti['dims'])

	out_tensor_infos.append({
	'name': ti['name'],
	'n_dims': ti['n_dims'],
	'dims': out_dims,
	'type': out_type,
	'offset': out_data_offset,
	'data_size': out_size,
	})
	out_data_offset += out_size
	out_data_offset = align_offset(out_data_offset)

	# ── Update KV pairs ──
	updated_kv = []
	if keep_metadata:
	print(" --keep-metadata: passing through ALL KV pairs unchanged")
	updated_kv = list(kv_pairs)
	else:
	for key, vtype, raw_value in kv_pairs:
	if key == 'general.file_type' and vtype == 4: # UINT32
	# file_type=10 means Q2_K in llama.cpp
	updated_kv.append((key, vtype, struct.pack('<I', 10)))
	elif key == 'general.quantization_version' and vtype == 4:
	updated_kv.append((key, vtype, struct.pack('<I', 2)))
	elif key == 'tokenizer.ggml.token_type' and vtype == 9:
	# ── Fix Gemma 4 token types ──
	# convert_hf_to_gguf.py incorrectly marks control tokens as
	# NORMAL (1), causing llama.cpp to sample them (e.g. <unused24>
	# spam). Fix: read the tokens array to find control-looking
	# tokens, then patch their types to CONTROL (3).
	# See: https://github.com/ggml-org/llama.cpp/issues/21321
	tokens_kv = next((v for k, vt, v in kv_pairs
	if k == 'tokenizer.ggml.tokens' and vt == 9), None)
	token_names = []
	if tokens_kv:
	bio = io.BytesIO(tokens_kv)
	arr_type = struct.unpack('<I', bio.read(4))[0]
	arr_len = struct.unpack('<Q', bio.read(8))[0]
	for _ in range(arr_len):
	slen = struct.unpack('<Q', bio.read(8))[0]
	token_names.append(bio.read(slen).decode('utf-8', errors='replace'))

	# Parse the token_type array
	bio2 = io.BytesIO(raw_value)
	arr_type2 = struct.unpack('<I', bio2.read(4))[0]
	arr_len2 = struct.unpack('<Q', bio2.read(8))[0]
	ttypes = list(struct.unpack(f'<{arr_len2}i', bio2.read(arr_len2 * 4)))

	# Patch control-looking tokens
	n_fixed = 0
	CONTROL_TYPE = 3
	import re
	for i, tname in enumerate(token_names):
	if ttypes[i] == CONTROL_TYPE:
	continue # already correct
	if ttypes[i] == 6:
	continue # BYTE type — leave as-is
	# Only fix tokens that are genuine control/special tokens:
	# - <eos>, <bos>, <unk>, <mask>, </s> — sentence markers
	# - <|turn>, <turn|>, <|tool_*|> etc — delimiters
	# NOTE: do NOT mark <unused*> as CONTROL — Gemma 4 uses
	# these tokens internally for thinking/channel markers
	# (e.g. <unused24> = <|channel>). The llama.cpp parser
	# handles them via the peg-gemma4 format instead.
	is_control = False
	if tname in ('<eos>', '<bos>', '<unk>', '<mask>', '</s>',
	'<pad>', '<s>'):
	is_control = True
	elif re.match(r'^<\\|.\\|?>$', tname) or re.match(r'^<.\\|>$', tname):
	is_control = True
	if is_control and ttypes[i] != CONTROL_TYPE:
	ttypes[i] = CONTROL_TYPE
	n_fixed += 1

	print(f" Fixed {n_fixed} token types to CONTROL (Gemma 4 <unused> fix)")

	# Rebuild the raw value
	new_raw = struct.pack('<I', arr_type2)
	new_raw += struct.pack('<Q', arr_len2)
	new_raw += struct.pack(f'<{arr_len2}i', *ttypes)
	updated_kv.append((key, vtype, new_raw))
	elif key == 'tokenizer.chat_template' and vtype == 8:
	# ── Replace chat template with fixed Gemma 4 template ──
	# The HF-exported template doesn't handle thinking mode, causing
	# the model to emit <unused24> tokens. The fixed template from
	# llama.cpp PR #21418 pre-fills an empty thought block when
	# thinking is disabled: <|channel>thought\n<channel|>
	# See: https://github.com/ggml-org/llama.cpp/pull/21418
	script_dir = os.path.dirname(os.path.abspath(__file__))
	workspace_dir = os.path.dirname(script_dir)
	template_path = os.path.join(workspace_dir, 'llama-cpp-latest',
	'models', 'templates', 'google-gemma-4-31B-it.jinja')
	if os.path.exists(template_path):
	with open(template_path, 'r') as tf:
	new_template = tf.read()
	new_raw = struct.pack('<Q', len(new_template.encode('utf-8')))
	new_raw += new_template.encode('utf-8')
	updated_kv.append((key, vtype, new_raw))
	print(f" Replaced chat template with fixed Gemma 4 template ({len(new_template)} chars)")
	else:
	print(f" WARNING: Fixed template not found at {template_path}, keeping original")
	updated_kv.append((key, vtype, raw_value))
	else:
	updated_kv.append((key, vtype, raw_value))

	# ── Write output GGUF ──
	print(" Writing output GGUF...")
	with open(output_path, 'wb') as fout:
	# Header
	fout.write(struct.pack('<I', GGUF_MAGIC))
	fout.write(struct.pack('<I', GGUF_VERSION))
	fout.write(struct.pack('<Q', n_tensors))
	fout.write(struct.pack('<Q', n_kv))

	# KV pairs (passthrough)
	for key, vtype, raw_value in updated_kv:
	write_string(fout, key)
	fout.write(struct.pack('<I', vtype))
	fout.write(raw_value)

	# Tensor info
	for oti in out_tensor_infos:
	write_string(fout, oti['name'])
	fout.write(struct.pack('<I', oti['n_dims']))
	for d in oti['dims']:
	fout.write(struct.pack('<Q', d))
	fout.write(struct.pack('<I', oti['type']))
	fout.write(struct.pack('<Q', oti['offset']))

	# Alignment padding before data
	pos = fout.tell()
	aligned = align_offset(pos)
	if aligned > pos:
	fout.write(b'\x00' * (aligned - pos))

	# ── Write tensor data ──
	quant_count = 0
	total_quant_bytes = 0
	total_keep_bytes = 0
	total_rmse = 0.0
	q2k_rmse_sum = 0.0
	q2k_tensor_count = 0

	for i, ti in enumerate(tensor_infos):
	# Progress bar
	pct = (i + 1) / n_tensors * 100
	bar_width = 40
	filled = int(bar_width * (i + 1) / n_tensors)
	bar = '█' * filled + '░' * (bar_width - filled)
	elapsed = time.time() - start_time
	eta = elapsed / max(i + 1, 1) * (n_tensors - i - 1)
	sys.stdout.write(f"\r [{bar}] {pct:5.1f}% ({i+1}/{n_tensors}) {elapsed:.0f}s ETA:{eta:.0f}s {ti['name'][:50]}")
	sys.stdout.flush()

	# Read source tensor data
	abs_offset = data_section_start + ti['offset']
	fin.seek(abs_offset)
	raw_data = fin.read(ti['data_size'])

	if quant_plan[i] == 'EMBD_Q8':
	# ── Tied embedding → Q8_0 via the HPC Shor pipeline ──
	if ti['type'] == GGML_TYPE_BF16:
	f32 = bf16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F16:
	f32 = f16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F32:
	f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
	else:
	# Can't re-quantize from quantized source — keep
	fout.write(raw_data)
	pad = align_offset(fout.tell()) - fout.tell()
	if pad > 0: fout.write(b'\x00' * pad)
	continue

	n_el = ti['n_elements']
	n_blocks_q8 = n_el // 32

	if use_hpc and hasattr(_HEXSTATE_LIB, 'hexstate_quantize_tensor_q8_0_hpc'):
	output_buf = np.zeros(n_blocks_q8 * 34, dtype=np.uint8)
	error = ctypes.c_float(0.0)
	f32_c = np.ascontiguousarray(f32, dtype=np.float32)

	imat_ptr = None
	if imatrix_data and ti['name'] in imatrix_data:
	iw = imatrix_data[ti['name']]
	n_cols = iw.shape[0]
	n_rows = n_el // n_cols if n_cols > 0 else 1
	imat_full = np.tile(iw, n_rows)[:n_el].astype(np.float32)
	imat_c = np.ascontiguousarray(imat_full)
	imat_ptr = imat_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

	_HEXSTATE_LIB.hexstate_quantize_tensor_q8_0_hpc(
	f32_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
	ctypes.c_int64(n_el),
	output_buf.ctypes.data_as(ctypes.c_void_p),
	ctypes.byref(error),
	imat_ptr,
	ctypes.c_int(0),
	)
	fout.write(output_buf.tobytes())
	rmse8 = float(np.sqrt(error.value / max(n_el, 1)))
	print(f"\n [Q8_0·Shor] {ti['name']} RMSE={rmse8:.6e}")
	else:
	q8_bytes, n_blocks_q8, sse8 = quantize_tensor_q8_0(f32)
	fout.write(q8_bytes)
	rmse8 = float(np.sqrt(sse8 / max(n_el, 1)))
	print(f"\n [Q8_0] {ti['name']} RMSE={rmse8:.6e} (numpy fallback)")

	quant_count += 1
	total_quant_bytes += n_blocks_q8 * 34

	elif quant_plan[i] in ('Q4_0', 'ATTN_Q4'):
	# ── Q4_0 quantization (fallback or attention HPC) ──
	if ti['type'] == GGML_TYPE_BF16:
	f32 = bf16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F16:
	f32 = f16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F32:
	f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
	else:
	fout.write(raw_data)
	pad = align_offset(fout.tell()) - fout.tell()
	if pad > 0: fout.write(b'\x00' * pad)
	continue

	# Pad to 32-element boundary
	n_el = len(f32)
	pad_to = ((n_el + 31) // 32) * 32
	if pad_to > n_el:
	f32 = np.concatenate([f32, np.zeros(pad_to - n_el, dtype=np.float32)])
	n_el = pad_to

	n_blocks_q4 = n_el // 32

	# Use HPC for attention tensors if available
	if quant_plan[i] == 'ATTN_Q4' and use_hpc and hasattr(_HEXSTATE_LIB, 'hexstate_quantize_tensor_q4_0_hpc'):
	output_buf = np.zeros(n_blocks_q4 * 18, dtype=np.uint8)
	error = ctypes.c_float(0.0)
	f32_c = np.ascontiguousarray(f32, dtype=np.float32)

	# Look up imatrix importance
	imat_ptr = None
	if imatrix_data and ti['name'] in imatrix_data:
	iw = imatrix_data[ti['name']]
	n_cols = iw.shape[0]
	n_rows = n_el // n_cols if n_cols > 0 else 1
	imat_full = np.tile(iw, n_rows)[:n_el].astype(np.float32)
	imat_c = np.ascontiguousarray(imat_full)
	imat_ptr = imat_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float))

	_HEXSTATE_LIB.hexstate_quantize_tensor_q4_0_hpc(
	f32_c.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
	ctypes.c_int64(n_el),
	output_buf.ctypes.data_as(ctypes.c_void_p),
	ctypes.byref(error),
	imat_ptr,
	ctypes.c_int(1), # verbose
	)
	fout.write(output_buf.tobytes())
	print(f"\n [Q4_0·HPC] {ti['name']} RMSE={np.sqrt(error.value / ti['n_elements']):.6e}")
	else:
	# Vectorized Q4_0: process all blocks at once
	blocks = f32.reshape(-1, 32)
	amax = np.max(np.abs(blocks), axis=1)
	d = amax / 7.0
	d[d == 0] = 1.0 # avoid div by zero
	qs = np.clip(np.round(blocks / d[:, None]) + 8, 0, 15).astype(np.uint8)
	d_orig = amax / 7.0 # restore zeros
	d_fp16 = d_orig.astype(np.float16)

	out_buf = bytearray(n_blocks_q4 * 18)
	for b in range(n_blocks_q4):
	off = b * 18
	struct.pack_into('<e', out_buf, off, float(d_fp16[b]))
	for j in range(16):
	out_buf[off + 2 + j] = int(qs[b, j]) \| (int(qs[b, j + 16]) << 4)
	fout.write(bytes(out_buf))

	quant_count += 1
	total_quant_bytes += n_blocks_q4 * 18

	elif quant_plan[i]:
	# Convert to F32 for quantization
	if ti['type'] == GGML_TYPE_BF16:
	f32 = bf16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F16:
	f32 = f16_to_f32(raw_data, ti['n_elements'])
	elif ti['type'] == GGML_TYPE_F32:
	f32 = np.frombuffer(raw_data, dtype=np.float32).copy()
	else:
	# Can't re-quantize from quantized format — keep as-is
	fout.write(raw_data)
	pad = align_offset(fout.tell()) - fout.tell()
	if pad > 0:
	fout.write(b'\x00' * pad)
	continue

	# Quantize to Q2_K — always use HPC with chunked processing
	# Each chunk gets full HPC treatment (no size threshold)
	HPC_CHUNK = 50_000_000 # 50M elements per HPC chunk
	HPC_CHUNK = (HPC_CHUNK // QK_K) * QK_K # align to QK_K

	# Look up imatrix importance for this tensor
	imat_full = None
	if imatrix_data and ti['name'] in imatrix_data:
	iw = imatrix_data[ti['name']]
	n_cols = iw.shape[0]
	n_rows = ti['n_elements'] // n_cols if n_cols > 0 else 1
	imat_full = np.tile(iw, n_rows)[:ti['n_elements']]

	n_el = ti['n_elements']
	if use_hpc and n_el <= HPC_CHUNK:
	# Small tensor — single HPC pass
	q2k_data, n_blocks = quantize_tensor_q2k_hpc(f32, opt_mode=2, importance=imat_full)
	elif use_hpc:
	# Large tensor — chunked HPC (each chunk gets BP)
	chunks = []
	processed = 0
	while processed < n_el:
	end = min(processed + HPC_CHUNK, n_el)
	chunk_f32 = f32[processed:end]
	if len(chunk_f32) % QK_K != 0:
	pad_len = QK_K - (len(chunk_f32) % QK_K)
	chunk_f32 = np.concatenate([chunk_f32, np.zeros(pad_len, dtype=np.float32)])
	chunk_imp = imat_full[processed:end] if imat_full is not None else None
	if chunk_imp is not None and len(chunk_imp) < len(chunk_f32):
	chunk_imp = np.concatenate([chunk_imp, np.ones(len(chunk_f32) - len(chunk_imp), dtype=np.float32)])
	chunk_data, _ = quantize_tensor_q2k_hpc(chunk_f32, opt_mode=2, importance=chunk_imp)
	actual_blocks = (end - processed + QK_K - 1) // QK_K
	chunks.append(chunk_data[:actual_blocks * 84])
	processed = end
	pct = 100.0 * processed / n_el
	print(f"\r → {processed/1e6:.0f}M/{n_el/1e6:.0f}M ({pct:.0f}%)", end='', flush=True)
	print()
	q2k_data = b''.join(chunks)
	n_blocks = n_el // QK_K
	else:
	# No HPC available — python fallback
	CHUNK_SIZE = 10_000_000
	CHUNK_SIZE = (CHUNK_SIZE // QK_K) * QK_K
	chunks = []
	processed = 0
	while processed < n_el:
	end = min(processed + CHUNK_SIZE, n_el)
	chunk_data, _ = quantize_tensor_q2k(f32[processed:end])
	chunks.append(chunk_data)
	processed = end
	pct = 100.0 * processed / n_el
	print(f"\r → {processed/1e6:.0f}M/{n_el/1e6:.0f}M ({pct:.0f}%)", end='', flush=True)
	print()
	q2k_data = b''.join(chunks)
	n_blocks = n_el // QK_K
	fout.write(q2k_data)

	# ── Compute and report exact per-tensor RMSE ──
	try:
	CHUNK_BLK = 100_000 # blocks per chunk to bound memory
	total_se = 0.0
	total_n = 0
	for ci in range(0, n_blocks, CHUNK_BLK):
	ce = min(ci + CHUNK_BLK, n_blocks)
	chunk_q = q2k_data[ci84:ce84]
	deq_chunk = dequant_q2k_fast(chunk_q, ce - ci)
	orig_chunk = f32[ciQK_K:ceQK_K]
	n_valid = min(len(orig_chunk), len(deq_chunk))
	diff = orig_chunk[:n_valid] - deq_chunk[:n_valid]
	total_se += np.sum(diff ** 2)
	total_n += n_valid
	tensor_rmse = np.sqrt(total_se / max(total_n, 1))
	q2k_rmse_sum += tensor_rmse
	q2k_tensor_count += 1
	print(f"\n [Q2_K] {ti['name'][:55]} RMSE={tensor_rmse:.6e}")
	except Exception as e:
	print(f"\n [Q2_K] {ti['name'][:55]} RMSE=err({e})")

	quant_count += 1
	total_quant_bytes += len(q2k_data)
	else:
	# Keep as-is (passthrough)
	fout.write(raw_data)
	total_keep_bytes += len(raw_data)

	# Alignment padding
	pad = align_offset(fout.tell()) - fout.tell()
	if pad > 0:
	fout.write(b'\x00' * pad)

	final_size = fout.tell()

	elapsed = time.time() - start_time
	print(f"\r {'█' * 40} 100.0% ({n_tensors}/{n_tensors}) {elapsed:.0f}s" + " " * 60)
	print()

	# ── Summary ──
	original_bytes = sum(ti['data_size'] for ti in tensor_infos)
	compression = original_bytes / max(final_size, 1)

	print(" ╔════════════════════════════════════════════════════════════════╗")
	print(" ║ RE-QUANTIZATION SUMMARY ║")
	print(" ╠════════════════════════════════════════════════════════════════╣")
	print(f" ║ Tensors quantized (Q2_K): {quant_count:<33d} ║")
	print(f" ║ Tensors kept as-is: {total_keep:<33d} ║")
	print(f" ║ Q2_K data: {total_quant_bytes:>12,} bytes ({total_quant_bytes/1024**2:>7.1f} MB) ║")
	print(f" ║ Kept data: {total_keep_bytes:>12,} bytes ({total_keep_bytes/1024**2:>7.1f} MB) ║")
	print(f" ║ Original size: {file_size:>12,} bytes ({file_size/1024**3:>7.2f} GB) ║")
	print(f" ║ Output size: {final_size:>12,} bytes ({final_size/1024**3:>7.2f} GB) ║")
	print(f" ║ Compression: {compression:>42.1f}x ║")
	if q2k_tensor_count > 0:
	mean_rmse = q2k_rmse_sum / q2k_tensor_count
	print(f" ║ Mean Q2_K RMSE: {mean_rmse:>12.6e} ║")
	print(f" ║ Total time: {elapsed:>39.1f} sec ║")
	print(" ╚════════════════════════════════════════════════════════════════╝")
	print()
	print(f" Output: {output_path}")
	print()


	# Script entry point: run the re-quantizer only when this file is executed
	# directly, so importing the module has no side effects.
	if __name__ == "__main__":
		main()