CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

CompressedGemma committed on May 7

Commit

dc3b370

·

verified ·

1 Parent(s): 5c1c396

Tensor tweak

Files changed (1) hide show

hexstate_requantize.py +9 -1

hexstate_requantize.py CHANGED Viewed

@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 """
-HexState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
 and re-quantizes eligible weight tensors to Q2_K using numpy.
@@ -636,6 +636,14 @@ def should_quantize(name, n_dims, dims, tied_embeddings=False):
         return False
     if 'layer_output_scale' in name:
         return False
     # DeltaNet state-space parameters — keep at full precision
     if 'ssm_a' in name or 'A_log' in name:
         return False

 #!/usr/bin/env python3
 """
+HExState GGUF Re-Quantizer — GGUF-to-GGUF Q2_K quantization.
 Reads a source GGUF (F16/BF16/F32), copies all metadata verbatim,
 and re-quantizes eligible weight tensors to Q2_K using numpy.
         return False
     if 'layer_output_scale' in name:
         return False
+    # Embedding table — this is a lookup, not a matmul; Q2_K destroys
+    # token distinctions. Keep at source precision (F16/BF16).
+    if 'token_embd' in name:
+        return False
+    # LM head output projection — logit precision is critical for generation.
+    # (When tied with embeddings, this is the same tensor and also skipped above.)
+    if name == 'output.weight':
+        return False
     # DeltaNet state-space parameters — keep at full precision
     if 'ssm_a' in name or 'A_log' in name:
         return False