CompressedGemma
/

HPC-Quantize

Model card Files Files and versions

xet

Community

CompressedGemma committed on May 21

Commit

00ba2db

verified ·

1 Parent(s): 7803d72

Delete generate_imatrix.py

Browse files

Files changed (1) hide show

generate_imatrix.py +0 -1733

generate_imatrix.py DELETED Viewed

@@ -1,1733 +0,0 @@
-#!/usr/bin/env python3
-"""
-HExState Importance Matrix Generator — HPC-Enhanced iMatrix from GGUF
-Runs transformer forward passes over calibration text to collect per-channel
-E[x²] activation statistics, then uses HPC triality BP to propagate importance
-across layers. Outputs llama.cpp-compatible .dat imatrix files.
-Usage:
-    python3 generate_imatrix.py model.gguf calibration.txt -o imatrix.dat
-"""
-import struct
-import sys
-import os
-import time
-import mmap
-import ctypes
-import numpy as np
-from collections import OrderedDict
-# ─── Constants ──────────────────────────────────────────────────────────────
-GGUF_MAGIC = 0x46554747  # the bytes b"GGUF" read as a little-endian uint32
-ALIGNMENT = 32  # the tensor data section starts on a multiple of this many bytes
-QK_K = 256  # elements per K-quant super-block
-QK4_0 = 32  # elements per Q4_0 block
-QK8_0 = 32  # elements per Q8_0 block
-# ggml tensor type ids that dequantize() knows how to decode
-GGML_TYPE_F32   = 0
-GGML_TYPE_F16   = 1
-GGML_TYPE_Q4_0  = 2
-GGML_TYPE_Q8_0  = 8
-GGML_TYPE_Q2_K  = 10
-GGML_TYPE_BF16  = 30
-# Elements per block, keyed by ggml type id
-TYPE_BLOCK_SIZE = {
-    0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
-    8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
-    13: 256, 14: 256, 15: 256, 30: 1,
-}
-# Bytes per block, keyed by ggml type id (same keys as TYPE_BLOCK_SIZE).
-# Q5_0 (6) is fp16 d + 4-byte qh + 16-byte qs = 22 bytes;
-# Q5_1 (7) carries an extra fp16 m = 24 bytes.
-TYPE_BLOCK_BYTES = {
-    0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
-    8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
-    13: 176, 14: 210, 15: 292, 30: 2,
-}
-# Display names for the types that can be dequantized
-TYPE_NAME = {
-    0: "F32", 1: "F16", 2: "Q4_0", 8: "Q8_0", 10: "Q2_K", 30: "BF16",
-}
-# ─── GGUF Reader ────────────────────────────────────────────────────────────
-def align_offset(offset):
-    """Round ``offset`` up to the next multiple of ALIGNMENT."""
-    remainder = offset % ALIGNMENT
-    if remainder == 0:
-        return offset
-    return offset + (ALIGNMENT - remainder)
-def read_string(f):
-    """Read one length-prefixed string from ``f``.
-    The prefix is a little-endian uint64 byte count; the payload is decoded
-    as UTF-8 with invalid sequences replaced.
-    """
-    (byte_len,) = struct.unpack('<Q', f.read(8))
-    payload = f.read(byte_len)
-    return payload.decode('utf-8', errors='replace')
-# struct format of every fixed-width numeric KV type id
-_KV_SCALAR_FORMATS = {
-    0: '<B', 1: '<b', 2: '<H', 3: '<h', 4: '<I', 5: '<i',
-    6: '<f', 10: '<Q', 11: '<q', 12: '<d',
-}
-def read_kv_value(f, vtype):
-    """Read one metadata value of type id ``vtype`` from ``f`` and return it.
-    Numeric ids decode to int/float, 7 to bool, 8 to str, and 9 to a list
-    whose element type id and uint64 length are read from the stream first.
-    Raises ValueError for any other type id.
-    """
-    fmt = _KV_SCALAR_FORMATS.get(vtype)
-    if fmt is not None:
-        return struct.unpack(fmt, f.read(struct.calcsize(fmt)))[0]
-    if vtype == 7:
-        return bool(struct.unpack('<B', f.read(1))[0])
-    if vtype == 8:
-        return read_string(f)
-    if vtype == 9:
-        (elem_type,) = struct.unpack('<I', f.read(4))
-        (count,) = struct.unpack('<Q', f.read(8))
-        return [read_kv_value(f, elem_type) for _ in range(count)]
-    raise ValueError(f"Unknown KV type {vtype}")
-class GGUFModel:
-    """Loads a GGUF model with mmap'd tensor access.
-    The header (KV metadata and tensor directory) is parsed on construction;
-    tensor payloads are sliced out of the memory map on demand. Instances can
-    be used as context managers so the file and map are always released.
-    """
-    def __init__(self, path):
-        self.path = path
-        self.file_size = os.path.getsize(path)
-        self.kv = {}
-        self.tensor_infos = OrderedDict()
-        self.data_offset = 0
-        self._mm = None
-        self._f = open(path, 'rb')
-        try:
-            self._mm = mmap.mmap(self._f.fileno(), 0, access=mmap.ACCESS_READ)
-            self._parse_header()
-        except BaseException:
-            # Empty, truncated or non-GGUF file: release the handle and map
-            # before propagating instead of leaking them.
-            self.close()
-            raise
-    def __enter__(self):
-        return self
-    def __exit__(self, exc_type, exc, tb):
-        self.close()
-        return False
-    def _parse_header(self):
-        """Fill ``kv``, ``tensor_infos`` and ``data_offset`` from the file header."""
-        f = self._f
-        f.seek(0)
-        magic = struct.unpack('<I', f.read(4))[0]
-        if magic != GGUF_MAGIC:
-            # Raised explicitly so the check survives ``python -O``; the
-            # exception type is the one the former assert produced.
-            raise AssertionError(f"Bad GGUF magic: 0x{magic:08X}")
-        version = struct.unpack('<I', f.read(4))[0]  # read to advance; not used further
-        n_tensors = struct.unpack('<Q', f.read(8))[0]
-        n_kv = struct.unpack('<Q', f.read(8))[0]
-        # Read KV pairs
-        for _ in range(n_kv):
-            key = read_string(f)
-            vtype = struct.unpack('<I', f.read(4))[0]
-            value = read_kv_value(f, vtype)
-            self.kv[key] = value
-        # Read tensor info
-        for _ in range(n_tensors):
-            name = read_string(f)
-            n_dims = struct.unpack('<I', f.read(4))[0]
-            dims = [struct.unpack('<Q', f.read(8))[0] for _ in range(n_dims)]
-            ttype = struct.unpack('<I', f.read(4))[0]
-            offset = struct.unpack('<Q', f.read(8))[0]
-            n_elements = 1
-            for d in dims:
-                n_elements *= d
-            # Unknown type ids are sized as plain 4-byte scalars
-            blk_sz = TYPE_BLOCK_SIZE.get(ttype, 1)
-            blk_bytes = TYPE_BLOCK_BYTES.get(ttype, 4)
-            n_blocks = (n_elements + blk_sz - 1) // blk_sz
-            data_size = n_blocks * blk_bytes
-            self.tensor_infos[name] = {
-                'dims': dims, 'n_dims': n_dims, 'type': ttype,
-                'offset': offset, 'n_elements': n_elements,
-                'data_size': data_size,
-            }
-        # Tensor offsets are relative to the aligned end of the header
-        self.data_offset = align_offset(f.tell())
-    def get_arch(self):
-        """Return ``general.architecture``, guessing from tensor names when it is absent."""
-        arch = self.kv.get('general.architecture')
-        if not arch:
-            # Try to infer from tensor names
-            if any('attn_gate' in n for n in self.tensor_infos):
-                return 'gemma2'
-            return 'llama'
-        return arch
-    def get_config(self):
-        """Return the hyper-parameters read from ``<arch>.*`` keys as a dict.
-        Keys missing from the metadata come back as 0 (or the defaults shown
-        below for rms_eps and rope_base).
-        """
-        arch = self.get_arch()
-        n_embd = self.kv.get(f'{arch}.embedding_length', 0)
-        n_head = self.kv.get(f'{arch}.attention.head_count', 0)
-        n_head_kv = self.kv.get(f'{arch}.attention.head_count_kv', 0)
-        # Auto-detect head_dim: prefer derived from attn_gate over n_embd/n_head
-        # (Qwen 3.6 has hybrid 10240 QKV output but attn_gate requires 6144.
-        #  6144 / 24 heads = 256 real head_dim).
-        head_dim = 0
-        gate_name = 'blk.0.attn_gate.weight'
-        if gate_name in self.tensor_infos:
-            # attn_gate is [n_embd, n_head * head_dim]
-            gate_cols = self.tensor_infos[gate_name]['dims'][1]  # input dim
-            if n_head > 0:
-                head_dim = gate_cols // n_head
-        if head_dim == 0 and n_head > 0:
-            head_dim = n_embd // n_head
-        return {
-            'arch': arch,
-            'n_layers': self.kv.get(f'{arch}.block_count', 0),
-            'n_embd': n_embd,
-            'n_head': n_head,
-            'n_head_kv': n_head_kv,
-            'n_ff': self.kv.get(f'{arch}.feed_forward_length', 0),
-            'vocab_size': self.kv.get(f'{arch}.vocab_size', 0),
-            'rms_eps': self.kv.get(f'{arch}.attention.layer_norm_rms_epsilon', 1e-6),
-            'rope_base': self.kv.get(f'{arch}.rope.freq_base', 10000.0),
-            'swa_window': self.kv.get(f'{arch}.attention.sliding_window', 0),
-            'head_dim': head_dim,
-            'expert_count': self.kv.get(f'{arch}.expert_count', 0),
-            'expert_used_count': self.kv.get(f'{arch}.expert_used_count', 0),
-        }
-    def get_tensor_f32(self, name):
-        """Load a tensor as a flat float32 array, dequantizing if needed.
-        Returns None when the tensor does not exist or its type cannot be
-        dequantized (the error is printed).
-        """
-        if name not in self.tensor_infos:
-            return None
-        ti = self.tensor_infos[name]
-        abs_offset = self.data_offset + ti['offset']
-        raw = bytes(self._mm[abs_offset:abs_offset + ti['data_size']])
-        try:
-            return dequantize(raw, ti['type'], ti['n_elements'])
-        except ValueError as e:
-            print(f"  Error dequantizing {name}: {e}")
-            return None
-    def get_tensor_shape(self, name):
-        """Return the shape of a tensor (GGUF stores reversed dims), or None if unknown."""
-        if name not in self.tensor_infos:
-            return None
-        dims = self.tensor_infos[name]['dims']
-        # GGUF stores dims in reverse order (row-major): dims[0]=cols, dims[1]=rows
-        return tuple(reversed(dims))
-    def close(self):
-        """Release the memory map and file handle; safe to call more than once."""
-        if self._mm is not None:
-            self._mm.close()
-        self._f.close()
-# ─── Dequantization ─────────────────────────────────────────────────────────
-def dequantize(raw, ttype, n_elements):
-    """Convert the raw bytes of one tensor to a flat float32 numpy array.
-    F32, F16 and BF16 are converted directly; Q8_0, Q4_0 and Q2_K are handed
-    to their block decoders. Any other type id raises ValueError.
-    """
-    if ttype == GGML_TYPE_F32:
-        return np.frombuffer(raw, dtype=np.float32).copy()
-    if ttype == GGML_TYPE_F16:
-        half = np.frombuffer(raw, dtype=np.float16)
-        return half.astype(np.float32)
-    if ttype == GGML_TYPE_BF16:
-        # bf16 is the upper half of a float32 bit pattern
-        upper = np.frombuffer(raw, dtype=np.uint16).astype(np.uint32)
-        return (upper << 16).view(np.float32).copy()
-    block_decoders = {
-        GGML_TYPE_Q8_0: dequant_q8_0,
-        GGML_TYPE_Q4_0: dequant_q4_0,
-        GGML_TYPE_Q2_K: dequant_q2k,
-    }
-    decoder = block_decoders.get(ttype)
-    if decoder is None:
-        raise ValueError(f"Unsupported quant type {ttype} ({TYPE_NAME.get(ttype, '?')})")
-    return decoder(raw, n_elements)
-def dequant_q8_0(raw, n_elements):
-    """Decode Q8_0 blocks (2-byte fp16 scale + 32 int8 quants each) to float32."""
-    block_count = n_elements // QK8_0
-    blocks = np.frombuffer(raw, dtype=np.uint8).reshape(block_count, 34)
-    scale_bytes = blocks[:, 0:2]
-    quant_bytes = blocks[:, 2:34]
-    scales = scale_bytes.view(np.float16).astype(np.float32).reshape(block_count, 1)
-    quants = quant_bytes.view(np.int8).astype(np.float32)
-    values = scales * quants
-    return values.reshape(-1)[:n_elements]
-def dequant_q4_0(raw, n_elements):
-    """Decode Q4_0 blocks (2-byte fp16 scale + 16 bytes of packed nibbles) to float32.
-    Each 4-bit quant is stored with an offset of 8, so the value is
-    ``d * (nibble - 8)``.
-    """
-    n_blocks = n_elements // QK4_0
-    data = np.frombuffer(raw, dtype=np.uint8).reshape(n_blocks, 18)
-    d = data[:, 0:2].view(np.float16).astype(np.float32).reshape(n_blocks, 1)
-    qs = data[:, 2:18]  # 16 bytes = 32 nibbles
-    lo = (qs & 0xF).astype(np.float32) - 8.0
-    hi = (qs >> 4).astype(np.float32) - 8.0
-    # ggml layout is NOT interleaved: byte j holds element j in its low nibble
-    # and element j + 16 in its high nibble, so the block is [lo0..lo15, hi0..hi15].
-    x = np.concatenate([lo, hi], axis=1)
-    return (d * x).reshape(-1)[:n_elements]
-def dequant_q2k(raw, n_elements):
-    """Decode Q2_K super-blocks (84 bytes per 256 elements) to float32.
-    Block layout: 16 packed scale bytes, 64 bytes of 2-bit quants, fp16 ``d``,
-    fp16 ``dmin``. Every scale byte covers 16 consecutive output elements:
-    its low nibble scales ``d`` and its high nibble scales ``dmin``, giving
-    ``value = d * sc * q - dmin * mn``.
-    """
-    n_blocks = n_elements // QK_K
-    data = np.frombuffer(raw, dtype=np.uint8).reshape(n_blocks, 84)
-    scales_packed = data[:, 0:16]  # [n_blocks, 16]
-    qs = data[:, 16:80]  # [n_blocks, 64]
-    d = data[:, 80:82].view(np.float16).astype(np.float32).reshape(n_blocks, 1)
-    dmin = data[:, 82:84].view(np.float16).astype(np.float32).reshape(n_blocks, 1)
-    # Output element half*128 + sub*32 + k is bits (2*sub, 2*sub+1) of byte
-    # qs[half*32 + k]; broadcasting the four shifts over each 32-byte half
-    # yields the elements already in output order.
-    shifts = np.arange(0, 8, 2, dtype=np.uint8).reshape(1, 1, 4, 1)
-    quants = (qs.reshape(n_blocks, 2, 1, 32) >> shifts) & 3
-    # Regroup into 16 runs of 16 elements so that run i pairs with scale byte i
-    quants = quants.reshape(n_blocks, 16, 16).astype(np.float32)
-    d_sub = (d * (scales_packed & 0xF).astype(np.float32)).reshape(n_blocks, 16, 1)
-    m_sub = (dmin * (scales_packed >> 4).astype(np.float32)).reshape(n_blocks, 16, 1)
-    result = d_sub * quants - m_sub
-    return result.reshape(-1)[:n_elements]
-# ─── Tokenizer ──────────────────────────────────────────────────────────────
-class SimpleTokenizer:
-    """Minimal BPE tokenizer from GGUF metadata, with HPC acceleration.
-    Vocabulary and merge rules come from the model's ``tokenizer.ggml.*``
-    keys; when the GGUF carries no merges, a ``tokenizer.json`` next to the
-    model file is consulted. Characters and tokens missing from the
-    vocabulary are encoded as id 0.
-    """
-    def __init__(self, model):
-        self.model_path = model._f.name
-        self.tokens = model.kv.get('tokenizer.ggml.tokens', [])
-        self.vocab_size = len(self.tokens)
-        merges_raw = model.kv.get('tokenizer.ggml.merges', [])
-        # Override for models (like Mistral v0.3) where gguf merges are missing
-        if not merges_raw:
-            merges_raw = self._load_sidecar_merges()
-        self.bos_id = model.kv.get('tokenizer.ggml.bos_token_id', 2)
-        self.eos_id = model.kv.get('tokenizer.ggml.eos_token_id', 1)
-        # Build token → id map
-        self.token_to_id = {
-            t: i for i, t in enumerate(self.tokens) if isinstance(t, str)
-        }
-        # Build merge priority (Python fallback)
-        self.merges = {}
-        self._merge_list = []  # ordered (a_id, b_id, merged_id, rank) rows for the C bridge
-        for rank, entry in enumerate(merges_raw):
-            pair = self._split_merge(entry)
-            if pair is None:
-                continue
-            left, right = pair
-            self.merges[(left, right)] = rank
-            # Resolve token IDs for C bridge; rules naming unknown tokens are left out
-            a_id = self.token_to_id.get(left, -1)
-            b_id = self.token_to_id.get(right, -1)
-            merged_id = self.token_to_id.get(left + right, -1)
-            if a_id >= 0 and b_id >= 0 and merged_id >= 0:
-                self._merge_list.append((a_id, b_id, merged_id, rank))
-        # Try to load HPC library for accelerated BPE
-        self._hpc_lib = self._load_bpe_library()
-    def _load_sidecar_merges(self):
-        """Return the merges of a tokenizer.json beside the model, or [] if unavailable."""
-        try:
-            import json
-            model_dir = os.path.dirname(os.path.abspath(self.model_path))
-            tok_path = os.path.join(model_dir, 'tokenizer.json')
-            if not os.path.exists(tok_path):
-                return []
-            with open(tok_path, 'r', encoding='utf-8') as f:
-                tok_data = json.load(f)
-            merges = tok_data.get('model', {}).get('merges', []) or []
-        except Exception as e:
-            # Best effort only: carry on without merges, but say why.
-            print(f"  Could not load merges from local tokenizer.json: {e}")
-            return []
-        if merges:
-            print(f"  Injected {len(merges)} merges from local tokenizer.json!")
-        return merges
-    @staticmethod
-    def _split_merge(entry):
-        """Return one merge rule as a (left, right) pair of strings, or None if malformed.
-        Accepts both the ``"left right"`` string form and the two-element
-        list form written by newer tokenizer.json files.
-        """
-        if isinstance(entry, str):
-            parts = entry.split(' ', 1)
-        elif isinstance(entry, (list, tuple)):
-            parts = list(entry)
-        else:
-            return None
-        if len(parts) != 2 or not all(isinstance(p, str) for p in parts):
-            return None
-        return parts[0], parts[1]
-    def _load_bpe_library(self):
-        """Return the HPC shared library if it exports hexstate_bpe_tokenize, else None."""
-        try:
-            script_dir = os.path.dirname(os.path.abspath(__file__))
-            lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
-            if not os.path.exists(lib_path):
-                return None
-            lib = ctypes.CDLL(lib_path)
-        except Exception as e:
-            print(f"  HPC·BPE not available: {e}")
-            return None
-        if hasattr(lib, 'hexstate_bpe_tokenize'):
-            print(f"  HPC·BPE engine loaded ({len(self._merge_list)} merge rules)")
-            return lib
-        print("  HPC library found but missing hexstate_bpe_tokenize — rebuild needed")
-        return None
-    def encode(self, text):
-        """Encode text to token IDs using BPE (HPC-accelerated when available).
-        The result always starts with the BOS id; empty text yields just BOS.
-        """
-        if not text:
-            return [self.bos_id]
-        # Convert to byte-level tokens (SentencePiece style: ▁ = space)
-        text = text.replace(' ', '▁')
-        if not text.startswith('▁'):
-            text = '▁' + text
-        if self._hpc_lib and self._merge_list:
-            return self._encode_hpc(text)
-        return self._encode_python(text)
-    def _encode_hpc(self, text):
-        """HPC fast path: run the merges in the C library."""
-        t0 = time.time()
-        print(f"  HPC·BPE: tokenizing {len(text):,} chars...")
-        # Convert characters to initial token IDs
-        char_ids = np.array(
-            [self.token_to_id.get(c, 0) for c in text],
-            dtype=np.int32)
-        # Merge table as a flat C array; BPEMerge struct: 4 × int32 = 16 bytes
-        n_merges = len(self._merge_list)
-        merge_buf = np.array(self._merge_list, dtype=np.int32).reshape(-1)
-        # Output buffer: merging can only shrink the sequence
-        output_ids = np.zeros(len(char_ids), dtype=np.int32)
-        n_tokens = ctypes.c_int64(0)
-        self._hpc_lib.hexstate_bpe_tokenize(
-            char_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
-            ctypes.c_int64(len(char_ids)),
-            merge_buf.ctypes.data_as(ctypes.c_void_p),
-            ctypes.c_int32(n_merges),
-            output_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
-            ctypes.byref(n_tokens),
-            ctypes.c_int(1),  # verbose
-        )
-        elapsed = time.time() - t0
-        ids = [self.bos_id] + output_ids[:n_tokens.value].tolist()
-        print(f"  HPC·BPE: {len(text):,} chars → {n_tokens.value:,} tokens [{elapsed:.1f}s]")
-        return ids
-    def _encode_python(self, text):
-        """Python fallback: repeatedly merge every occurrence of the best-ranked pair."""
-        # Start with characters
-        tokens = list(text)
-        initial_len = len(tokens)
-        pass_num = 0
-        t0 = time.time()
-        while len(tokens) > 1:
-            best_pair = None
-            best_rank = float('inf')
-            for i in range(len(tokens) - 1):
-                pair = (tokens[i], tokens[i + 1])
-                rank = self.merges.get(pair, float('inf'))
-                if rank < best_rank:
-                    best_rank = rank
-                    best_pair = pair
-            if best_pair is None or best_rank == float('inf'):
-                break
-            # Merge ALL occurrences of this pair in one pass
-            a, b = best_pair
-            prev_len = len(tokens)
-            new_tokens = []
-            i = 0
-            while i < len(tokens):
-                if i < len(tokens) - 1 and tokens[i] == a and tokens[i + 1] == b:
-                    new_tokens.append(a + b)
-                    i += 2
-                else:
-                    new_tokens.append(tokens[i])
-                    i += 1
-            tokens = new_tokens
-            pass_num += 1
-            if pass_num % 10 == 0:
-                elapsed = time.time() - t0
-                merged = prev_len - len(tokens)
-                sys.stdout.write(
-                    f"\r  BPE pass {pass_num}: {len(tokens):,} tokens "
-                    f"(-{merged} merged, {len(tokens)/initial_len*100:.1f}%) "
-                    f"[{elapsed:.1f}s]    ")
-                sys.stdout.flush()
-        if pass_num >= 10:
-            elapsed = time.time() - t0
-            print(f"\r  Tokenized: {pass_num} passes, {initial_len:,} chars → "
-                  f"{len(tokens):,} tokens [{elapsed:.1f}s]" + " " * 30)
-        # Convert to IDs
-        return [self.bos_id] + [self.token_to_id.get(t, 0) for t in tokens]
-    def chunk_text(self, text, chunk_size=512):
-        """Encode text and split it into fixed-length chunks of ``chunk_size`` ids.
-        Text shorter than one chunk is padded with the EOS id. Tokens after
-        the last full chunk are dropped.
-        """
-        ids = self.encode(text)
-        chunks = []
-        # Step by 3/4 of a chunk, so consecutive chunks overlap by 25%.
-        # Clamp to 1 so a tiny chunk_size cannot produce a zero range step.
-        stride = max(1, chunk_size * 3 // 4)
-        for i in range(0, len(ids) - chunk_size + 1, stride):
-            chunk = ids[i:i + chunk_size]
-            chunks.append(np.array(chunk, dtype=np.int32))
-        if not chunks and ids:
-            # Pad short text
-            padded = ids + [self.eos_id] * (chunk_size - len(ids))
-            chunks.append(np.array(padded[:chunk_size], dtype=np.int32))
-        return chunks
-# ─── Transformer Forward Pass ───────────────────────────────────────────────
-def rms_norm(x, weight, eps=1e-6):
-    """Scale ``x`` by the inverse RMS of its last axis, then multiply by ``weight``."""
-    mean_square = np.mean(np.square(x), axis=-1, keepdims=True)
-    denom = np.sqrt(mean_square + eps)
-    return (x / denom) * weight
-def rope_freqs(dim, seq_len, base=10000.0):
-    """Return the (cos, sin) rotary tables, each of shape [seq_len, dim/2]."""
-    exponents = np.arange(0, dim, 2, dtype=np.float32) / dim
-    inv_freq = 1.0 / (base ** exponents)
-    positions = np.arange(seq_len, dtype=np.float32)
-    angles = np.outer(positions, inv_freq)  # [seq_len, dim/2]
-    return np.cos(angles), np.sin(angles)
-def apply_rope(x, cos_f, sin_f):
-    """Rotate the two halves of the last axis of ``x`` by the given cos/sin tables.
-    The tables are cropped to the leading length of ``x`` and to half its
-    last dimension; for 3-D input a head axis is inserted for broadcasting.
-    """
-    # x: [seq_len, n_heads, head_dim]
-    half = x.shape[-1] // 2
-    n_pos = x.shape[0]
-    first, second = x[..., :half], x[..., half:]
-    cos_f = cos_f[:n_pos, :half]
-    sin_f = sin_f[:n_pos, :half]
-    if x.ndim == 3:
-        cos_f = cos_f[:, np.newaxis, :]
-        sin_f = sin_f[:, np.newaxis, :]
-    rotated_first = first * cos_f - second * sin_f
-    rotated_second = second * cos_f + first * sin_f
-    return np.concatenate([rotated_first, rotated_second], axis=-1)
-def softmax(x, axis=-1):
-    """Softmax along ``axis``, shifted by the per-slice maximum before exponentiating."""
-    peak = np.max(x, axis=axis, keepdims=True)
-    weights = np.exp(x - peak)
-    total = np.sum(weights, axis=axis, keepdims=True)
-    return weights / total
-def silu(x):
-    """SiLU / Swish activation — used by LLaMA, Mistral, Qwen, DeepSeek."""
-    # The input to exp is clipped to [-88, 88]
-    sigmoid = 1.0 / (1.0 + np.exp(-np.clip(x, -88, 88)))
-    return x * sigmoid
-def gelu(x):
-    """GELU activation — used by Gemma, GPT-2."""
-    # tanh approximation
-    cubic = x + 0.044715 * x**3
-    gate = 1.0 + np.tanh(np.sqrt(2.0 / np.pi) * cubic)
-    return 0.5 * x * gate
-# Architecture → activation function mapping
-_SILU_ARCHS = (
-    'llama', 'mistral', 'qwen2', 'qwen2moe',
-    'phi3', 'falcon', 'deepseek', 'deepseek2',
-)
-_GELU_ARCHS = ('gemma', 'gemma2', 'gpt2')
-ACTIVATION_MAP = {
-    **{arch: silu for arch in _SILU_ARCHS},
-    **{arch: gelu for arch in _GELU_ARCHS},
-}
-class TransformerRunner:
-    """Minimal Gemma transformer for importance collection."""
-    def __init__(self, model, config, verbose=False, linear_attn=True):
-        """Set up the runner for ``model`` using the hyper-parameter dict ``config``.
-        ``config`` must provide 'arch'; 'n_embd' and 'n_head' are only needed
-        when it has no non-zero 'head_dim'. Architectures missing from
-        ACTIVATION_MAP use SiLU. The optional HPC shared library is loaded
-        from the script's directory when present.
-        """
-        self.model = model
-        self.cfg = config
-        self.verbose = verbose
-        # Derive head_dim lazily: an eager default would divide by n_head even
-        # when the config already carries a usable head_dim.
-        head_dim = config.get('head_dim')
-        if not head_dim:
-            head_dim = config['n_embd'] // config['n_head']
-        self.head_dim = head_dim
-        self.act_fn = ACTIVATION_MAP.get(config['arch'], silu)
-        self.linear_attn = linear_attn
-        # Importance accumulators: tensor_name → (sum_x2, count)
-        self.importance = {}
-        # HPC C library for accelerated forward pass
-        self._hpc_lib = None
-        try:
-            script_dir = os.path.dirname(os.path.abspath(__file__))
-            lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
-            if os.path.exists(lib_path):
-                lib = ctypes.CDLL(lib_path)
-                if hasattr(lib, 'hexstate_forward_layer'):
-                    self._hpc_lib = lib
-                    if verbose:
-                        print("  HPC·Forward engine loaded (hexstate_forward_layer)")
-        except Exception as e:
-            # Best effort: run without the library, but report why when asked to be verbose.
-            if verbose:
-                print(f"  HPC·Forward not available: {e}")
-    def _record(self, name, x):
-        """Add the per-column sum of squares of ``x`` to the accumulator for ``name``.
-        ``x`` is flattened to rows of length ``x.shape[-1]``; the stored tuple
-        is (running sum of x² per column, number of rows seen).
-        """
-        rows = x.reshape(-1, x.shape[-1])
-        n_rows = rows.shape[0]
-        col_energy = np.sum(rows ** 2, axis=0)
-        previous = self.importance.get(name)
-        if previous is None:
-            self.importance[name] = (col_energy.copy(), n_rows)
-            return
-        running_sum, seen = previous
-        self.importance[name] = (running_sum + col_energy, seen + n_rows)
-    def _get_weight(self, name):
-        """Return tensor ``name`` as float32, reshaped when it has two or more dims.
-        Returns None when the model cannot supply the tensor.
-        """
-        flat = self.model.get_tensor_f32(name)
-        if flat is None:
-            return None
-        dims = self.model.get_tensor_shape(name)
-        is_multi_dim = bool(dims) and len(dims) >= 2
-        return flat.reshape(dims) if is_multi_dim else flat
-    def _layer_prefix(self, layer_idx):
-        """Return the tensor-name prefix of layer ``layer_idx`` (``blk.<idx>``)."""
-        return "blk.{}".format(layer_idx)
-    def _hpc_forward_layer(self, hidden, layer_idx):
-        """Full layer forward pass via C hexstate_forward_layer.
-        Loads weights, creates ctypes pointers, calls C, reads back importance.
-        Returns updated hidden state.
-        ``hidden`` is indexed as [seq_len, ...] and is converted to a contiguous
-        float32 array before the call. If the layer has no attn_norm weight the
-        (converted) hidden state is returned without calling C. When the layer
-        has no dense ffn_gate weight, the FFN is run in Python through
-        _forward_moe_ffn after the C call.
-        """
-        pfx = self._layer_prefix(layer_idx)
-        cfg = self.cfg
-        lib = self._hpc_lib
-        seq_len = hidden.shape[0]
-        n_embd = cfg['n_embd']
-        n_head = cfg['n_head']
-        n_head_kv = cfg['n_head_kv']
-        # n_head_kv may be given per layer as a list
-        if isinstance(n_head_kv, list):
-            n_head_kv = n_head_kv[layer_idx]
-        head_dim = self.head_dim
-        eps = cfg['rms_eps']
-        FP = ctypes.POINTER(ctypes.c_float)
-        I64P = ctypes.POINTER(ctypes.c_int64)
-        def _fp(arr):
-            # Returns (float* pointer, backing array); None maps to a NULL pointer.
-            # The backing array must be kept referenced while the pointer is in use.
-            if arr is None: return ctypes.cast(None, FP), None
-            a = np.ascontiguousarray(arr, dtype=np.float32)
-            return a.ctypes.data_as(FP), a
-        def _imp(name, dim):
-            """Get or create importance accumulator.
-            Returns (data pointer, byref to the count, float32 array, c_int64 count).
-            """
-            if name not in self.importance:
-                self.importance[name] = (np.zeros(dim, dtype=np.float32), 0)
-            # float32 working buffer handed to C; copied back into self.importance after the call
-            imp_arr = np.ascontiguousarray(self.importance[name][0], dtype=np.float32)
-            cnt = ctypes.c_int64(self.importance[name][1])
-            return imp_arr.ctypes.data_as(FP), ctypes.byref(cnt), imp_arr, cnt
-        # Make hidden contiguous and get pointer
-        hidden = np.ascontiguousarray(hidden, dtype=np.float32)
-        h_ptr = hidden.ctypes.data_as(FP)
-        # Load all weights for this layer
-        norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
-        if norm_w is None:
-            return hidden
-        # Fused QKV and separate Q/K/V are both looked up; absent ones stay None
-        qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
-        q_w = self._get_weight(f'{pfx}.attn_q.weight')
-        k_w = self._get_weight(f'{pfx}.attn_k.weight')
-        v_w = self._get_weight(f'{pfx}.attn_v.weight')
-        gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
-        o_w = self._get_weight(f'{pfx}.attn_output.weight')
-        # Prefer post_attention_norm; fall back to ffn_norm
-        ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
-        if ffn_norm_w is None:
-            ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
-        ffn_gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
-        ffn_up_w = self._get_weight(f'{pfx}.ffn_up.weight')
-        ffn_down_w = self._get_weight(f'{pfx}.ffn_down.weight')
-        # Prepare ctypes args (keep refs to prevent GC)
-        refs = []
-        def fp(arr):
-            p, a = _fp(arr)
-            refs.append(a)
-            return p
-        norm_p = fp(norm_w)
-        qkv_p = fp(qkv_w)
-        q_p = fp(q_w)
-        k_p = fp(k_w)
-        v_p = fp(v_w)
-        gate_p = fp(gate_w)
-        o_p = fp(o_w)
-        ffn_norm_p = fp(ffn_norm_w)
-        ffn_gate_p = fp(ffn_gate_w)
-        ffn_up_p = fp(ffn_up_w)
-        ffn_down_p = fp(ffn_down_w)
-        # Dimensions passed alongside each weight; 0 marks a weight that is absent
-        qkv_dim = qkv_w.shape[0] if qkv_w is not None else 0
-        q_dim_v = q_w.shape[0] if q_w is not None else 0
-        k_dim_v = k_w.shape[0] if k_w is not None else 0
-        v_dim_v = v_w.shape[0] if v_w is not None else 0
-        gate_rows = gate_w.shape[0] if gate_w is not None else 0
-        o_cols = o_w.shape[1] if (o_w is not None and o_w.ndim >= 2) else 0
-        ffn_d = ffn_gate_w.shape[0] if ffn_gate_w is not None else 0
-        # Importance accumulators
-        imp_refs = []  # Keep alive
-        null_fp = ctypes.cast(None, FP)
-        null_i64p = ctypes.cast(None, I64P)
-        def make_imp(name, dim):
-            # dim <= 0 means the tensor is absent: pass NULLs and record nothing for it
-            if dim <= 0:
-                return null_fp, null_i64p
-            p, cp, arr, cnt = _imp(name, dim)
-            imp_refs.append((name, arr, cnt))
-            return p, cp
-        imp_qkv_p, cnt_qkv_p = make_imp(f'{pfx}.attn_qkv.weight', n_embd if qkv_w is not None else 0)
-        imp_q_p, cnt_q_p = make_imp(f'{pfx}.attn_q.weight', n_embd if q_w is not None else 0)
-        imp_k_p, cnt_k_p = make_imp(f'{pfx}.attn_k.weight', n_embd if k_w is not None else 0)
-        imp_v_p, cnt_v_p = make_imp(f'{pfx}.attn_v.weight', n_embd if v_w is not None else 0)
-        imp_gate_p, cnt_gate_p = make_imp(f'{pfx}.attn_gate.weight', n_head * head_dim if gate_w is not None else 0)
-        imp_o_p, cnt_o_p = make_imp(f'{pfx}.attn_output.weight', o_cols if o_w is not None else 0)
-        imp_fg_p, cnt_fg_p = make_imp(f'{pfx}.ffn_gate.weight', n_embd if ffn_gate_w is not None else 0)
-        imp_fu_p, cnt_fu_p = make_imp(f'{pfx}.ffn_up.weight', n_embd if ffn_up_w is not None else 0)
-        imp_fd_p, cnt_fd_p = make_imp(f'{pfx}.ffn_down.weight', ffn_d if ffn_down_w is not None else 0)
-        # Call C function — entire layer in one call (FFN part will be NULL if MoE)
-        # NOTE(review): no argtypes are declared, so the argument order and ctypes
-        # wrappers below must match the C prototype exactly — confirm against the C source.
-        # Presumably the C routine updates `hidden` in place through h_ptr, since the
-        # same buffer is what this method returns — TODO confirm.
-        lib.hexstate_forward_layer(
-            h_ptr,
-            norm_p,
-            qkv_p, ctypes.c_int64(qkv_dim),
-            q_p, ctypes.c_int64(q_dim_v),
-            k_p, ctypes.c_int64(k_dim_v),
-            v_p, ctypes.c_int64(v_dim_v),
-            gate_p, ctypes.c_int64(gate_rows),
-            o_p, ctypes.c_int64(o_cols),
-            ffn_norm_p,
-            ffn_gate_p, ffn_up_p, ffn_down_p,
-            ctypes.c_int64(ffn_d),
-            imp_qkv_p, cnt_qkv_p,
-            imp_q_p, cnt_q_p,
-            imp_k_p, cnt_k_p,
-            imp_v_p, cnt_v_p,
-            imp_gate_p, cnt_gate_p,
-            imp_o_p, cnt_o_p,
-            imp_fg_p, cnt_fg_p,
-            imp_fu_p, cnt_fu_p,
-            imp_fd_p, cnt_fd_p,
-            ctypes.c_int64(seq_len), ctypes.c_int64(n_embd),
-            ctypes.c_int64(n_head), ctypes.c_int64(n_head_kv),
-            ctypes.c_int64(head_dim), ctypes.c_float(eps))
-        # Read back importance for the tensors that WERE processed in C
-        for name, arr, cnt in imp_refs:
-            # Store the float32 working buffer as float64 together with the count held in the c_int64
-            self.importance[name] = (arr.astype(np.float64), cnt.value)
-        # Handle MoE FFN if C code skipped it
-        if ffn_gate_w is None:
-            # Re-normalize for FFN
-            normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, eps)
-            hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
-        # Force-free per-layer weight buffers (~1.4 GB) before next layer
-        # NOTE(review): the local weight variables (norm_w, qkv_w, ...) still reference
-        # their arrays until this function returns, so this collect may free little.
-        del refs, imp_refs
-        import gc; gc.collect()
-        return hidden
-    def _forward_moe_ffn(self, hidden, normed_ff, pfx):
-        """Python-side MoE FFN handling (supports packed and shared experts).
-        Routes every row of ``normed_ff`` to its top-k experts by softmax router
-        probability, records E[x²] for each expert weight that receives input,
-        and returns ``hidden`` plus the probability-weighted expert outputs
-        (plus the shared-expert output when those weights exist). Returns
-        ``hidden`` unchanged when the layer has no ``ffn_gate_inp`` router.
-        Router probabilities are applied as-is, without renormalising over
-        the selected experts.
-        """
-        gate_inp_w = self._get_weight(f'{pfx}.ffn_gate_inp.weight')
-        if gate_inp_w is None:
-            return hidden
-        self._record(f'{pfx}.ffn_gate_inp.weight', normed_ff)
-        router_logits = normed_ff @ gate_inp_w.T
-        n_experts = router_logits.shape[-1]
-        probs = softmax(router_logits, axis=-1)
-        topk = self.cfg.get('expert_used_count', 2)
-        # A missing count arrives as 0, and ``[:, -0:]`` would silently select
-        # every expert; fall back to the documented default of 2 instead.
-        if not topk or topk < 0:
-            topk = 2
-        topk = min(topk, n_experts)
-        top_k_indices = np.argsort(probs, axis=-1)[:, -topk:]
-        ff_out = np.zeros_like(normed_ff)
-        # Check for packed experts (Qwen style)
-        p_gate = self._get_weight(f'{pfx}.ffn_gate_exps.weight')
-        p_up   = self._get_weight(f'{pfx}.ffn_up_exps.weight')
-        p_down = self._get_weight(f'{pfx}.ffn_down_exps.weight')
-        packed = p_gate is not None
-        for exp_id in range(n_experts):
-            # Skip experts no token was routed to before touching their weights
-            token_mask = np.any(top_k_indices == exp_id, axis=-1)
-            if not np.any(token_mask):
-                continue
-            if packed:
-                ew_gate = p_gate[exp_id]
-                ew_up   = p_up[exp_id] if p_up is not None else None
-                ew_down = p_down[exp_id] if p_down is not None else None
-                gate_name = f'{pfx}.ffn_gate_exps.weight'
-                up_name   = f'{pfx}.ffn_up_exps.weight'
-                down_name = f'{pfx}.ffn_down_exps.weight'
-            else:
-                ew_gate = self._get_weight(f'{pfx}.ffn_gate.{exp_id}.weight')
-                ew_up   = self._get_weight(f'{pfx}.ffn_up.{exp_id}.weight')
-                ew_down = self._get_weight(f'{pfx}.ffn_down.{exp_id}.weight')
-                gate_name = f'{pfx}.ffn_gate.{exp_id}.weight'
-                up_name   = f'{pfx}.ffn_up.{exp_id}.weight'
-                down_name = f'{pfx}.ffn_down.{exp_id}.weight'
-            # An expert with any weight missing cannot be evaluated
-            if ew_gate is None or ew_up is None or ew_down is None:
-                continue
-            exp_input = normed_ff[token_mask]
-            # Record importance
-            self._record(gate_name, exp_input)
-            self._record(up_name, exp_input)
-            mid = self.act_fn(exp_input @ ew_gate.T) * (exp_input @ ew_up.T)
-            self._record(down_name, mid)
-            exp_out = mid @ ew_down.T
-            # Scatter back, weighting each token's row by its router probability
-            token_weights = probs[token_mask, exp_id][:, np.newaxis]
-            ff_out[token_mask] += token_weights * exp_out
-        # Shared experts (Qwen style)
-        sh_gate = self._get_weight(f'{pfx}.ffn_gate_shexp.weight')
-        if sh_gate is not None:
-            sh_up = self._get_weight(f'{pfx}.ffn_up_shexp.weight')
-            sh_down = self._get_weight(f'{pfx}.ffn_down_shexp.weight')
-            if sh_up is not None and sh_down is not None:
-                self._record(f'{pfx}.ffn_gate_shexp.weight', normed_ff)
-                self._record(f'{pfx}.ffn_up_shexp.weight', normed_ff)
-                mid = self.act_fn(normed_ff @ sh_gate.T) * (normed_ff @ sh_up.T)
-                self._record(f'{pfx}.ffn_down_shexp.weight', mid)
-                ff_out += mid @ sh_down.T
-        return hidden + ff_out
-    def _hpc_rms_norm(self, x, weight, eps):
-        """RMS norm via HPC C library, falling back to numpy.
-        The C routine is used only for a 2-D, C-contiguous float32 ``x`` and
-        only when the loaded library exports ``hexstate_rms_norm``; every
-        other input goes through the numpy ``rms_norm``.
-        """
-        lib = self._hpc_lib
-        # The buffer is handed to C as a raw float*, so its dtype and layout
-        # must match exactly; anything else would be read as garbage.
-        use_c = (
-            lib is not None
-            and hasattr(lib, 'hexstate_rms_norm')
-            and x.ndim == 2
-            and x.dtype == np.float32
-            and x.flags['C_CONTIGUOUS']
-        )
-        if not use_c:
-            return rms_norm(x, weight, eps)
-        seq_len, dim = x.shape
-        out = np.empty_like(x)
-        w = np.ascontiguousarray(weight, dtype=np.float32)
-        FP = ctypes.POINTER(ctypes.c_float)
-        lib.hexstate_rms_norm(
-            x.ctypes.data_as(FP),
-            w.ctypes.data_as(FP),
-            out.ctypes.data_as(FP),
-            ctypes.c_int64(seq_len), ctypes.c_int64(dim),
-            ctypes.c_float(eps))
-        return out
-    def _hpc_matmul_record(self, name, x, weight):
-        """Return ``x @ weight.T`` while recording input importance for ``name``.
-        When the HPC C library is loaded and both operands are C-contiguous 2-D
-        float32 arrays, the running statistics in ``self.importance[name]`` are
-        updated by ``hexstate_matmul_record``; in every other case the update is
-        delegated to ``self._record``. The matrix product itself is always
-        computed by numpy. The dtype/rank guard exists because the C routine
-        receives raw ``float*`` pointers and would misread any other layout.
-        Per the original notes, the C routine modulates the E[x²] accumulation
-        with an HPCGraph phase-coherence term; that logic lives in C and is not
-        visible from this file.
-        Args:
-            name: tensor name used as the key in ``self.importance``.
-            x: activations, ``[M, K]``.
-            weight: projection matrix, ``[N, K]``.
-        Returns:
-            ``x @ weight.T`` with shape ``[M, N]``.
-        """
-        use_c = (
-            self._hpc_lib
-            and x.ndim == 2 and weight.ndim == 2
-            and x.dtype == np.float32 and weight.dtype == np.float32
-            and x.flags['C_CONTIGUOUS'] and weight.flags['C_CONTIGUOUS']
-        )
-        if use_c:
-            M, K = x.shape
-            N = weight.shape[0]  # weight is [N, K], computing x @ W.T -> [M, N]
-            if name not in self.importance:
-                self.importance[name] = (np.zeros(K, dtype=np.float64), 0)
-            # The C API works on float32, so the float64 accumulator is
-            # round-tripped through a float32 scratch copy on every call.
-            imp_f32 = self.importance[name][0].astype(np.float32)
-            count = ctypes.c_int64(self.importance[name][1])
-            float_p = ctypes.POINTER(ctypes.c_float)
-            # NOTE(review): only M floats of output space are supplied while N is
-            # still passed to C. Confirm hexstate_matmul_record never writes a
-            # full [M, N] product into this buffer.
-            dummy_out = np.empty((M, 1), dtype=np.float32)
-            self._hpc_lib.hexstate_matmul_record(
-                x.ctypes.data_as(float_p),
-                weight.ctypes.data_as(float_p),
-                dummy_out.ctypes.data_as(float_p),
-                imp_f32.ctypes.data_as(float_p),
-                ctypes.c_int64(M), ctypes.c_int64(K), ctypes.c_int64(N),
-                ctypes.byref(count))
-            self.importance[name] = (imp_f32.astype(np.float64), count.value)
-            # Matmul via numpy BLAS rather than the C loop.
-            return x @ weight.T
-        # Fallback: pure numpy bookkeeping.
-        self._record(name, x)
-        return x @ weight.T
-    def forward_layer_linear(self, hidden, layer_idx):
-        """Run one layer with causal linear attention instead of softmax attention.
-        The attention output at each position is a normalised running sum over
-        the values seen so far, weighted by a ReLU feature map of Q and K, so the
-        cost grows linearly with sequence length. No RoPE is applied on this
-        path. Importance is recorded under the same tensor names as the softmax
-        layers, but the activations fed to the output projection are those of
-        the linear approximation.
-        Args:
-            hidden: residual stream, ``[seq_len, n_embd]``.
-            layer_idx: index of the layer to run.
-        Returns:
-            The updated residual stream. ``hidden`` is returned unchanged when
-            the attention norm or the attention projections are missing.
-        """
-        pfx = self._layer_prefix(layer_idx)
-        cfg = self.cfg
-        n_head = cfg['n_head']
-        n_head_kv = cfg['n_head_kv']
-        if isinstance(n_head_kv, list):
-            n_head_kv = n_head_kv[layer_idx]
-        seq_len = hidden.shape[0]
-        # ── Attention norm ──
-        attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
-        if attn_norm_w is None:
-            return hidden
-        normed = self._hpc_rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
-        # ── Check for fused vs separate QKV ──
-        qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
-        gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
-        q_w = self._get_weight(f'{pfx}.attn_q.weight')
-        k_w = self._get_weight(f'{pfx}.attn_k.weight')
-        v_w = self._get_weight(f'{pfx}.attn_v.weight')
-        o_w = self._get_weight(f'{pfx}.attn_output.weight')
-        qkv = None  # fused projection output, reused by the SSM proxy below
-        if qkv_w is not None:
-            # ── Fused QKV path (Qwen 3.6 hybrid layers) ──
-            head_dim = self.head_dim
-            q_dim = n_head * head_dim
-            kv_dim = n_head_kv * head_dim
-            qkv = self._hpc_matmul_record(f'{pfx}.attn_qkv.weight', normed, qkv_w)
-            q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
-            k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
-            v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)
-            # GQA expand
-            if n_head_kv < n_head:
-                rep = n_head // n_head_kv
-                k = np.repeat(k, rep, axis=1)
-                v = np.repeat(v, rep, axis=1)
-            attn_result = self._causal_linear_attention(q, k, v)  # [seq, n_head * head_dim]
-            # Record and project through the attention gate.
-            if gate_w is not None:
-                self._record(f'{pfx}.attn_gate.weight', attn_result)
-                # The gate tensor may be stored in either orientation.
-                if gate_w.shape[1] == hidden.shape[-1]:
-                    attn_out = attn_result @ gate_w
-                else:
-                    attn_out = attn_result @ gate_w.T
-            else:
-                attn_out = np.zeros_like(hidden)
-        elif q_w is not None and k_w is not None and v_w is not None and o_w is not None:
-            # ── Separate QKV path (standard transformer layers) ──
-            q = self._hpc_matmul_record(f'{pfx}.attn_q.weight', normed, q_w)
-            k = self._hpc_matmul_record(f'{pfx}.attn_k.weight', normed, k_w)
-            v = self._hpc_matmul_record(f'{pfx}.attn_v.weight', normed, v_w)
-            head_dim_q = q_w.shape[0] // n_head
-            head_dim_kv = k_w.shape[0] // n_head_kv
-            q = q.reshape(seq_len, n_head, head_dim_q)
-            k = k.reshape(seq_len, n_head_kv, head_dim_kv)
-            v = v.reshape(seq_len, n_head_kv, head_dim_kv)
-            if n_head_kv < n_head:
-                rep = n_head // n_head_kv
-                k = np.repeat(k, rep, axis=1)
-                v = np.repeat(v, rep, axis=1)
-            attn_result = self._causal_linear_attention(q, k, v)
-            # Pad/truncate to match o_w input size
-            o_in = o_w.shape[1]
-            if attn_result.shape[-1] < o_in:
-                padded = np.zeros((seq_len, o_in), dtype=attn_result.dtype)
-                padded[:, :attn_result.shape[-1]] = attn_result
-                attn_result = padded
-            elif attn_result.shape[-1] > o_in:
-                attn_result = attn_result[:, :o_in]
-            self._record(f'{pfx}.attn_output.weight', attn_result)
-            attn_out = attn_result @ o_w.T
-        else:
-            return hidden
-        hidden = hidden + attn_out
-        # ── SSM path (Qwen 3.6 hybrid) ──
-        ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
-        ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
-        ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')
-        if ssm_alpha_w is not None:
-            self._record(f'{pfx}.ssm_alpha.weight', normed)
-        if ssm_beta_w is not None:
-            self._record(f'{pfx}.ssm_beta.weight', normed)
-        if ssm_out_w is not None:
-            # The fused QKV output stands in for the recurrent state output; the
-            # product computed above is reused rather than recomputed.
-            ssm_in = ssm_out_w.shape[1]
-            if qkv is not None and qkv.shape[-1] >= ssm_in:
-                ssm_proxy = qkv[:, :ssm_in]
-            else:
-                ssm_proxy = normed
-            self._record(f'{pfx}.ssm_out.weight', ssm_proxy)
-            # Only add the residual when both sides of the projection line up.
-            if ssm_out_w.shape[0] == hidden.shape[-1] and ssm_proxy.shape[-1] == ssm_in:
-                hidden = hidden + ssm_proxy @ ssm_out_w.T
-        # ── FFN ──
-        ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
-        if ffn_norm_w is None:
-            ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
-        if ffn_norm_w is None:
-            return hidden
-        normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
-        gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
-        up_w = self._get_weight(f'{pfx}.ffn_up.weight')
-        down_w = self._get_weight(f'{pfx}.ffn_down.weight')
-        if gate_fw is not None and up_w is not None and down_w is not None:
-            gate_out = self.act_fn(self._hpc_matmul_record(f'{pfx}.ffn_gate.weight', normed_ff, gate_fw))
-            up_out = self._hpc_matmul_record(f'{pfx}.ffn_up.weight', normed_ff, up_w)
-            ff_mid = gate_out * up_out
-            self._record(f'{pfx}.ffn_down.weight', ff_mid)
-            ff_out = ff_mid @ down_w.T
-            hidden = hidden + ff_out
-        return hidden
-    @staticmethod
-    def _causal_linear_attention(q, k, v):
-        """Causal linear attention over ``[seq, n_head, dim]`` tensors.
-        Uses the feature map ``phi(x) = max(x, 0) + 1e-6`` and the recurrences
-        ``S_t = S_{t-1} + phi(k_t) ⊗ v_t`` and ``z_t = z_{t-1} + phi(k_t)``, with
-        ``out_t = (phi(q_t) @ S_t) / (phi(q_t) · z_t + 1e-8)``. When the Q and K
-        head widths differ, only their common leading channels are used.
-        Returns the result flattened to ``[seq, n_head * v_dim]``.
-        """
-        seq_len = v.shape[0]
-        n_head = q.shape[1]
-        feat_dim = min(q.shape[-1], k.shape[-1])
-        q_feat = np.maximum(q, 0) + 1e-6
-        k_feat = np.maximum(k, 0) + 1e-6
-        out = np.zeros_like(v)
-        S = np.zeros((n_head, feat_dim, v.shape[-1]), dtype=np.float32)
-        z = np.zeros((n_head, feat_dim), dtype=np.float32)
-        for t in range(seq_len):
-            # Every step is vectorised over heads.
-            kf = k_feat[t, :, :feat_dim]  # [n_head, feat_dim]
-            qf = q_feat[t, :, :feat_dim]  # [n_head, feat_dim]
-            vt = v[t]                      # [n_head, v_dim]
-            S += kf[:, :, None] * vt[:, None, :]  # outer product per head
-            z += kf
-            num = np.einsum('hd,hde->he', qf, S)   # [n_head, v_dim]
-            den = np.sum(qf * z, axis=-1, keepdims=True) + 1e-8
-            out[t] = num / den
-        return out.reshape(seq_len, -1)
-    def forward_layer(self, hidden, layer_idx, cos_f, sin_f):
-        """Run one standard transformer layer with softmax attention.
-        Records the input activations of every projection (Q/K/V, attention
-        output, FFN gate/up/down) through ``self._record`` and returns the
-        updated residual stream.
-        Args:
-            hidden: residual stream, ``[seq_len, n_embd]``.
-            layer_idx: index of the layer to run.
-            cos_f, sin_f: RoPE tables built for ``self.head_dim``; fresh tables
-                are computed when a projection uses a different head width.
-        Returns:
-            The new hidden state. ``hidden`` is returned as-is when the attention
-            norm or any of the Q/K/V/O weights is missing.
-        """
-        pfx = self._layer_prefix(layer_idx)
-        cfg = self.cfg
-        n_head = cfg['n_head']
-        n_head_kv = cfg['n_head_kv']
-        if isinstance(n_head_kv, list):
-            n_head_kv = n_head_kv[layer_idx]
-        head_dim = self.head_dim
-        seq_len = hidden.shape[0]
-        # ── Attention ──
-        attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
-        if attn_norm_w is None:
-            return hidden  # Skip if weights missing
-        normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
-        # Q/K/V projections — record importance on the INPUT (normed)
-        q_w = self._get_weight(f'{pfx}.attn_q.weight')
-        k_w = self._get_weight(f'{pfx}.attn_k.weight')
-        v_w = self._get_weight(f'{pfx}.attn_v.weight')
-        o_w = self._get_weight(f'{pfx}.attn_output.weight')
-        if q_w is None or k_w is None or v_w is None or o_w is None:
-            return hidden
-        self._record(f'{pfx}.attn_q.weight', normed)
-        self._record(f'{pfx}.attn_k.weight', normed)
-        self._record(f'{pfx}.attn_v.weight', normed)
-        q = normed @ q_w.T  # [seq, q_w.shape[0]]
-        k = normed @ k_w.T  # [seq, k_w.shape[0]]
-        v = normed @ v_w.T
-        # Head widths are derived from the tensor shapes, not from the config.
-        head_dim_q = q_w.shape[0] // n_head
-        head_dim_kv = k_w.shape[0] // n_head_kv
-        q = q.reshape(seq_len, n_head, head_dim_q)
-        k = k.reshape(seq_len, n_head_kv, head_dim_kv)
-        v = v.reshape(seq_len, n_head_kv, head_dim_kv)
-        # Apply RoPE, rebuilding the tables when the width differs from head_dim.
-        if head_dim_q != head_dim:
-            cos_q, sin_q = rope_freqs(head_dim_q, seq_len, cfg['rope_base'])
-            q = apply_rope(q, cos_q, sin_q)
-        else:
-            q = apply_rope(q, cos_f, sin_f)
-        if head_dim_kv != head_dim:
-            cos_k, sin_k = rope_freqs(head_dim_kv, seq_len, cfg['rope_base'])
-            k = apply_rope(k, cos_k, sin_k)
-        else:
-            k = apply_rope(k, cos_f, sin_f)
-        # GQA: repeat KV heads
-        if n_head_kv < n_head:
-            rep = n_head // n_head_kv
-            k = np.repeat(k, rep, axis=1)
-            v = np.repeat(v, rep, axis=1)
-        q_t = q.transpose(1, 0, 2)  # [n_head, seq, head_dim_q]
-        k_t = k.transpose(1, 0, 2)  # [n_head, seq, head_dim_kv]
-        v_t = v.transpose(1, 0, 2)  # [n_head, seq, head_dim_kv]
-        scale = 1.0 / np.sqrt(head_dim_q)
-        # When Q and K widths differ this is not standard scaled dot-product
-        # attention. Only an approximation of the attention magnitudes is
-        # needed, so the narrower operand is zero-padded to the wider one.
-        if head_dim_q > head_dim_kv:
-            k_t_padded = np.zeros_like(q_t)
-            k_t_padded[..., :head_dim_kv] = k_t
-            k_t = k_t_padded
-        elif head_dim_q < head_dim_kv:
-            q_t_padded = np.zeros_like(k_t)
-            q_t_padded[..., :head_dim_q] = q_t
-            q_t = q_t_padded
-        attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale  # [n_head, seq, seq]
-        # Causal mask (with optional sliding window)
-        mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
-        swa = cfg.get('swa_window', 0)
-        if swa and swa > 0:
-            # Mask every key more than `swa` positions behind its query (j < i - swa).
-            mask[np.tril_indices(seq_len, k=-(swa + 1))] = -1e9
-        attn = attn + mask[np.newaxis, :, :]
-        attn = softmax(attn, axis=-1)
-        out = np.matmul(attn, v_t)  # [n_head, seq, head_dim_kv]
-        out = out.transpose(1, 0, 2).reshape(seq_len, -1)  # [seq, n_head * head_dim_kv]
-        # Pad or truncate to the input width expected by o_w (o_w.shape[1]).
-        o_in = o_w.shape[1]
-        if out.shape[-1] < o_in:
-            out_padded = np.zeros((seq_len, o_in), dtype=out.dtype)
-            out_padded[:, :out.shape[-1]] = out
-            out = out_padded
-        elif out.shape[-1] > o_in:
-            out = out[:, :o_in]
-        self._record(f'{pfx}.attn_output.weight', out)
-        attn_out = out @ o_w.T
-        hidden = hidden + attn_out
-        # ── FFN ──
-        ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
-        if ffn_norm_w is None:
-            return hidden
-        normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
-        gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
-        up_w = self._get_weight(f'{pfx}.ffn_up.weight')
-        down_w = self._get_weight(f'{pfx}.ffn_down.weight')
-        if gate_w is not None and up_w is not None and down_w is not None:
-            self._record(f'{pfx}.ffn_gate.weight', normed_ff)
-            self._record(f'{pfx}.ffn_up.weight', normed_ff)
-            gate_out = self.act_fn(normed_ff @ gate_w.T)
-            up_out = normed_ff @ up_w.T
-            ff_mid = gate_out * up_out
-            self._record(f'{pfx}.ffn_down.weight', ff_mid)
-            ff_out = ff_mid @ down_w.T
-            hidden = hidden + ff_out
-        else:
-            # MoE path
-            hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
-        return hidden
-    def forward_linear_attn_layer(self, hidden, layer_idx):
-        """Approximate forward pass through a DeltaNet (gated linear attention) layer.
-        Per the original notes these layers make up roughly 75% of Qwen 3.5/3.6.
-        The recurrence itself is not evaluated: the ``ssm_in_qkv`` projection
-        output stands in for the recurrent state output, which is enough to
-        record activation magnitudes for every SSM projection.
-        Args:
-            hidden: residual stream, ``[seq_len, n_embd]``.
-            layer_idx: index of the layer to run.
-        Returns:
-            The updated residual stream, or ``hidden`` unchanged when the
-            attention norm, ``ssm_in_qkv`` or ``ssm_out`` weight is missing.
-        """
-        pfx = self._layer_prefix(layer_idx)
-        cfg = self.cfg
-        seq_len = hidden.shape[0]
-        # ── Attention norm ──
-        attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
-        if attn_norm_w is None:
-            return hidden
-        normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
-        # ── DeltaNet projections ──
-        qkv_w = self._get_weight(f'{pfx}.ssm_in_qkv.weight')
-        z_w = self._get_weight(f'{pfx}.ssm_in_z.weight')
-        a_w = self._get_weight(f'{pfx}.ssm_in_a.weight')
-        b_w = self._get_weight(f'{pfx}.ssm_in_b.weight')
-        out_w = self._get_weight(f'{pfx}.ssm_out.weight')
-        if qkv_w is None or out_w is None:
-            return hidden
-        # All input projections read the same normed activations.
-        self._record(f'{pfx}.ssm_in_qkv.weight', normed)
-        if z_w is not None:
-            self._record(f'{pfx}.ssm_in_z.weight', normed)
-        if a_w is not None:
-            self._record(f'{pfx}.ssm_in_a.weight', normed)
-        if b_w is not None:
-            self._record(f'{pfx}.ssm_in_b.weight', normed)
-        # Approximate forward: project through QKV and use the result as a
-        # proxy for the input of the output projection.
-        qkv = normed @ qkv_w.T
-        n_out = out_w.shape[1] if out_w.ndim >= 2 else hidden.shape[-1]
-        if qkv.shape[-1] >= n_out:
-            out_input = qkv[:, :n_out]
-        else:
-            # Zero-pad a too-narrow proxy so the projection below stays
-            # shape-valid, matching the attn_output padding in forward_layer.
-            out_input = np.zeros((seq_len, n_out), dtype=qkv.dtype)
-            out_input[:, :qkv.shape[-1]] = qkv
-        self._record(f'{pfx}.ssm_out.weight', out_input)
-        attn_out = out_input @ out_w.T
-        hidden = hidden + attn_out
-        # ── FFN (same as standard transformer) ──
-        ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
-        if ffn_norm_w is None:
-            return hidden
-        normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
-        gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
-        up_w = self._get_weight(f'{pfx}.ffn_up.weight')
-        down_w = self._get_weight(f'{pfx}.ffn_down.weight')
-        if gate_w is not None and up_w is not None and down_w is not None:
-            self._record(f'{pfx}.ffn_gate.weight', normed_ff)
-            self._record(f'{pfx}.ffn_up.weight', normed_ff)
-            gate_out = self.act_fn(normed_ff @ gate_w.T)
-            up_out = normed_ff @ up_w.T
-            ff_mid = gate_out * up_out
-            self._record(f'{pfx}.ffn_down.weight', ff_mid)
-            ff_out = ff_mid @ down_w.T
-            hidden = hidden + ff_out
-        else:
-            hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
-        return hidden
-    def forward_qwen35_layer(self, hidden, layer_idx, cos_f, sin_f):
-        """Forward pass through a Qwen 3.6 hybrid layer (attention + SSM).
-        Tensors this layer looks for:
-          - fused ``attn_qkv.weight`` (Q, K and V in one tensor)
-          - ``attn_gate.weight`` as the attention output projection
-          - SSM tensors ``ssm_alpha``, ``ssm_beta``, ``ssm_conv1d``, ``ssm_out``
-          - ``post_attention_norm.weight``, falling back to ``ffn_norm.weight``
-        The SSM recurrence is not evaluated; the fused QKV output is recorded as
-        a proxy input for ``ssm_conv1d`` and ``ssm_out``.
-        Args:
-            hidden: residual stream, ``[seq_len, n_embd]``.
-            layer_idx: index of the layer to run.
-            cos_f, sin_f: RoPE tables for ``self.head_dim``.
-        Returns:
-            The updated residual stream, or ``hidden`` unchanged when the
-            attention norm weight is missing.
-        """
-        pfx = self._layer_prefix(layer_idx)
-        cfg = self.cfg
-        n_head = cfg['n_head']
-        n_head_kv = cfg['n_head_kv']
-        if isinstance(n_head_kv, list):
-            n_head_kv = n_head_kv[layer_idx]
-        head_dim = self.head_dim
-        seq_len = hidden.shape[0]
-        # ── Attention norm ──
-        attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
-        if attn_norm_w is None:
-            return hidden
-        normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
-        # ── Fused QKV projection ──
-        qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
-        gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
-        attn_out_vec = np.zeros_like(hidden)
-        qkv = None  # stays None when the layer has no fused QKV tensor
-        if qkv_w is not None:
-            self._record(f'{pfx}.attn_qkv.weight', normed)
-            qkv = normed @ qkv_w.T  # [seq, (n_head + 2*n_head_kv) * head_dim]
-            # Split into Q, K, V
-            q_dim = n_head * head_dim
-            kv_dim = n_head_kv * head_dim
-            q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
-            k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
-            v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)
-            # RoPE
-            q = apply_rope(q, cos_f, sin_f)
-            k = apply_rope(k, cos_f, sin_f)
-            # GQA: repeat KV heads
-            if n_head_kv < n_head:
-                rep = n_head // n_head_kv
-                k = np.repeat(k, rep, axis=1)
-                v = np.repeat(v, rep, axis=1)
-            # Scaled dot-product attention
-            q_t = q.transpose(1, 0, 2)  # [n_head, seq, head_dim]
-            k_t = k.transpose(1, 0, 2)
-            v_t = v.transpose(1, 0, 2)
-            scale = 1.0 / np.sqrt(head_dim)
-            attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale
-            # Causal mask (with optional SWA)
-            mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
-            swa = cfg.get('swa_window', 0)
-            if swa and swa > 0:
-                # Mask every key more than `swa` positions behind its query (j < i - swa).
-                mask[np.tril_indices(seq_len, k=-(swa + 1))] = -1e9
-            attn = attn + mask[np.newaxis, :, :]
-            attn = softmax(attn, axis=-1)
-            out = np.matmul(attn, v_t)
-            attn_result = out.transpose(1, 0, 2).reshape(seq_len, -1)  # [seq, n_head*head_dim]
-            # Gated attention output
-            if gate_w is not None:
-                self._record(f'{pfx}.attn_gate.weight', attn_result)
-                # Some GGUF tensors are transposed. Ensure output matches hidden dim.
-                if gate_w.shape[1] == hidden.shape[-1]:
-                    attn_out_vec = attn_result @ gate_w
-                else:
-                    attn_out_vec = attn_result @ gate_w.T
-        # ── SSM path ──
-        ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
-        ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
-        ssm_conv_w = self._get_weight(f'{pfx}.ssm_conv1d.weight')
-        ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')
-        ssm_out_vec = np.zeros_like(hidden)
-        if ssm_alpha_w is not None:
-            self._record(f'{pfx}.ssm_alpha.weight', normed)
-        if ssm_beta_w is not None:
-            self._record(f'{pfx}.ssm_beta.weight', normed)
-        if ssm_conv_w is not None and qkv is not None:
-            # ssm_conv1d input is the QKV projection, reused from the attention path.
-            self._record(f'{pfx}.ssm_conv1d.weight', qkv)
-        if ssm_out_w is not None:
-            # SSM output projection — use the qkv output as a proxy for the
-            # recurrent output (e.g. qkv is 10240 wide, ssm_out expects 6144).
-            ssm_in = ssm_out_w.shape[1]
-            if qkv is not None and qkv.shape[-1] >= ssm_in:
-                ssm_proxy = qkv[:, :ssm_in]
-            else:
-                # No usable QKV output: record an all-zero proxy of the right width.
-                ssm_proxy = np.zeros((seq_len, ssm_in), dtype=np.float32)
-            self._record(f'{pfx}.ssm_out.weight', ssm_proxy)
-            # The SSM contribution matters little for importance collection,
-            # so it is added only when it matches the residual width.
-            if ssm_out_w.shape[0] == hidden.shape[-1]:
-                ssm_out_vec = ssm_proxy @ ssm_out_w.T
-        # Combine attention + SSM
-        hidden = hidden + attn_out_vec + ssm_out_vec
-        # ── FFN (uses post_attention_norm instead of ffn_norm) ──
-        ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
-        if ffn_norm_w is None:
-            ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
-        if ffn_norm_w is None:
-            return hidden
-        normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
-        gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
-        up_w = self._get_weight(f'{pfx}.ffn_up.weight')
-        down_w = self._get_weight(f'{pfx}.ffn_down.weight')
-        if gate_fw is not None and up_w is not None and down_w is not None:
-            self._record(f'{pfx}.ffn_gate.weight', normed_ff)
-            self._record(f'{pfx}.ffn_up.weight', normed_ff)
-            gate_out = self.act_fn(normed_ff @ gate_fw.T)
-            up_out = normed_ff @ up_w.T
-            ff_mid = gate_out * up_out
-            self._record(f'{pfx}.ffn_down.weight', ff_mid)
-            ff_out = ff_mid @ down_w.T
-            hidden = hidden + ff_out
-        else:
-            hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
-        return hidden
-    def forward(self, token_ids):
-        """Embed ``token_ids`` and push them through every layer, recording importance.
-        Each layer is dispatched to one of the per-layer forward routines based
-        on ``self.linear_attn``, the presence of the HPC library, and which
-        tensors exist for that layer. Returns the final hidden state; raises
-        ``RuntimeError`` when the embedding table is missing.
-        """
-        cfg = self.cfg
-        n_tokens = len(token_ids)
-        # Embedding lookup -> [seq_len, n_embd]
-        table = self._get_weight('token_embd.weight')
-        if table is None:
-            raise RuntimeError("Missing token_embd.weight")
-        hidden = table[token_ids].copy()
-        # Drop the reference to the (large) embedding table before the layer loop.
-        del table
-        # RoPE tables shared by all layers that use the default head width.
-        cos_f, sin_f = rope_freqs(self.head_dim, n_tokens, cfg['rope_base'])
-        total_layers = cfg['n_layers']
-        for layer_idx in range(total_layers):
-            if self.linear_attn:
-                if self._hpc_lib:
-                    # Whole layer evaluated by the HPC C forward.
-                    hidden = self._hpc_forward_layer(hidden, layer_idx)
-                else:
-                    # Python linearized attention.
-                    hidden = self.forward_layer_linear(hidden, layer_idx)
-            else:
-                known = self.model.tensor_infos
-                pfx = f"blk.{layer_idx}"
-                if f'{pfx}.attn_qkv.weight' in known:
-                    hidden = self.forward_qwen35_layer(hidden, layer_idx, cos_f, sin_f)
-                elif (f'{pfx}.ssm_in_qkv.weight' in known
-                      and f'{pfx}.attn_q.weight' not in known):
-                    hidden = self.forward_linear_attn_layer(hidden, layer_idx)
-                else:
-                    hidden = self.forward_layer(hidden, layer_idx, cos_f, sin_f)
-            done = layer_idx + 1
-            if self.verbose and done % 4 == 0:
-                print(f"    Layer {done}/{total_layers}", end='\r')
-        # Output projection: only its existence is checked, the tensor is not loaded.
-        if 'output.weight' in self.model.tensor_infos:
-            self._record('output.weight', hidden)
-        return hidden
-# ─── HPC Cross-Layer Importance Propagation ─────────────────────────────────
-def hpc_propagate_importance(importance_dict, n_layers, verbose=False):
-    """Rescale per-layer importance with a smoothed, neighbour-coupled multiplier.
-    Each ``blk.<i>.*`` tensor contributes its mean ``sum_x2 / count`` to the
-    energy of layer ``i``. Energies are averaged per layer and normalised by
-    their maximum. Fifty damped iterations then compute one multiplier per
-    layer from its own energy plus 0.3x the mean multiplier-weighted energy of
-    its adjacent layers, at temperature 0.5, renormalised to mean 1 each step.
-    Finally every ``blk.<i>.*`` accumulator is scaled by its layer multiplier.
-    Args:
-        importance_dict: maps tensor name to ``(sum_x2, count)``.
-        n_layers: number of layers; names with an index outside
-            ``[0, n_layers)`` are left untouched.
-        verbose: print the first multipliers and their range.
-    Returns:
-        A new dict with scaled accumulators. ``importance_dict`` itself is
-        returned when ``n_layers <= 0`` or when all layer energies are ~0.
-    """
-    if n_layers <= 0:
-        # Nothing to couple; np.max on an empty array would raise.
-        return importance_dict
-    # Group tensors by layer
-    layer_energies = np.zeros(n_layers, dtype=np.float64)
-    layer_tensor_count = np.zeros(n_layers, dtype=np.int32)
-    for name, (sum_x2, count) in importance_dict.items():
-        layer_idx = _blk_layer_index(name, n_layers)
-        if layer_idx is None:
-            continue
-        layer_energies[layer_idx] += np.mean(sum_x2 / max(count, 1))
-        layer_tensor_count[layer_idx] += 1
-    populated = layer_tensor_count > 0
-    layer_energies[populated] /= layer_tensor_count[populated]
-    peak = np.max(layer_energies)
-    if peak < 1e-30:
-        return importance_dict
-    layer_energies /= peak
-    # Number of adjacent layers (1 at the ends, 2 inside); floored at 1 so a
-    # single-layer model divides a zero neighbour sum by 1.
-    n_nbr = np.zeros(n_layers, dtype=np.float64)
-    n_nbr[1:] += 1
-    n_nbr[:-1] += 1
-    n_nbr = np.maximum(n_nbr, 1.0)
-    # BP-inspired iterative smoothing with residual stream coupling
-    multipliers = np.ones(n_layers, dtype=np.float64)
-    temperature = 0.5
-    for _ in range(50):
-        coupled = layer_energies * multipliers
-        e_nbr = np.zeros(n_layers, dtype=np.float64)
-        e_nbr[1:] += coupled[:-1]
-        e_nbr[:-1] += coupled[1:]
-        e_nbr /= n_nbr
-        # Clamp energy to prevent exponential explosion (max exp(5) ~ 148)
-        energy = np.clip((layer_energies + 0.3 * e_nbr) / temperature, -10, 5)
-        new_mult = np.exp(energy)
-        mean_m = np.mean(new_mult)
-        if mean_m > 1e-30:
-            new_mult /= mean_m
-        multipliers = 0.7 * multipliers + 0.3 * new_mult
-    if verbose:
-        print(f"\n  HPC layer multipliers (first 8): "
-              f"{' '.join(f'{m:.3f}' for m in multipliers[:8])}...")
-        print(f"  Range: [{np.min(multipliers):.3f}, {np.max(multipliers):.3f}]")
-    adjusted = {}
-    for name, (sum_x2, count) in importance_dict.items():
-        layer_idx = _blk_layer_index(name, n_layers)
-        if layer_idx is None:
-            adjusted[name] = (sum_x2, count)
-        else:
-            adjusted[name] = (sum_x2 * multipliers[layer_idx], count)
-    return adjusted
-def _blk_layer_index(name, n_layers):
-    """Return the layer index of a ``blk.<i>.…`` tensor name, or None if it has none in range."""
-    parts = name.split('.')
-    if len(parts) < 2 or parts[0] != 'blk':
-        return None
-    try:
-        layer_idx = int(parts[1])
-    except ValueError:
-        return None
-    return layer_idx if 0 <= layer_idx < n_layers else None
-# ─── iMatrix Output Writer ──────────────────────────────────────────────────
-def write_imatrix(path, importance_dict):
-    """Write a llama.cpp-compatible legacy binary imatrix file.
-    Layout (all integers little-endian int32):
-      n_entries, then per entry in name order:
-      name_len, name bytes (UTF-8), ncall, nval, nval float32 values.
-    The legacy reader expects the call count before the value count, so the
-    header is written in that order. Values are the raw ``sum_x2``
-    accumulators, flattened and stored as little-endian float32.
-    Args:
-        path: destination file path; overwritten if it exists.
-        importance_dict: maps tensor name to ``(sum_x2, count)``.
-    Returns:
-        The number of entries written.
-    """
-    entries = []
-    for name in sorted(importance_dict):
-        sum_x2, count = importance_dict[name]
-        # Explicit '<f4' keeps the payload little-endian like the header fields.
-        values = np.asarray(sum_x2).astype('<f4').reshape(-1)
-        entries.append((name.encode('utf-8'), values, int(count)))
-    with open(path, 'wb') as f:
-        f.write(struct.pack('<i', len(entries)))
-        for name_bytes, values, n_calls in entries:
-            f.write(struct.pack('<i', len(name_bytes)))
-            f.write(name_bytes)
-            # ncall first, then nval: the order the llama.cpp loader reads them in.
-            f.write(struct.pack('<ii', n_calls, values.size))
-            f.write(values.tobytes())
-    return len(entries)
-def load_hf_config(config_path):
-    """Load a HuggingFace config.json and extract architecture info.
-    Maps HF keys to internal generate_imatrix.py keys:
-      hidden_size -> n_embd
-      num_hidden_layers -> n_layers
-      num_attention_heads -> n_head
-      num_key_value_heads -> n_head_kv (defaults to n_head when absent or null)
-      intermediate_size -> n_ff
-      vocab_size -> vocab_size
-      rms_norm_eps -> rms_eps
-      rope_theta -> rope_base (``rope_parameters.rope_theta`` wins if present)
-      model_type -> arch
-      num_local_experts / num_experts -> expert_count
-      num_experts_per_tok -> expert_used_count
-    When the top level has ``text_config`` but no ``hidden_size``, the values
-    are read from ``text_config``. ``head_dim`` is taken from the config if set,
-    else derived as ``n_embd // n_head``; it is omitted when ``n_head`` is 0.
-    Args:
-        config_path: path to a UTF-8 encoded ``config.json``.
-    Returns:
-        A dict with the internal keys listed above.
-    """
-    import json
-    with open(config_path, 'r', encoding='utf-8') as f:
-        raw = json.load(f)
-    src = raw
-    if 'text_config' in raw and 'hidden_size' not in raw:
-        src = raw['text_config']
-    cfg = {}
-    cfg['arch'] = src.get('model_type', raw.get('model_type', 'unknown'))
-    cfg['n_embd'] = src.get('hidden_size', 0)
-    cfg['n_layers'] = src.get('num_hidden_layers', 0)
-    cfg['n_head'] = src.get('num_attention_heads', 0)
-    # HF configs without num_key_value_heads describe plain multi-head
-    # attention, i.e. one KV head per query head. Returning 0 here would
-    # break the per-head reshapes downstream.
-    cfg['n_head_kv'] = src.get('num_key_value_heads') or cfg['n_head']
-    cfg['n_ff'] = src.get('intermediate_size', 0)
-    cfg['vocab_size'] = src.get('vocab_size', 0)
-    cfg['rms_eps'] = src.get('rms_norm_eps', 1e-6)
-    # `or {}` also covers an explicit JSON null.
-    rope_params = src.get('rope_parameters') or {}
-    cfg['rope_base'] = rope_params.get('rope_theta',
-                          src.get('rope_theta', 10000.0))
-    cfg['expert_count'] = src.get('num_local_experts', src.get('num_experts', 0))
-    cfg['expert_used_count'] = src.get('num_experts_per_tok', 0)
-    # head_dim fallback
-    if src.get('head_dim'):
-        cfg['head_dim'] = src['head_dim']
-    elif cfg['n_head'] > 0:
-        cfg['head_dim'] = cfg['n_embd'] // cfg['n_head']
-    return cfg
-# ─── Main ───────────────────────────────────────────────────────────────────
def main():
    """Command-line entry point: build an imatrix file from a GGUF model.

    Steps: parse arguments, open the GGUF model, optionally merge a
    HuggingFace ``config.json`` (given via ``--config`` or found next to the
    model), tokenize the calibration text into chunks, run one forward pass
    per chunk, optionally apply HPC cross-layer propagation, and write the
    result with ``write_imatrix``.

    A chunk whose forward pass raises is reported and skipped so that a
    single bad chunk does not discard the statistics already collected.
    The model is closed even if a later step fails.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description='HExState iMatrix Generator — HPC-enhanced importance matrix from GGUF')
    parser.add_argument('model', help='Input GGUF model file')
    parser.add_argument('calibration', help='Calibration text file')
    parser.add_argument('-o', '--output', default='imatrix.dat',
                        help='Output imatrix file (default: imatrix.dat)')
    parser.add_argument('--config', help='Optional HuggingFace config.json')
    parser.add_argument('--chunks', type=int, default=10,
                        help='Number of token chunks to process (default: 10)')
    parser.add_argument('--chunk-size', type=int, default=4096,
                        help='Tokens per chunk (default: 4096)')
    parser.add_argument('--no-hpc', action='store_true',
                        help='Disable HPC cross-layer propagation')
    parser.add_argument('--quadratic-attn', action='store_true',
                        help='Use full O(seq²) attention instead of HPC-linearized O(seq)')
    parser.add_argument('--verbose', action='store_true',
                        help='Per-layer statistics')
    args = parser.parse_args()

    print()
    print("  ╔════════════════════════════════════════════════════════════════╗")
    print("  ║  HExState Importance Matrix Generator                        ║")
    print("  ║  HPC-Enhanced E[x²] Collection from GGUF                    ║")
    print("  ╚════════════════════════════════════════════════════════════════╝")
    print()
    start_time = time.time()

    # ── Load model ──
    print(f"  Loading model: {args.model}")
    model = GGUFModel(args.model)
    try:
        config = model.get_config()

        # ── Load/Merge config.json ──
        cfg_path = args.config
        if not cfg_path:
            # Auto-lookup in model directory
            model_dir = os.path.dirname(os.path.abspath(args.model))
            potential_cfg = os.path.join(model_dir, 'config.json')
            if os.path.exists(potential_cfg):
                cfg_path = potential_cfg
        if cfg_path:
            print(f"  Merging config from: {cfg_path}")
            hf_cfg = load_hf_config(cfg_path)
            # Override GGUF values with HF config values only where the HF
            # config actually supplies one. load_hf_config reports absent
            # keys as 0 (or 'unknown' for the architecture); letting those
            # placeholders through would erase valid GGUF metadata.
            for k, v in hf_cfg.items():
                if v is None or v == 0:
                    continue
                if k == 'arch' and v == 'unknown':
                    continue
                config[k] = v

        print(f"  Architecture:  {config['arch']}")
        print(f"  Layers:        {config['n_layers']}")
        print(f"  Hidden:        {config['n_embd']}")
        print(f"  Heads:         {config['n_head']} (KV: {config['n_head_kv']})")
        print(f"  FFN:           {config['n_ff']}")
        print(f"  Vocab:         {config['vocab_size']}")
        print(f"  Tensors:       {len(model.tensor_infos)}")
        print()

        # ── Load tokenizer ──
        print("  Loading tokenizer from GGUF metadata...")
        tokenizer = SimpleTokenizer(model)
        print(f"  Vocab size: {tokenizer.vocab_size}")
        print()

        # ── Load calibration text ──
        print(f"  Loading calibration data: {args.calibration}")
        with open(args.calibration, 'r', encoding='utf-8', errors='replace') as f:
            cal_text = f.read()
        print(f"  Text length: {len(cal_text):,} chars")

        # ── Tokenize and chunk ──
        print(f"  Tokenizing ({args.chunk_size} tokens/chunk, {args.chunks} chunks max)...")
        chunks = tokenizer.chunk_text(cal_text, args.chunk_size)
        if len(chunks) > args.chunks:
            chunks = chunks[:args.chunks]
        print(f"  Prepared {len(chunks)} chunks")
        if not chunks:
            print("  WARNING: no chunks were produced from the calibration text; "
                  "the imatrix will be empty")
        print()

        # ── Forward pass ──
        print("  Running forward passes...")
        use_linear = not args.quadratic_attn
        runner = TransformerRunner(model, config, verbose=args.verbose, linear_attn=use_linear)
        if use_linear:
            print(f"  Attention mode: HPC-linearized O(seq) — chunk_size={args.chunk_size}")
        else:
            print(f"  Attention mode: full O(seq²) softmax — chunk_size={args.chunk_size}")

        n_total = len(chunks)
        n_ok = 0
        bw = 40
        fwd_start = time.time()
        for i, chunk in enumerate(chunks):
            now = time.time()
            elapsed = now - start_time
            # Base the ETA on forward-pass time only; including the model
            # and tokenizer load time would inflate the per-chunk estimate.
            eta = (now - fwd_start) / i * (n_total - i) if i > 0 else 0
            pct = (i + 1) / n_total * 100
            filled = int(bw * (i + 1) / n_total)
            bar = '█' * filled + '░' * (bw - filled)
            sys.stdout.write(
                f"\r  [{bar}] {pct:5.1f}% ({i+1}/{n_total}) "
                f"{elapsed:.0f}s ETA:{eta:.0f}s")
            sys.stdout.flush()
            try:
                runner.forward(chunk)
            except Exception as e:
                # Best effort: keep the statistics gathered from other chunks.
                print(f"\n  WARNING: Chunk {i} failed: {e}")
                continue
            n_ok += 1
        print(f"\n  Collected importance for {len(runner.importance)} tensors")
        if n_ok < n_total:
            print(f"  WARNING: {n_total - n_ok} of {n_total} chunks failed")
        print()

        # ── HPC propagation ──
        if not args.no_hpc:
            print("  Running HPC cross-layer importance propagation...")
            importance = hpc_propagate_importance(
                runner.importance, config['n_layers'], verbose=args.verbose)
        else:
            importance = runner.importance

        # ── Write output ──
        print(f"\n  Writing imatrix: {args.output}")
        n_entries = write_imatrix(args.output, importance)
        elapsed = time.time() - start_time
        out_size = os.path.getsize(args.output)
        kb_text = f"{out_size/1024:.1f}"
        kb_pad = ' ' * (25 - len(kb_text))
        print()
        print("  ╔════════════════════════════════════════════════════════════════╗")
        print("  ║  IMATRIX GENERATION COMPLETE                                 ║")
        print("  ╠════════════════════════════════════════════════════════════════╣")
        print(f"  ║  Tensor entries:   {n_entries:<42d} ║")
        # Report chunks that actually completed, not merely those attempted.
        print(f"  ║  Chunks processed: {n_ok:<42d} ║")
        print(f"  ║  Output size:      {out_size:>11,} bytes ({kb_text} KB){kb_pad}║")
        print(f"  ║  Total time:       {elapsed:>38.1f} sec ║")
        print("  ╚════════════════════════════════════════════════════════════════╝")
        print()
        print(f"  Output: {args.output}")
        print()
    finally:
        # Release the model's file resources on both success and failure.
        model.close()


if __name__ == '__main__':
    main()