Delete generate_imatrix.py
Browse files- generate_imatrix.py +0 -1733
generate_imatrix.py
DELETED
|
@@ -1,1733 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
HExState Importance Matrix Generator — HPC-Enhanced iMatrix from GGUF
|
| 4 |
-
|
| 5 |
-
Runs transformer forward passes over calibration text to collect per-channel
|
| 6 |
-
E[x²] activation statistics, then uses HPC triality BP to propagate importance
|
| 7 |
-
across layers. Outputs llama.cpp-compatible .dat imatrix files.
|
| 8 |
-
|
| 9 |
-
Usage:
|
| 10 |
-
python3 generate_imatrix.py model.gguf calibration.txt -o imatrix.dat
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
import struct
|
| 14 |
-
import sys
|
| 15 |
-
import os
|
| 16 |
-
import time
|
| 17 |
-
import mmap
|
| 18 |
-
import ctypes
|
| 19 |
-
import numpy as np
|
| 20 |
-
from collections import OrderedDict
|
| 21 |
-
|
| 22 |
-
# ─── Constants ──────────────────────────────────────────────────────────────
|
| 23 |
-
GGUF_MAGIC = 0x46554747  # b"GGUF" read as a little-endian uint32
ALIGNMENT = 32           # default alignment of the tensor data section
QK_K = 256               # elements per K-quant super-block
QK4_0 = 32               # elements per Q4_0 block
QK8_0 = 32               # elements per Q8_0 block

# ggml tensor type ids used by the dequantizers in this file.
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q8_0 = 8
GGML_TYPE_Q2_K = 10
GGML_TYPE_BF16 = 30

# ggml type id -> number of elements stored per block.
TYPE_BLOCK_SIZE = {
    0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
    8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
    13: 256, 14: 256, 15: 256, 30: 1,
}

# ggml type id -> bytes per block.
# Q5_0 (id 6) is fp16 scale + 4 high-bit bytes + 16 nibble bytes = 22, and
# Q5_1 (id 7) adds an fp16 minimum = 24; the previous values (20 / 22)
# under-counted the payload of tensors stored in those formats.
TYPE_BLOCK_BYTES = {
    0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
    8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
    13: 176, 14: 210, 15: 292, 30: 2,
}

# ggml type id -> display name (used in diagnostics for unsupported types).
TYPE_NAME = {
    0: "F32", 1: "F16", 2: "Q4_0", 3: "Q4_1", 6: "Q5_0", 7: "Q5_1",
    8: "Q8_0", 9: "Q8_1", 10: "Q2_K", 11: "Q3_K", 12: "Q4_K",
    13: "Q5_K", 14: "Q6_K", 15: "Q8_K", 30: "BF16",
}
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
# ─── GGUF Reader ────────────────────────────────────────────────────────────
|
| 52 |
-
|
| 53 |
-
def align_offset(offset, alignment=None):
    """Round *offset* up to the next multiple of *alignment*.

    Args:
        offset: Non-negative byte offset.
        alignment: Positive alignment in bytes.  Defaults to the module-level
            ALIGNMENT; pass the file's ``general.alignment`` value when the
            GGUF metadata overrides it.

    Returns:
        The smallest multiple of *alignment* that is >= *offset*.

    Raises:
        ValueError: If *alignment* is not positive.
    """
    if alignment is None:
        alignment = ALIGNMENT
    if alignment <= 0:
        raise ValueError(f"alignment must be positive, got {alignment}")
    # Integer ceil-division works for any positive alignment, whereas the
    # usual bit-mask trick is only correct for powers of two.
    return (offset + alignment - 1) // alignment * alignment
|
| 55 |
-
|
| 56 |
-
def read_string(f):
    """Read one GGUF string: a little-endian uint64 length, then UTF-8 bytes.

    Args:
        f: Binary file-like object positioned at the start of the string.

    Returns:
        The decoded text; undecodable bytes become U+FFFD.

    Raises:
        struct.error: If fewer than 8 bytes remain for the length prefix.
        ValueError: If the stream ends before the declared payload length,
            which indicates a truncated or corrupt file.
    """
    slen = struct.unpack('<Q', f.read(8))[0]
    payload = f.read(slen)
    # A short read used to be returned as a silently truncated string, which
    # de-synchronised every field parsed after it.
    if len(payload) != slen:
        raise ValueError(
            f"Truncated GGUF string: expected {slen} bytes, got {len(payload)}")
    return payload.decode('utf-8', errors='replace')
|
| 59 |
-
|
| 60 |
-
def read_kv_value(f, vtype):
    """Decode a single metadata value of GGUF value type *vtype* from *f*.

    Scalars are returned as Python ints/floats, type 7 as bool, type 8 as
    str, and type 9 (array) as a list whose items are decoded recursively
    using the element type stored in the array header.

    Raises:
        ValueError: If *vtype* is not a known value type id.
    """
    # Fixed-width little-endian scalars, keyed by GGUF value type id.
    scalar_formats = {
        0: '<B', 1: '<b', 2: '<H', 3: '<h', 4: '<I', 5: '<i',
        6: '<f', 10: '<Q', 11: '<q', 12: '<d',
    }
    fmt = scalar_formats.get(vtype)
    if fmt is not None:
        return struct.unpack(fmt, f.read(struct.calcsize(fmt)))[0]

    if vtype == 7:
        return bool(struct.unpack('<B', f.read(1))[0])

    if vtype == 8:
        return read_string(f)

    if vtype == 9:
        # Array header: uint32 element type, then uint64 element count.
        item_type = struct.unpack('<I', f.read(4))[0]
        item_count = struct.unpack('<Q', f.read(8))[0]
        items = []
        for _ in range(item_count):
            items.append(read_kv_value(f, item_type))
        return items

    raise ValueError(f"Unknown KV type {vtype}")
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
class GGUFModel:
    """Loads a GGUF model with mmap'd tensor access.

    The constructor parses the header (metadata key/values and the tensor
    directory); tensor payloads are read from the memory map on demand.
    Call close() when finished, or use the instance as a context manager.

    Attributes:
        path: Path the model was opened from.
        file_size: Size of the file in bytes.
        version: GGUF format version read from the header.
        kv: Metadata key -> decoded value.
        tensor_infos: Ordered tensor name -> dict with 'dims', 'n_dims',
            'type', 'offset', 'n_elements' and 'data_size'.
        data_offset: Absolute file offset at which tensor data starts.
    """

    def __init__(self, path):
        self.path = path
        self.file_size = os.path.getsize(path)
        self.kv = {}
        self.tensor_infos = OrderedDict()
        self.data_offset = 0
        self.version = 0

        self._f = open(path, 'rb')
        self._mm = None
        try:
            self._mm = mmap.mmap(self._f.fileno(), 0, access=mmap.ACCESS_READ)
            self._parse_header()
        except BaseException:
            # Do not leak the descriptor / mapping when the file turns out
            # not to be a readable GGUF model.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()
        return False

    def _parse_header(self):
        """Parse magic, version, metadata and the tensor directory.

        Raises:
            ValueError: On a bad magic number or an unsupported version.
        """
        f = self._f
        f.seek(0)
        magic = struct.unpack('<I', f.read(4))[0]
        # Explicit raise: an assert would disappear under ``python -O``.
        if magic != GGUF_MAGIC:
            raise ValueError(f"Bad GGUF magic: 0x{magic:08X}")
        self.version = struct.unpack('<I', f.read(4))[0]
        # Version 1 encoded counts and lengths as uint32; the uint64 reads
        # below would mis-parse such a file instead of failing cleanly.
        if self.version < 2:
            raise ValueError(f"Unsupported GGUF version {self.version}")
        n_tensors = struct.unpack('<Q', f.read(8))[0]
        n_kv = struct.unpack('<Q', f.read(8))[0]

        # Read KV pairs
        for _ in range(n_kv):
            key = read_string(f)
            vtype = struct.unpack('<I', f.read(4))[0]
            self.kv[key] = read_kv_value(f, vtype)

        # Read tensor info
        for _ in range(n_tensors):
            name = read_string(f)
            n_dims = struct.unpack('<I', f.read(4))[0]
            dims = [struct.unpack('<Q', f.read(8))[0] for _ in range(n_dims)]
            ttype = struct.unpack('<I', f.read(4))[0]
            offset = struct.unpack('<Q', f.read(8))[0]
            n_elements = 1
            for d in dims:
                n_elements *= d
            # Unknown type ids fall back to "1 element per 4-byte block".
            blk_sz = TYPE_BLOCK_SIZE.get(ttype, 1)
            blk_bytes = TYPE_BLOCK_BYTES.get(ttype, 4)
            n_blocks = (n_elements + blk_sz - 1) // blk_sz
            self.tensor_infos[name] = {
                'dims': dims, 'n_dims': n_dims, 'type': ttype,
                'offset': offset, 'n_elements': n_elements,
                'data_size': n_blocks * blk_bytes,
            }

        # The data section starts at the next aligned offset.  Files may
        # override the default alignment through ``general.alignment``.
        alignment = self.kv.get('general.alignment', ALIGNMENT)
        if not isinstance(alignment, int) or alignment <= 0:
            alignment = ALIGNMENT
        self.data_offset = (f.tell() + alignment - 1) // alignment * alignment

    def get_arch(self):
        """Return ``general.architecture``, or a guess from tensor names."""
        arch = self.kv.get('general.architecture')
        if not arch:
            # Try to infer from tensor names
            if any('attn_gate' in n for n in self.tensor_infos):
                return 'gemma2'
            return 'llama'
        return arch

    def get_config(self):
        """Collect the hyper-parameters used by the forward pass.

        Returns:
            A dict with keys 'arch', 'n_layers', 'n_embd', 'n_head',
            'n_head_kv', 'n_ff', 'vocab_size', 'rms_eps', 'rope_base',
            'swa_window', 'head_dim', 'expert_count', 'expert_used_count'.
            Keys missing from the metadata take the defaults shown below.
        """
        arch = self.get_arch()
        n_embd = self.kv.get(f'{arch}.embedding_length', 0)
        n_head = self.kv.get(f'{arch}.attention.head_count', 0)
        n_head_kv = self.kv.get(f'{arch}.attention.head_count_kv', 0)

        # Auto-detect head_dim: prefer the value derived from attn_gate over
        # n_embd / n_head (Qwen 3.6 has hybrid 10240 QKV output but attn_gate
        # requires 6144; 6144 / 24 heads = 256 real head_dim).
        head_dim = 0
        gate_name = 'blk.0.attn_gate.weight'
        if gate_name in self.tensor_infos:
            gate_dims = self.tensor_infos[gate_name]['dims']
            # NOTE(review): dims[1] is the row count under the convention
            # documented in get_tensor_shape, although the original comment
            # called it the input dim — confirm against a real model.
            if n_head > 0 and len(gate_dims) > 1:
                head_dim = gate_dims[1] // n_head
        if head_dim == 0 and n_head > 0:
            head_dim = n_embd // n_head

        return {
            'arch': arch,
            'n_layers': self.kv.get(f'{arch}.block_count', 0),
            'n_embd': n_embd,
            'n_head': n_head,
            'n_head_kv': n_head_kv,
            'n_ff': self.kv.get(f'{arch}.feed_forward_length', 0),
            'vocab_size': self.kv.get(f'{arch}.vocab_size', 0),
            'rms_eps': self.kv.get(f'{arch}.attention.layer_norm_rms_epsilon', 1e-6),
            'rope_base': self.kv.get(f'{arch}.rope.freq_base', 10000.0),
            'swa_window': self.kv.get(f'{arch}.attention.sliding_window', 0),
            'head_dim': head_dim,
            'expert_count': self.kv.get(f'{arch}.expert_count', 0),
            'expert_used_count': self.kv.get(f'{arch}.expert_used_count', 0),
        }

    def get_tensor_f32(self, name):
        """Load a tensor as a flat float32 array, dequantizing if needed.

        Returns:
            The array, or None when the tensor does not exist or its type
            cannot be dequantized (a message is printed in the latter case).
        """
        if name not in self.tensor_infos:
            return None
        ti = self.tensor_infos[name]
        abs_offset = self.data_offset + ti['offset']
        raw = bytes(self._mm[abs_offset:abs_offset + ti['data_size']])
        try:
            return dequantize(raw, ti['type'], ti['n_elements'])
        except ValueError as e:
            print(f" Error dequantizing {name}: {e}")
            return None

    def get_tensor_shape(self, name):
        """Return the tensor's shape as a tuple, or None if it is unknown."""
        if name not in self.tensor_infos:
            return None
        dims = self.tensor_infos[name]['dims']
        # GGUF stores dims in reverse order (row-major): dims[0]=cols, dims[1]=rows
        return tuple(reversed(dims))

    def close(self):
        """Release the memory map and the file handle."""
        if self._mm is not None:
            self._mm.close()
        self._f.close()
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
# ─── Dequantization ─────────────────────────────────────────────────────────
|
| 205 |
-
|
| 206 |
-
def dequantize(raw, ttype, n_elements):
    """Convert a tensor's raw bytes into a flat float32 numpy array.

    Args:
        raw: The tensor payload.
        ttype: ggml tensor type id.
        n_elements: Number of logical elements (used by block formats).

    Raises:
        ValueError: If *ttype* has no decoder here.
    """
    # Plain floating-point layouts are reinterpreted directly.
    if ttype == GGML_TYPE_F32:
        return np.frombuffer(raw, dtype=np.float32).copy()
    if ttype == GGML_TYPE_F16:
        return np.frombuffer(raw, dtype=np.float16).astype(np.float32)
    if ttype == GGML_TYPE_BF16:
        # bfloat16 is the upper half of a float32 bit pattern.
        halves = np.frombuffer(raw, dtype=np.uint16)
        widened = halves.astype(np.uint32) << 16
        return widened.view(np.float32).copy()

    # Block-quantized layouts each have a dedicated decoder.
    block_decoders = {
        GGML_TYPE_Q8_0: dequant_q8_0,
        GGML_TYPE_Q4_0: dequant_q4_0,
        GGML_TYPE_Q2_K: dequant_q2k,
    }
    decoder = block_decoders.get(ttype)
    if decoder is None:
        raise ValueError(f"Unsupported quant type {ttype} ({TYPE_NAME.get(ttype, '?')})")
    return decoder(raw, n_elements)
|
| 223 |
-
|
| 224 |
-
def dequant_q8_0(raw, n_elements):
    """Dequantize Q8_0 data to a flat float32 array of *n_elements* values.

    Each 34-byte block holds a little-endian fp16 scale followed by 32
    signed 8-bit quants; a value is ``scale * quant``.

    Raises:
        ValueError: If *raw* is too short for the blocks *n_elements* needs.
    """
    # A packed structured dtype reads both fields without reinterpreting a
    # strided uint8 slice, which older NumPy releases reject in ``.view``.
    block = np.dtype([('d', '<f2'), ('qs', 'i1', (32,))])
    # Ceil: a trailing partial block still occupies a whole block on disk.
    n_blocks = -(-n_elements // 32)
    blocks = np.frombuffer(raw, dtype=block, count=n_blocks)
    scale = blocks['d'].astype(np.float32)[:, np.newaxis]
    quants = blocks['qs'].astype(np.float32)
    return (scale * quants).reshape(-1)[:n_elements]
|
| 230 |
-
|
| 231 |
-
def dequant_q4_0(raw, n_elements):
    """Dequantize Q4_0 data to a flat float32 array of *n_elements* values.

    Each 18-byte block holds a little-endian fp16 scale and 16 bytes of
    packed 4-bit quants; a value is ``scale * (nibble - 8)``.

    Raises:
        ValueError: If *raw* is too short for the blocks *n_elements* needs.
    """
    block = np.dtype([('d', '<f2'), ('qs', 'u1', (16,))])
    # Ceil: a trailing partial block still occupies a whole block on disk.
    n_blocks = -(-n_elements // 32)
    blocks = np.frombuffer(raw, dtype=block, count=n_blocks)
    scale = blocks['d'].astype(np.float32)[:, np.newaxis]
    packed = blocks['qs']
    low = (packed & 0x0F).astype(np.float32) - 8.0
    high = (packed >> 4).astype(np.float32) - 8.0
    # ggml layout: byte j carries element j in its low nibble and element
    # j + 16 in its high nibble, so the halves are concatenated rather than
    # interleaved.
    values = np.concatenate([low, high], axis=1)
    return (scale * values).reshape(-1)[:n_elements]
|
| 241 |
-
|
| 242 |
-
def dequant_q2k(raw, n_elements):
    """Dequantize Q2_K data to a flat float32 array of *n_elements* values.

    Each 84-byte super-block covers 256 values: 16 scale bytes (low nibble
    is the scale, high nibble the minimum), 64 bytes of packed 2-bit
    quants, then fp16 ``d`` and fp16 ``dmin``.  For the 16-value group g a
    value is ``d * scale[g] * q - dmin * min[g]``.

    Raises:
        ValueError: If *raw* is too short for the blocks *n_elements* needs.
    """
    block = np.dtype([
        ('scales', 'u1', (16,)),
        ('qs', 'u1', (64,)),
        ('d', '<f2'),
        ('dmin', '<f2'),
    ])
    # Ceil: a trailing partial block still occupies a whole block on disk.
    n_blocks = -(-n_elements // 256)
    blocks = np.frombuffer(raw, dtype=block, count=n_blocks)

    scales = blocks['scales']
    d = blocks['d'].astype(np.float32)[:, np.newaxis]
    dmin = blocks['dmin'].astype(np.float32)[:, np.newaxis]
    group_scale = d * (scales & 0x0F).astype(np.float32)   # [n_blocks, 16]
    group_min = dmin * (scales >> 4).astype(np.float32)    # [n_blocks, 16]

    # Each 32-byte half of qs yields 128 values: four 2-bit planes
    # (shift 0, 2, 4, 6) of 32 values each, emitted plane after plane.
    qs = blocks['qs'].reshape(n_blocks, 2, 1, 32)
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8).reshape(1, 1, 4, 1)
    quants = (qs >> shifts) & 3                             # [n_blocks, 2, 4, 32]

    # Flattened position is half*128 + plane*32 + k, so consecutive runs of
    # 16 values line up with scale index half*8 + plane*2 + k//16.  Every
    # scale byte therefore governs 16 values, not 32.
    quants = quants.reshape(n_blocks, 16, 16).astype(np.float32)
    values = group_scale[:, :, np.newaxis] * quants - group_min[:, :, np.newaxis]
    return values.reshape(-1)[:n_elements]
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
# ─── Tokenizer ──────────────────────────────────────────────────────────────
|
| 270 |
-
|
| 271 |
-
class SimpleTokenizer:
    """Minimal BPE tokenizer from GGUF metadata, with HPC acceleration.

    The vocabulary and merge rules come from the model's ``tokenizer.ggml.*``
    metadata.  When the model carries no merges, a ``tokenizer.json`` next to
    the model file is consulted instead.  If ``libhexstate_q2k.so`` sits next
    to this script and exports ``hexstate_bpe_tokenize``, encoding is
    delegated to it; otherwise a pure-Python merge loop is used.
    """

    def __init__(self, model):
        """Build lookup tables from *model* (needs ``.kv`` and ``._f.name``)."""
        self.model_path = model._f.name
        self.tokens = model.kv.get('tokenizer.ggml.tokens', [])
        self.vocab_size = len(self.tokens)
        merges_raw = model.kv.get('tokenizer.ggml.merges', [])

        # Override for models (like Mistral v0.3) where gguf merges are missing
        if not merges_raw:
            merges_raw = self._load_sidecar_merges()

        self.bos_id = model.kv.get('tokenizer.ggml.bos_token_id', 2)
        self.eos_id = model.kv.get('tokenizer.ggml.eos_token_id', 1)

        # Build token -> id map (a later duplicate wins, as before)
        self.token_to_id = {
            t: i for i, t in enumerate(self.tokens) if isinstance(t, str)
        }

        # Merge priority for the Python fallback, plus an id-resolved list
        # of (left_id, right_id, merged_id, rank) for the C bridge.
        self.merges = {}
        self._merge_list = []
        for rank, entry in enumerate(merges_raw):
            pair = self._split_merge(entry)
            if pair is None:
                continue
            left, right = pair
            self.merges[(left, right)] = rank
            a_id = self.token_to_id.get(left, -1)
            b_id = self.token_to_id.get(right, -1)
            merged_id = self.token_to_id.get(left + right, -1)
            if a_id >= 0 and b_id >= 0 and merged_id >= 0:
                self._merge_list.append((a_id, b_id, merged_id, rank))

        # Try to load HPC library for accelerated BPE
        self._hpc_lib = None
        try:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
            if os.path.exists(lib_path):
                lib = ctypes.CDLL(lib_path)
                if hasattr(lib, 'hexstate_bpe_tokenize'):
                    self._hpc_lib = lib
                    print(f" HPC·BPE engine loaded ({len(self._merge_list)} merge rules)")
                else:
                    print(" HPC library found but missing hexstate_bpe_tokenize — rebuild needed")
        except Exception as e:
            print(f" HPC·BPE not available: {e}")

    @staticmethod
    def _split_merge(entry):
        """Return a merge rule as a (left, right) tuple, or None if malformed.

        Accepts both the ``"left right"`` string form and the two-element
        list form that newer ``tokenizer.json`` files use.
        """
        if isinstance(entry, str):
            parts = entry.split(' ', 1)
            return (parts[0], parts[1]) if len(parts) == 2 else None
        if (isinstance(entry, (list, tuple)) and len(entry) == 2
                and all(isinstance(p, str) for p in entry)):
            return (entry[0], entry[1])
        return None

    def _load_sidecar_merges(self):
        """Best-effort read of merges from a tokenizer.json beside the model."""
        import json
        tok_path = None
        try:
            model_dir = os.path.dirname(os.path.abspath(self.model_path))
            tok_path = os.path.join(model_dir, 'tokenizer.json')
            if not os.path.exists(tok_path):
                return []
            # tokenizer.json is UTF-8; do not depend on the locale encoding.
            with open(tok_path, 'r', encoding='utf-8') as f:
                tok_data = json.load(f)
            merges = tok_data.get('model', {}).get('merges', [])
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # Still best-effort, but say why the fallback was not used.
            print(f" Could not load merges from {tok_path}: {e}")
            return []
        if merges:
            print(f" Injected {len(merges)} merges from local tokenizer.json!")
        return merges or []

    def encode(self, text):
        """Encode text to token IDs using BPE (HPC-accelerated when available).

        Returns:
            A list of ids starting with the BOS id.  Pieces missing from the
            vocabulary map to id 0.
        """
        if not text:
            return [self.bos_id]

        # Convert to byte-level tokens (SentencePiece style: ▁ = space)
        text = text.replace(' ', '▁')
        if not text.startswith('▁'):
            text = '▁' + text

        if self._hpc_lib and self._merge_list:
            return self._encode_hpc(text)
        return self._encode_python(text)

    def _encode_hpc(self, text):
        """Tokenize the prepared *text* through the C library."""
        t0 = time.time()
        print(f" HPC·BPE: tokenizing {len(text):,} chars...")

        # Convert characters to initial token IDs
        char_ids = np.array(
            [self.token_to_id.get(c, 0) for c in text],
            dtype=np.int32)

        # Merge table as a flat int32 buffer: 4 values (16 bytes) per rule.
        n_merges = len(self._merge_list)
        merge_buf = np.array(self._merge_list, dtype=np.int32).reshape(-1)

        # Output buffer: merging can only shrink the sequence.
        output_ids = np.zeros(len(char_ids), dtype=np.int32)
        n_tokens = ctypes.c_int64(0)

        self._hpc_lib.hexstate_bpe_tokenize(
            char_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.c_int64(len(char_ids)),
            merge_buf.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int32(n_merges),
            output_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.byref(n_tokens),
            ctypes.c_int(1),  # verbose
        )

        elapsed = time.time() - t0
        ids = [self.bos_id] + output_ids[:n_tokens.value].tolist()
        print(f" HPC·BPE: {len(text):,} chars → {n_tokens.value:,} tokens [{elapsed:.1f}s]")
        return ids

    def _encode_python(self, text):
        """Tokenize the prepared *text* with the pure-Python merge loop."""
        tokens = list(text)
        initial_len = len(tokens)
        pass_num = 0
        t0 = time.time()
        no_rank = float('inf')

        # Each pass finds the best-ranked adjacent pair and merges ALL of its
        # occurrences, until no adjacent pair has a merge rule.
        while len(tokens) > 1:
            best_pair = None
            best_rank = no_rank
            for pair in zip(tokens, tokens[1:]):
                rank = self.merges.get(pair, no_rank)
                if rank < best_rank:
                    best_rank = rank
                    best_pair = pair
            if best_pair is None:
                break

            a, b = best_pair
            prev_len = len(tokens)
            new_tokens = []
            i = 0
            while i < prev_len:
                if i < prev_len - 1 and tokens[i] == a and tokens[i + 1] == b:
                    new_tokens.append(a + b)
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
            pass_num += 1
            if pass_num % 10 == 0:
                elapsed = time.time() - t0
                merged = prev_len - len(tokens)
                sys.stdout.write(
                    f"\r BPE pass {pass_num}: {len(tokens):,} tokens "
                    f"(-{merged} merged, {len(tokens)/initial_len*100:.1f}%) "
                    f"[{elapsed:.1f}s] ")
                sys.stdout.flush()
        if pass_num >= 10:
            elapsed = time.time() - t0
            print(f"\r Tokenized: {pass_num} passes, {initial_len:,} chars → "
                  f"{len(tokens):,} tokens [{elapsed:.1f}s]" + " " * 30)

        # Convert to IDs
        return [self.bos_id] + [self.token_to_id.get(t, 0) for t in tokens]

    def chunk_text(self, text, chunk_size=512):
        """Encode text and split into fixed-length chunks.

        Consecutive chunks start three quarters of a chunk apart, i.e. they
        overlap by roughly 25%.  Tokens after the last full chunk are not
        emitted.  Text shorter than one chunk yields a single chunk padded
        with the EOS id.

        Raises:
            ValueError: If *chunk_size* is not positive.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        ids = self.encode(text)
        # Clamp to 1: for chunk_size == 1 the integer division gives 0,
        # which range() rejects as a step.
        stride = max(1, chunk_size * 3 // 4)
        chunks = [
            np.array(ids[i:i + chunk_size], dtype=np.int32)
            for i in range(0, len(ids) - chunk_size + 1, stride)
        ]

        if not chunks and ids:
            # Pad short text
            padded = ids + [self.eos_id] * (chunk_size - len(ids))
            chunks.append(np.array(padded[:chunk_size], dtype=np.int32))
        return chunks
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
# ─── Transformer Forward Pass ───────────────────────────────────────────────
|
| 458 |
-
|
| 459 |
-
def rms_norm(x, weight, eps=1e-6):
    """Scale *x* by the inverse RMS of its last axis, then by *weight*."""
    mean_square = np.mean(x * x, axis=-1, keepdims=True)
    denom = np.sqrt(mean_square + eps)
    return (x / denom) * weight
|
| 462 |
-
|
| 463 |
-
def rope_freqs(dim, seq_len, base=10000.0):
    """Return (cos, sin) rotary tables, each of shape [seq_len, dim/2]."""
    exponents = np.arange(0, dim, 2, dtype=np.float32) / dim
    inv_freq = 1.0 / (base ** exponents)
    positions = np.arange(seq_len, dtype=np.float32)
    angles = np.outer(positions, inv_freq)  # [seq_len, dim/2]
    return np.cos(angles), np.sin(angles)
|
| 468 |
-
|
| 469 |
-
def apply_rope(x, cos_f, sin_f):
    """Rotate the two halves of x's last axis by the given cos/sin tables.

    The tables are cropped to x's first-axis length and to half of its last
    axis; for 3-D input they are broadcast across the middle axis.
    """
    # x: [seq_len, n_heads, head_dim]
    half = x.shape[-1] // 2
    n_pos = x.shape[0]
    lower, upper = x[..., :half], x[..., half:]
    cos_part = cos_f[:n_pos, :half]
    sin_part = sin_f[:n_pos, :half]
    if x.ndim == 3:
        cos_part = cos_part[:, np.newaxis, :]
        sin_part = sin_part[:, np.newaxis, :]
    rotated_lower = lower * cos_part - upper * sin_part
    rotated_upper = upper * cos_part + lower * sin_part
    return np.concatenate([rotated_lower, rotated_upper], axis=-1)
|
| 482 |
-
|
| 483 |
-
def softmax(x, axis=-1):
    """Softmax along *axis*, shifted by the max so large inputs don't overflow."""
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    total = np.sum(weights, axis=axis, keepdims=True)
    return weights / total
|
| 487 |
-
|
| 488 |
-
def silu(x):
    """SiLU / Swish activation: x * sigmoid(x), with the exponent input clipped."""
    clipped = np.clip(x, -88, 88)
    sigmoid = 1.0 / (1.0 + np.exp(-clipped))
    return x * sigmoid
|
| 491 |
-
|
| 492 |
-
def gelu(x):
    """GELU activation, tanh approximation."""
    cubic = x + 0.044715 * x**3
    gate = 1.0 + np.tanh(np.sqrt(2.0 / np.pi) * cubic)
    return 0.5 * x * gate
|
| 495 |
-
|
| 496 |
-
# Architecture → activation function mapping
|
| 497 |
-
# Architecture name (``general.architecture``) -> FFN activation function.
# Architectures that are not listed use the caller's default.
# Falcon's MLP and Gemma 3's FFN use GELU rather than SiLU.
ACTIVATION_MAP = {
    'llama': silu, 'mistral': silu, 'qwen2': silu, 'qwen2moe': silu,
    'phi3': silu, 'deepseek': silu, 'deepseek2': silu,
    'falcon': gelu,
    'gemma': gelu, 'gemma2': gelu, 'gemma3': gelu, 'gpt2': gelu,
}
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
class TransformerRunner:
|
| 505 |
-
"""Minimal Gemma transformer for importance collection."""
|
| 506 |
-
|
| 507 |
-
def __init__(self, model, config, verbose=False, linear_attn=True):
    """Set up the runner and probe for the optional C forward-pass library.

    Args:
        model: Object providing get_tensor_f32() / get_tensor_shape().
        config: Hyper-parameter dict; needs 'arch', and either 'head_dim'
            or both 'n_embd' and a non-zero 'n_head'.
        verbose: Print a line when the C library is loaded or unavailable.
        linear_attn: Stored as ``self.linear_attn``; not used in this method.

    Raises:
        ValueError: If 'head_dim' is absent and 'n_head' is zero.
    """
    self.model = model
    self.cfg = config
    self.verbose = verbose

    # Compute the fallback only when it is needed: as a dict.get() default
    # it was evaluated eagerly and divided by zero for n_head == 0 even
    # when the config already supplied 'head_dim'.
    head_dim = config.get('head_dim')
    if head_dim is None:
        n_head = config['n_head']
        if not n_head:
            raise ValueError("config needs 'head_dim' or a non-zero 'n_head'")
        head_dim = config['n_embd'] // n_head
    self.head_dim = head_dim

    self.act_fn = ACTIVATION_MAP.get(config['arch'], silu)
    self.linear_attn = linear_attn

    # Importance accumulators: tensor_name → (sum_x2, count)
    self.importance = {}

    # HPC C library for accelerated forward pass
    self._hpc_lib = None
    try:
        script_dir = os.path.dirname(os.path.abspath(__file__))
        lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
        if os.path.exists(lib_path):
            lib = ctypes.CDLL(lib_path)
            if hasattr(lib, 'hexstate_forward_layer'):
                self._hpc_lib = lib
                if verbose:
                    print(" HPC·Forward engine loaded (hexstate_forward_layer)")
    except Exception as e:
        # Still optional, but no longer silent when the user asked for
        # verbose output.
        if verbose:
            print(f" HPC·Forward not available: {e}")
|
| 531 |
-
|
| 532 |
-
def _record(self, name, x):
|
| 533 |
-
"""Record E[x²] for this tensor's input activation."""
|
| 534 |
-
# x shape: [..., n_cols] — record per-column (input channel)
|
| 535 |
-
x_flat = x.reshape(-1, x.shape[-1])
|
| 536 |
-
x2 = np.sum(x_flat ** 2, axis=0)
|
| 537 |
-
if name in self.importance:
|
| 538 |
-
self.importance[name] = (
|
| 539 |
-
self.importance[name][0] + x2,
|
| 540 |
-
self.importance[name][1] + x_flat.shape[0],
|
| 541 |
-
)
|
| 542 |
-
else:
|
| 543 |
-
self.importance[name] = (x2.copy(), x_flat.shape[0])
|
| 544 |
-
|
| 545 |
-
def _get_weight(self, name):
|
| 546 |
-
"""Load weight, trying GGUF name patterns."""
|
| 547 |
-
w = self.model.get_tensor_f32(name)
|
| 548 |
-
if w is None:
|
| 549 |
-
return None
|
| 550 |
-
shape = self.model.get_tensor_shape(name)
|
| 551 |
-
if shape and len(shape) >= 2:
|
| 552 |
-
return w.reshape(shape)
|
| 553 |
-
return w
|
| 554 |
-
|
| 555 |
-
def _layer_prefix(self, layer_idx):
|
| 556 |
-
return f"blk.{layer_idx}"
|
| 557 |
-
|
| 558 |
-
def _hpc_forward_layer(self, hidden, layer_idx):
    """Run one layer through the C routine ``hexstate_forward_layer``.

    Loads this layer's weights, wraps them and the importance accumulators
    as ctypes pointers, makes a single C call, then copies the accumulators
    back into ``self.importance``.  Weights that are absent are passed as
    NULL pointers with a dimension of 0.

    Args:
        hidden: Activations whose first axis is the sequence length.
            Presumably [seq_len, n_embd] — TODO confirm against the C side.
        layer_idx: Layer index used to build the ``blk.<idx>`` tensor names.

    Returns:
        The hidden state after the call.  When ``attn_norm.weight`` is
        missing the (float32, contiguous) input is returned untouched.
    """
    pfx = self._layer_prefix(layer_idx)
    cfg = self.cfg
    lib = self._hpc_lib
    seq_len = hidden.shape[0]
    n_embd = cfg['n_embd']
    n_head = cfg['n_head']
    n_head_kv = cfg['n_head_kv']
    # The KV head count may be given per layer as a list.
    if isinstance(n_head_kv, list):
        n_head_kv = n_head_kv[layer_idx]
    head_dim = self.head_dim
    eps = cfg['rms_eps']

    FP = ctypes.POINTER(ctypes.c_float)
    I64P = ctypes.POINTER(ctypes.c_int64)

    def _fp(arr):
        # Returns (float pointer, backing array); a NULL pointer for None.
        if arr is None: return ctypes.cast(None, FP), None
        a = np.ascontiguousarray(arr, dtype=np.float32)
        return a.ctypes.data_as(FP), a

    def _imp(name, dim):
        """Get or create the accumulator for *name*.

        Returns (data pointer, count reference, float32 array, c_int64 count).
        """
        if name not in self.importance:
            self.importance[name] = (np.zeros(dim, dtype=np.float32), 0)
        # The stored sums are handed to C as float32, whatever dtype they
        # were stored in.
        imp_arr = np.ascontiguousarray(self.importance[name][0], dtype=np.float32)
        cnt = ctypes.c_int64(self.importance[name][1])
        return imp_arr.ctypes.data_as(FP), ctypes.byref(cnt), imp_arr, cnt

    # Make hidden contiguous and get pointer
    hidden = np.ascontiguousarray(hidden, dtype=np.float32)
    h_ptr = hidden.ctypes.data_as(FP)

    # Load all weights for this layer
    norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
    if norm_w is None:
        # Without the attention norm weight the layer is skipped.
        return hidden

    qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
    q_w = self._get_weight(f'{pfx}.attn_q.weight')
    k_w = self._get_weight(f'{pfx}.attn_k.weight')
    v_w = self._get_weight(f'{pfx}.attn_v.weight')
    gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
    o_w = self._get_weight(f'{pfx}.attn_output.weight')
    # The pre-FFN norm is looked up under two tensor names.
    ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
    if ffn_norm_w is None:
        ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
    ffn_gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
    ffn_up_w = self._get_weight(f'{pfx}.ffn_up.weight')
    ffn_down_w = self._get_weight(f'{pfx}.ffn_down.weight')

    # Prepare ctypes args (keep refs to prevent GC)
    refs = []
    def fp(arr):
        p, a = _fp(arr)
        refs.append(a)
        return p

    norm_p = fp(norm_w)
    qkv_p = fp(qkv_w)
    q_p = fp(q_w)
    k_p = fp(k_w)
    v_p = fp(v_w)
    gate_p = fp(gate_w)
    o_p = fp(o_w)
    ffn_norm_p = fp(ffn_norm_w)
    ffn_gate_p = fp(ffn_gate_w)
    ffn_up_p = fp(ffn_up_w)
    ffn_down_p = fp(ffn_down_w)

    # Sizes passed next to each weight pointer; 0 marks a missing weight.
    qkv_dim = qkv_w.shape[0] if qkv_w is not None else 0
    q_dim_v = q_w.shape[0] if q_w is not None else 0
    k_dim_v = k_w.shape[0] if k_w is not None else 0
    v_dim_v = v_w.shape[0] if v_w is not None else 0
    gate_rows = gate_w.shape[0] if gate_w is not None else 0
    o_cols = o_w.shape[1] if (o_w is not None and o_w.ndim >= 2) else 0
    ffn_d = ffn_gate_w.shape[0] if ffn_gate_w is not None else 0

    # Importance accumulators
    imp_refs = []  # Keep alive
    null_fp = ctypes.cast(None, FP)
    null_i64p = ctypes.cast(None, I64P)

    def make_imp(name, dim):
        # dim <= 0 means the tensor is absent: pass NULLs, track nothing.
        if dim <= 0:
            return null_fp, null_i64p
        p, cp, arr, cnt = _imp(name, dim)
        imp_refs.append((name, arr, cnt))
        return p, cp

    # Accumulator length is the size of the activation feeding each weight.
    imp_qkv_p, cnt_qkv_p = make_imp(f'{pfx}.attn_qkv.weight', n_embd if qkv_w is not None else 0)
    imp_q_p, cnt_q_p = make_imp(f'{pfx}.attn_q.weight', n_embd if q_w is not None else 0)
    imp_k_p, cnt_k_p = make_imp(f'{pfx}.attn_k.weight', n_embd if k_w is not None else 0)
    imp_v_p, cnt_v_p = make_imp(f'{pfx}.attn_v.weight', n_embd if v_w is not None else 0)
    imp_gate_p, cnt_gate_p = make_imp(f'{pfx}.attn_gate.weight', n_head * head_dim if gate_w is not None else 0)
    imp_o_p, cnt_o_p = make_imp(f'{pfx}.attn_output.weight', o_cols if o_w is not None else 0)
    imp_fg_p, cnt_fg_p = make_imp(f'{pfx}.ffn_gate.weight', n_embd if ffn_gate_w is not None else 0)
    imp_fu_p, cnt_fu_p = make_imp(f'{pfx}.ffn_up.weight', n_embd if ffn_up_w is not None else 0)
    imp_fd_p, cnt_fd_p = make_imp(f'{pfx}.ffn_down.weight', ffn_d if ffn_down_w is not None else 0)

    # Call C function — entire layer in one call (FFN part will be NULL if MoE)
    # NOTE(review): the argument order must match the C prototype, which is
    # not visible here; no argtypes are declared, so ctypes relies on the
    # explicit c_int64 / c_float wrappers below.
    lib.hexstate_forward_layer(
        h_ptr,
        norm_p,
        qkv_p, ctypes.c_int64(qkv_dim),
        q_p, ctypes.c_int64(q_dim_v),
        k_p, ctypes.c_int64(k_dim_v),
        v_p, ctypes.c_int64(v_dim_v),
        gate_p, ctypes.c_int64(gate_rows),
        o_p, ctypes.c_int64(o_cols),
        ffn_norm_p,
        ffn_gate_p, ffn_up_p, ffn_down_p,
        ctypes.c_int64(ffn_d),
        imp_qkv_p, cnt_qkv_p,
        imp_q_p, cnt_q_p,
        imp_k_p, cnt_k_p,
        imp_v_p, cnt_v_p,
        imp_gate_p, cnt_gate_p,
        imp_o_p, cnt_o_p,
        imp_fg_p, cnt_fg_p,
        imp_fu_p, cnt_fu_p,
        imp_fd_p, cnt_fd_p,
        ctypes.c_int64(seq_len), ctypes.c_int64(n_embd),
        ctypes.c_int64(n_head), ctypes.c_int64(n_head_kv),
        ctypes.c_float(eps))

    # Read back importance for the tensors that WERE processed in C
    for name, arr, cnt in imp_refs:
        # Store the sums as float64 together with the count held in the
        # c_int64 that was passed by reference.
        self.importance[name] = (arr.astype(np.float64), cnt.value)

    # Handle MoE FFN if C code skipped it
    if ffn_gate_w is None:
        # Re-normalize for FFN
        # NOTE(review): ffn_norm_w may be None here if neither norm tensor
        # exists — confirm _hpc_rms_norm tolerates that.
        normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, eps)
        hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)

    # Force-free per-layer weight buffers (~1.4 GB) before next layer
    # NOTE(review): the weight locals (qkv_w, o_w, ...) are still bound at
    # this point, so the arrays may only be released when the function
    # returns — confirm this has the intended effect.
    del refs, imp_refs
    import gc; gc.collect()

    return hidden
|
| 705 |
-
|
| 706 |
-
def _forward_moe_ffn(self, hidden, normed_ff, pfx):
    """Run the mixture-of-experts FFN in Python and record importance.

    Supports packed expert tensors (``ffn_*_exps``), per-expert tensors
    (``ffn_*.{id}``) and an optional shared expert (``ffn_*_shexp``).

    Args:
        hidden: Residual stream the FFN output is added to.
        normed_ff: Normalised FFN input; indexed by token along axis 0.
        pfx: Tensor-name prefix of the layer.

    Returns:
        ``hidden`` unchanged when the layer has no ``ffn_gate_inp`` router,
        otherwise ``hidden + ff_out``.
    """
    gate_inp_w = self._get_weight(f'{pfx}.ffn_gate_inp.weight')
    if gate_inp_w is None:
        return hidden

    # Router: softmax over expert logits, keep the `topk` most probable.
    self._record(f'{pfx}.ffn_gate_inp.weight', normed_ff)
    router_logits = normed_ff @ gate_inp_w.T
    n_experts = router_logits.shape[-1]
    probs = softmax(router_logits, axis=-1)
    topk = self.cfg.get('expert_used_count', 2)
    top_k_indices = np.argsort(probs, axis=-1)[:, -topk:]

    ff_out = np.zeros_like(normed_ff)

    # Packed experts: one tensor holding every expert, indexed on axis 0.
    p_gate = self._get_weight(f'{pfx}.ffn_gate_exps.weight')
    p_up = self._get_weight(f'{pfx}.ffn_up_exps.weight')
    p_down = self._get_weight(f'{pfx}.ffn_down_exps.weight')
    # Only take the packed path when all three tensors exist; a partial
    # set previously raised TypeError on `None[exp_id]`.
    packed = p_gate is not None and p_up is not None and p_down is not None

    for exp_id in range(n_experts):
        # Tokens routed to this expert. Checked first so that weights of
        # experts that received no token are never loaded.
        mask_exp = np.any(top_k_indices == exp_id, axis=-1)
        if not np.any(mask_exp):
            continue

        if packed:
            gate_name = f'{pfx}.ffn_gate_exps.weight'
            up_name = f'{pfx}.ffn_up_exps.weight'
            down_name = f'{pfx}.ffn_down_exps.weight'
            ew_gate = p_gate[exp_id]
            ew_up = p_up[exp_id]
            ew_down = p_down[exp_id]
        else:
            gate_name = f'{pfx}.ffn_gate.{exp_id}.weight'
            up_name = f'{pfx}.ffn_up.{exp_id}.weight'
            down_name = f'{pfx}.ffn_down.{exp_id}.weight'
            ew_gate = self._get_weight(gate_name)
            ew_up = self._get_weight(up_name)
            ew_down = self._get_weight(down_name)
            # Skip incomplete experts instead of failing on `None.T`.
            if ew_gate is None or ew_up is None or ew_down is None:
                continue

        exp_input = normed_ff[mask_exp]

        # Gate and up projections see the same input activations.
        self._record(gate_name, exp_input)
        self._record(up_name, exp_input)

        g = self.act_fn(exp_input @ ew_gate.T)
        u = exp_input @ ew_up.T
        mid = g * u

        # The down projection sees the gated intermediate activations.
        self._record(down_name, mid)
        exp_out = mid @ ew_down.T

        # Weight each routed token by its router probability. A boolean
        # mask selects every row at most once, so the in-place add is safe.
        ff_out[mask_exp] += probs[mask_exp, exp_id][:, None] * exp_out

    # Shared expert: applied to every token, no routing weight.
    sh_gate = self._get_weight(f'{pfx}.ffn_gate_shexp.weight')
    if sh_gate is not None:
        sh_up = self._get_weight(f'{pfx}.ffn_up_shexp.weight')
        sh_down = self._get_weight(f'{pfx}.ffn_down_shexp.weight')
        if sh_up is not None and sh_down is not None:
            self._record(f'{pfx}.ffn_gate_shexp.weight', normed_ff)
            self._record(f'{pfx}.ffn_up_shexp.weight', normed_ff)
            g = self.act_fn(normed_ff @ sh_gate.T)
            u = normed_ff @ sh_up.T
            mid = g * u
            self._record(f'{pfx}.ffn_down_shexp.weight', mid)
            ff_out += mid @ sh_down.T

    return hidden + ff_out
|
| 780 |
-
|
| 781 |
-
def _hpc_rms_norm(self, x, weight, eps):
|
| 782 |
-
"""RMS norm via HPC C library, falling back to numpy."""
|
| 783 |
-
if self._hpc_lib and x.flags['C_CONTIGUOUS']:
|
| 784 |
-
seq_len, dim = x.shape
|
| 785 |
-
out = np.empty_like(x)
|
| 786 |
-
w = np.ascontiguousarray(weight, dtype=np.float32)
|
| 787 |
-
self._hpc_lib.hexstate_rms_norm(
|
| 788 |
-
x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 789 |
-
w.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 790 |
-
out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 791 |
-
ctypes.c_int64(seq_len), ctypes.c_int64(dim),
|
| 792 |
-
ctypes.c_float(eps))
|
| 793 |
-
return out
|
| 794 |
-
return rms_norm(x, weight, eps)
|
| 795 |
-
|
| 796 |
-
def _hpc_matmul_record(self, name, x, weight):
|
| 797 |
-
"""Fused matmul + importance recording via HPC C library.
|
| 798 |
-
|
| 799 |
-
Uses HPCGraph phase-coherent importance modulation (see hexstate_matmul_record in C)
|
| 800 |
-
for the E[x²] accumulation, but delegates the actual matmul to numpy BLAS
|
| 801 |
-
for maximum speed on large matrices.
|
| 802 |
-
Returns x @ weight.T while recording importance for `name`.
|
| 803 |
-
"""
|
| 804 |
-
if self._hpc_lib and x.flags['C_CONTIGUOUS'] and weight.flags['C_CONTIGUOUS']:
|
| 805 |
-
M, K = x.shape
|
| 806 |
-
N = weight.shape[0] # weight is [N, K], computing x @ W.T -> [M, N]
|
| 807 |
-
|
| 808 |
-
# HPC importance: C library builds HPCGraph over columns,
|
| 809 |
-
# encodes x² as triality amplitudes, CZ-couples adjacent columns,
|
| 810 |
-
# and modulates importance by hpc_marginal phase coherence.
|
| 811 |
-
if name not in self.importance:
|
| 812 |
-
self.importance[name] = (np.zeros(K, dtype=np.float64), 0)
|
| 813 |
-
imp_f32 = self.importance[name][0].astype(np.float32)
|
| 814 |
-
count = ctypes.c_int64(self.importance[name][1])
|
| 815 |
-
|
| 816 |
-
# Pass real weights to C library for importance recording
|
| 817 |
-
weight_ptr = weight.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
|
| 818 |
-
# Dummy output — we only want the importance recording
|
| 819 |
-
dummy_out = np.empty((M, 1), dtype=np.float32)
|
| 820 |
-
|
| 821 |
-
self._hpc_lib.hexstate_matmul_record(
|
| 822 |
-
x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 823 |
-
weight_ptr,
|
| 824 |
-
dummy_out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 825 |
-
imp_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
|
| 826 |
-
ctypes.c_int64(M), ctypes.c_int64(K), ctypes.c_int64(N),
|
| 827 |
-
ctypes.byref(count))
|
| 828 |
-
|
| 829 |
-
self.importance[name] = (imp_f32.astype(np.float64), count.value)
|
| 830 |
-
|
| 831 |
-
# Matmul via numpy BLAS (much faster than our C loop for large N)
|
| 832 |
-
return x @ weight.T
|
| 833 |
-
|
| 834 |
-
# Fallback: pure numpy
|
| 835 |
-
self._record(name, x)
|
| 836 |
-
return x @ weight.T
|
| 837 |
-
|
| 838 |
-
def forward_layer_linear(self, hidden, layer_idx):
    """Forward one layer using causal linear attention.

    Softmax attention is replaced by a running-state linear attention whose
    per-token cost does not depend on the number of previous tokens. The
    layer output is therefore an approximation; the method exists to feed
    activation statistics to the importance recorder.

    Handles fused ``attn_qkv`` layers (projected through ``attn_gate``),
    layers with separate ``attn_q/k/v/output`` tensors, optional
    ``ssm_alpha/beta/out`` tensors, and a dense or mixture-of-experts FFN.

    Args:
        hidden: Residual stream, ``[seq_len, n_embd]``.
        layer_idx: Index of the layer to run.

    Returns:
        The updated residual stream. ``hidden`` is returned unchanged when
        the attention norm or the attention projections are missing.
    """

    def _causal_linear_attention(q, k, v):
        """Causal linear attention over ``[seq, n_head, dim]`` inputs.

        Feature map is ``relu(x) + 1e-6``. Per step t:
            S_t = S_{t-1} + k_t (outer) v_t
            z_t = z_{t-1} + k_t
            out_t = (q_t @ S_t) / (q_t . z_t + 1e-8)
        When q and k head sizes differ, only the leading
        ``min(q_dim, k_dim)`` features of each are used.
        Returns ``[seq, n_head * v_dim]``.
        """
        n_tokens, heads = q.shape[0], q.shape[1]
        feat_dim = min(q.shape[-1], k.shape[-1])
        q_feat = np.maximum(q[..., :feat_dim], 0) + 1e-6
        k_feat = np.maximum(k[..., :feat_dim], 0) + 1e-6

        out = np.zeros_like(v)
        S = np.zeros((heads, feat_dim, v.shape[-1]), dtype=np.float32)
        z = np.zeros((heads, feat_dim), dtype=np.float32)

        for t in range(n_tokens):
            kf = k_feat[t]                        # [n_head, feat_dim]
            qf = q_feat[t]                        # [n_head, feat_dim]
            S += kf[:, :, None] * v[t][:, None, :]
            z += kf
            num = np.einsum('hd,hde->he', qf, S)  # [n_head, v_dim]
            den = np.sum(qf * z, axis=-1, keepdims=True) + 1e-8
            out[t] = num / den

        return out.reshape(n_tokens, -1)

    pfx = self._layer_prefix(layer_idx)
    cfg = self.cfg
    n_head = cfg['n_head']
    n_head_kv = cfg['n_head_kv']
    if isinstance(n_head_kv, list):
        n_head_kv = n_head_kv[layer_idx]
    seq_len = hidden.shape[0]

    # ── Attention norm ──
    attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
    if attn_norm_w is None:
        return hidden
    normed = self._hpc_rms_norm(hidden, attn_norm_w, cfg['rms_eps'])

    # ── Fused vs separate QKV ──
    qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
    gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
    q_w = self._get_weight(f'{pfx}.attn_q.weight')
    k_w = self._get_weight(f'{pfx}.attn_k.weight')
    v_w = self._get_weight(f'{pfx}.attn_v.weight')
    o_w = self._get_weight(f'{pfx}.attn_output.weight')

    qkv = None  # fused projection output, reused by the SSM path below

    if qkv_w is not None:
        # ── Fused QKV path ──
        head_dim = self.head_dim
        q_dim = n_head * head_dim
        kv_dim = n_head_kv * head_dim
        qkv = self._hpc_matmul_record(f'{pfx}.attn_qkv.weight', normed, qkv_w)
        q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
        k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
        v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)

        # GQA: repeat KV heads to match the query head count.
        if n_head_kv < n_head:
            rep = n_head // n_head_kv
            k = np.repeat(k, rep, axis=1)
            v = np.repeat(v, rep, axis=1)

        attn_result = _causal_linear_attention(q, k, v)

        if gate_w is not None:
            self._record(f'{pfx}.attn_gate.weight', attn_result)
            # Use whichever orientation of gate_w yields the hidden width.
            if gate_w.shape[1] == hidden.shape[-1]:
                attn_out = attn_result @ gate_w
            else:
                attn_out = attn_result @ gate_w.T
        else:
            attn_out = np.zeros_like(hidden)

    elif q_w is not None and k_w is not None and v_w is not None and o_w is not None:
        # ── Separate QKV path ──
        q = self._hpc_matmul_record(f'{pfx}.attn_q.weight', normed, q_w)
        k = self._hpc_matmul_record(f'{pfx}.attn_k.weight', normed, k_w)
        v = self._hpc_matmul_record(f'{pfx}.attn_v.weight', normed, v_w)

        # Head sizes are derived from the tensors, not from self.head_dim.
        head_dim_q = q_w.shape[0] // n_head
        head_dim_kv = k_w.shape[0] // n_head_kv

        q = q.reshape(seq_len, n_head, head_dim_q)
        k = k.reshape(seq_len, n_head_kv, head_dim_kv)
        v = v.reshape(seq_len, n_head_kv, head_dim_kv)

        if n_head_kv < n_head:
            rep = n_head // n_head_kv
            k = np.repeat(k, rep, axis=1)
            v = np.repeat(v, rep, axis=1)

        attn_result = _causal_linear_attention(q, k, v)

        # Zero-pad or truncate to the input width of the output projection.
        if attn_result.shape[-1] != o_w.shape[1]:
            if attn_result.shape[-1] < o_w.shape[1]:
                padded = np.zeros((seq_len, o_w.shape[1]), dtype=attn_result.dtype)
                padded[:, :attn_result.shape[-1]] = attn_result
                attn_result = padded
            else:
                attn_result = attn_result[:, :o_w.shape[1]]

        self._record(f'{pfx}.attn_output.weight', attn_result)
        attn_out = attn_result @ o_w.T
    else:
        return hidden

    hidden = hidden + attn_out

    # ── SSM path ──
    ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
    ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
    ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')
    if ssm_alpha_w is not None:
        self._record(f'{pfx}.ssm_alpha.weight', normed)
    if ssm_beta_w is not None:
        self._record(f'{pfx}.ssm_beta.weight', normed)
    if ssm_out_w is not None:
        # Proxy for the SSM output-projection input: the leading columns of
        # the fused QKV projection when it is wide enough, else `normed`.
        # The projection computed above is reused rather than recomputed.
        if qkv is not None and qkv.shape[-1] >= ssm_out_w.shape[1]:
            ssm_proxy = qkv[:, :ssm_out_w.shape[1]]
        else:
            ssm_proxy = normed
        self._record(f'{pfx}.ssm_out.weight', ssm_proxy)
        if ssm_out_w.shape[0] == hidden.shape[-1]:
            hidden = hidden + ssm_proxy @ ssm_out_w.T

    # ── FFN ──
    ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
    if ffn_norm_w is None:
        ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
    if ffn_norm_w is None:
        return hidden

    normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])

    gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
    up_w = self._get_weight(f'{pfx}.ffn_up.weight')
    down_w = self._get_weight(f'{pfx}.ffn_down.weight')

    if gate_fw is not None and up_w is not None and down_w is not None:
        gate_out = self.act_fn(self._hpc_matmul_record(f'{pfx}.ffn_gate.weight', normed_ff, gate_fw))
        up_out = self._hpc_matmul_record(f'{pfx}.ffn_up.weight', normed_ff, up_w)
        ff_mid = gate_out * up_out
        self._record(f'{pfx}.ffn_down.weight', ff_mid)
        ff_out = ff_mid @ down_w.T
        hidden = hidden + ff_out
    else:
        # No dense FFN tensors: take the MoE path like the other layer
        # forwards, so expert tensors also receive statistics. It returns
        # `hidden` unchanged when the layer has no router.
        hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)

    return hidden
|
| 1024 |
-
|
| 1025 |
-
def forward_layer(self, hidden, layer_idx, cos_f, sin_f):
    """Forward one standard transformer layer with softmax attention.

    Records the input activations of every projection it applies
    (``attn_q/k/v``, ``attn_output``, ``ffn_gate/up/down``) and hands the
    FFN to ``_forward_moe_ffn`` when the dense FFN tensors are absent.

    Args:
        hidden: Residual stream, ``[seq_len, n_embd]``.
        layer_idx: Index of the layer to run.
        cos_f: RoPE cosine table built for ``self.head_dim``.
        sin_f: RoPE sine table built for ``self.head_dim``.

    Returns:
        The updated residual stream. ``hidden`` is returned as-is when the
        attention norm or any attention projection is missing, and after
        attention only when ``ffn_norm`` is missing.
    """
    pfx = self._layer_prefix(layer_idx)
    cfg = self.cfg
    n_head = cfg['n_head']
    n_head_kv = cfg['n_head_kv']
    if isinstance(n_head_kv, list):
        n_head_kv = n_head_kv[layer_idx]
    head_dim = self.head_dim
    seq_len = hidden.shape[0]

    # ── Attention ──
    attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
    if attn_norm_w is None:
        return hidden  # Skip if weights missing

    normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])

    q_w = self._get_weight(f'{pfx}.attn_q.weight')
    k_w = self._get_weight(f'{pfx}.attn_k.weight')
    v_w = self._get_weight(f'{pfx}.attn_v.weight')
    o_w = self._get_weight(f'{pfx}.attn_output.weight')

    if q_w is None or k_w is None or v_w is None or o_w is None:
        return hidden

    # Importance is recorded on the projection INPUT (normed).
    self._record(f'{pfx}.attn_q.weight', normed)
    self._record(f'{pfx}.attn_k.weight', normed)
    self._record(f'{pfx}.attn_v.weight', normed)

    q = normed @ q_w.T  # [seq, q_w.shape[0]]
    k = normed @ k_w.T  # [seq, k_w.shape[0]]
    v = normed @ v_w.T

    # Head sizes come from the tensor shapes, not from self.head_dim.
    head_dim_q = q_w.shape[0] // n_head
    head_dim_kv = k_w.shape[0] // n_head_kv

    q = q.reshape(seq_len, n_head, head_dim_q)
    k = k.reshape(seq_len, n_head_kv, head_dim_kv)
    v = v.reshape(seq_len, n_head_kv, head_dim_kv)

    # RoPE: reuse the caller's tables when the head size matches them,
    # otherwise build tables for the actual head size.
    if head_dim_q != head_dim:
        cos_q, sin_q = rope_freqs(head_dim_q, seq_len, cfg['rope_base'])
        q = apply_rope(q, cos_q, sin_q)
    else:
        q = apply_rope(q, cos_f, sin_f)

    if head_dim_kv != head_dim:
        cos_k, sin_k = rope_freqs(head_dim_kv, seq_len, cfg['rope_base'])
        k = apply_rope(k, cos_k, sin_k)
    else:
        k = apply_rope(k, cos_f, sin_f)

    # GQA: repeat KV heads
    if n_head_kv < n_head:
        rep = n_head // n_head_kv
        k = np.repeat(k, rep, axis=1)
        v = np.repeat(v, rep, axis=1)

    q_t = q.transpose(1, 0, 2)  # [n_head, seq, head_dim_q]
    k_t = k.transpose(1, 0, 2)  # [n_head, seq, head_dim_kv]
    v_t = v.transpose(1, 0, 2)  # [n_head, seq, head_dim_kv]

    scale = 1.0 / np.sqrt(head_dim_q)

    # When Q and K head sizes differ, zero-pad the narrower one so the dot
    # product is defined. This is an approximation.
    if head_dim_q != head_dim_kv:
        if head_dim_q > head_dim_kv:
            k_t_padded = np.zeros_like(q_t)
            k_t_padded[..., :head_dim_kv] = k_t
            k_t = k_t_padded
        else:
            q_t_padded = np.zeros_like(k_t)
            q_t_padded[..., :head_dim_q] = q_t
            q_t = q_t_padded

    attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale  # [n_head, seq, seq]

    # Causal mask (with optional sliding window)
    mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
    swa = cfg.get('swa_window', 0)
    if swa and swa > 0:
        # Hide keys more than `swa` positions behind the query, in one
        # vectorized assignment instead of a nested Python loop.
        # NOTE(review): keys exactly `swa` positions back stay visible;
        # confirm this matches the intended window definition.
        pos = np.arange(seq_len)
        mask[(pos[:, None] - pos[None, :]) > swa] = -1e9
    attn = attn + mask[np.newaxis, :, :]
    attn = softmax(attn, axis=-1)

    out = np.matmul(attn, v_t)  # [n_head, seq, head_dim_kv]
    out = out.transpose(1, 0, 2).reshape(seq_len, -1)  # [seq, n_head * head_dim_kv]

    # Zero-pad or truncate to the input width of o_w (o_w.shape[1]).
    if out.shape[-1] != o_w.shape[1]:
        if out.shape[-1] < o_w.shape[1]:
            out_padded = np.zeros((seq_len, o_w.shape[1]), dtype=out.dtype)
            out_padded[:, :out.shape[-1]] = out
            out = out_padded
        else:
            out = out[:, :o_w.shape[1]]

    self._record(f'{pfx}.attn_output.weight', out)
    attn_out = out @ o_w.T

    hidden = hidden + attn_out

    # ── FFN ──
    ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
    if ffn_norm_w is None:
        return hidden

    normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])

    gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
    up_w = self._get_weight(f'{pfx}.ffn_up.weight')
    down_w = self._get_weight(f'{pfx}.ffn_down.weight')

    if gate_w is not None and up_w is not None and down_w is not None:
        self._record(f'{pfx}.ffn_gate.weight', normed_ff)
        self._record(f'{pfx}.ffn_up.weight', normed_ff)

        gate_out = self.act_fn(normed_ff @ gate_w.T)
        up_out = normed_ff @ up_w.T
        ff_mid = gate_out * up_out

        self._record(f'{pfx}.ffn_down.weight', ff_mid)
        ff_out = ff_mid @ down_w.T
        hidden = hidden + ff_out
    else:
        # MoE path
        hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)

    return hidden
|
| 1165 |
-
|
| 1166 |
-
def forward_linear_attn_layer(self, hidden, layer_idx):
    """Forward one layer that uses ``ssm_in_*`` / ``ssm_out`` projections.

    The recurrence itself is not evaluated. The layer is approximated by
    projecting through ``ssm_in_qkv`` and feeding the leading columns of
    that result to ``ssm_out``, which is enough to record input activation
    statistics for ``ssm_in_qkv/z/a/b`` and ``ssm_out``. The FFN then runs
    as a dense or mixture-of-experts block.

    Args:
        hidden: Residual stream, ``[seq_len, n_embd]``.
        layer_idx: Index of the layer to run.

    Returns:
        The updated residual stream. ``hidden`` is returned unchanged when
        ``attn_norm``, ``ssm_in_qkv`` or ``ssm_out`` is missing.
    """
    pfx = self._layer_prefix(layer_idx)
    cfg = self.cfg
    seq_len = hidden.shape[0]

    # ── Attention norm ──
    attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
    if attn_norm_w is None:
        return hidden
    normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])

    # ── Input and output projections ──
    qkv_w = self._get_weight(f'{pfx}.ssm_in_qkv.weight')
    z_w = self._get_weight(f'{pfx}.ssm_in_z.weight')
    a_w = self._get_weight(f'{pfx}.ssm_in_a.weight')
    b_w = self._get_weight(f'{pfx}.ssm_in_b.weight')
    out_w = self._get_weight(f'{pfx}.ssm_out.weight')

    if qkv_w is None or out_w is None:
        return hidden

    # Every input projection consumes the same normalised activations.
    self._record(f'{pfx}.ssm_in_qkv.weight', normed)
    if z_w is not None:
        self._record(f'{pfx}.ssm_in_z.weight', normed)
    if a_w is not None:
        self._record(f'{pfx}.ssm_in_a.weight', normed)
    if b_w is not None:
        self._record(f'{pfx}.ssm_in_b.weight', normed)

    # Approximate forward: the QKV projection stands in for the output of
    # the recurrent state.
    qkv = normed @ qkv_w.T

    n_out = out_w.shape[1] if out_w.ndim >= 2 else hidden.shape[-1]
    if qkv.shape[-1] >= n_out:
        out_input = qkv[:, :n_out]
    else:
        # Zero-pad to the width ssm_out expects; passing the narrower
        # array through made the matmul below fail on shape mismatch.
        out_input = np.zeros((seq_len, n_out), dtype=qkv.dtype)
        out_input[:, :qkv.shape[-1]] = qkv
    self._record(f'{pfx}.ssm_out.weight', out_input)

    attn_out = out_input @ out_w.T
    hidden = hidden + attn_out

    # ── FFN (same as standard transformer) ──
    ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
    if ffn_norm_w is None:
        return hidden

    normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])

    gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
    up_w = self._get_weight(f'{pfx}.ffn_up.weight')
    down_w = self._get_weight(f'{pfx}.ffn_down.weight')

    if gate_w is not None and up_w is not None and down_w is not None:
        self._record(f'{pfx}.ffn_gate.weight', normed_ff)
        self._record(f'{pfx}.ffn_up.weight', normed_ff)
        gate_out = self.act_fn(normed_ff @ gate_w.T)
        up_out = normed_ff @ up_w.T
        ff_mid = gate_out * up_out
        self._record(f'{pfx}.ffn_down.weight', ff_mid)
        ff_out = ff_mid @ down_w.T
        hidden = hidden + ff_out
    else:
        hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)

    return hidden
|
| 1242 |
-
|
| 1243 |
-
def forward_qwen35_layer(self, hidden, layer_idx, cos_f, sin_f):
    """Forward one hybrid layer with fused QKV attention plus SSM tensors.

    Tensors handled by this path:
      - fused ``attn_qkv.weight`` (Q, K and V in one tensor)
      - ``attn_gate.weight`` as the attention output projection
      - ``ssm_alpha``, ``ssm_beta``, ``ssm_conv1d``, ``ssm_out``
      - ``post_attention_norm.weight``, falling back to ``ffn_norm.weight``

    The SSM branch is not evaluated; the fused QKV projection is used as a
    stand-in for its intermediate activations so that input statistics can
    be recorded.

    Args:
        hidden: Residual stream, ``[seq_len, n_embd]``.
        layer_idx: Index of the layer to run.
        cos_f: RoPE cosine table built for ``self.head_dim``.
        sin_f: RoPE sine table built for ``self.head_dim``.

    Returns:
        The updated residual stream. ``hidden`` is returned unchanged when
        ``attn_norm`` is missing.
    """
    pfx = self._layer_prefix(layer_idx)
    cfg = self.cfg
    n_head = cfg['n_head']
    n_head_kv = cfg['n_head_kv']
    if isinstance(n_head_kv, list):
        n_head_kv = n_head_kv[layer_idx]
    head_dim = self.head_dim
    seq_len = hidden.shape[0]

    # ── Attention norm ──
    attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
    if attn_norm_w is None:
        return hidden
    normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])

    # ── Fused QKV projection ──
    qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
    gate_w = self._get_weight(f'{pfx}.attn_gate.weight')

    attn_out_vec = np.zeros_like(hidden)
    qkv = None  # set when the fused projection exists; reused by the SSM path
    if qkv_w is not None:
        self._record(f'{pfx}.attn_qkv.weight', normed)

        qkv = normed @ qkv_w.T  # [seq, (n_head + 2*n_head_kv) * head_dim]

        # Split into Q, K, V
        q_dim = n_head * head_dim
        kv_dim = n_head_kv * head_dim
        q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
        k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
        v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)

        # RoPE
        q = apply_rope(q, cos_f, sin_f)
        k = apply_rope(k, cos_f, sin_f)

        # GQA: repeat KV heads
        if n_head_kv < n_head:
            rep = n_head // n_head_kv
            k = np.repeat(k, rep, axis=1)
            v = np.repeat(v, rep, axis=1)

        # Scaled dot-product attention
        q_t = q.transpose(1, 0, 2)  # [n_head, seq, head_dim]
        k_t = k.transpose(1, 0, 2)
        v_t = v.transpose(1, 0, 2)

        scale = 1.0 / np.sqrt(head_dim)
        attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale

        # Causal mask (with optional sliding window)
        mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
        swa = cfg.get('swa_window', 0)
        if swa and swa > 0:
            # Hide keys more than `swa` positions behind the query, in one
            # vectorized assignment instead of a nested Python loop.
            # NOTE(review): keys exactly `swa` positions back stay visible;
            # confirm this matches the intended window definition.
            pos = np.arange(seq_len)
            mask[(pos[:, None] - pos[None, :]) > swa] = -1e9
        attn = attn + mask[np.newaxis, :, :]
        attn = softmax(attn, axis=-1)

        out = np.matmul(attn, v_t)
        attn_result = out.transpose(1, 0, 2).reshape(seq_len, -1)  # [seq, n_head*head_dim]

        # Gated attention output
        if gate_w is not None:
            self._record(f'{pfx}.attn_gate.weight', attn_result)
            # Use whichever orientation of gate_w yields the hidden width.
            if gate_w.shape[1] == hidden.shape[-1]:
                attn_out_vec = attn_result @ gate_w
            else:
                attn_out_vec = attn_result @ gate_w.T

    # ── SSM path ──
    ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
    ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
    ssm_conv_w = self._get_weight(f'{pfx}.ssm_conv1d.weight')
    ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')

    ssm_out_vec = np.zeros_like(hidden)
    if ssm_alpha_w is not None:
        self._record(f'{pfx}.ssm_alpha.weight', normed)
    if ssm_beta_w is not None:
        self._record(f'{pfx}.ssm_beta.weight', normed)
    if ssm_conv_w is not None and qkv is not None:
        # The conv1d input is taken to be the fused QKV projection; the
        # value computed for attention is reused instead of recomputed.
        self._record(f'{pfx}.ssm_conv1d.weight', qkv)
    if ssm_out_w is not None:
        # Proxy for the SSM output-projection input: leading columns of the
        # QKV projection when available and wide enough, else zeros.
        if qkv is not None and qkv.shape[-1] >= ssm_out_w.shape[1]:
            ssm_proxy = qkv[:, :ssm_out_w.shape[1]]
        else:
            ssm_proxy = np.zeros((seq_len, ssm_out_w.shape[1]), dtype=np.float32)

        self._record(f'{pfx}.ssm_out.weight', ssm_proxy)

        # The projected value is only added to the residual stream when
        # its width matches the hidden width.
        if ssm_out_w.shape[0] == hidden.shape[-1]:
            ssm_out_vec = ssm_proxy @ ssm_out_w.T

    # Combine attention + SSM
    hidden = hidden + attn_out_vec + ssm_out_vec

    # ── FFN (post_attention_norm preferred over ffn_norm) ──
    ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
    if ffn_norm_w is None:
        ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
    if ffn_norm_w is None:
        return hidden

    normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])

    gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
    up_w = self._get_weight(f'{pfx}.ffn_up.weight')
    down_w = self._get_weight(f'{pfx}.ffn_down.weight')

    if gate_fw is not None and up_w is not None and down_w is not None:
        self._record(f'{pfx}.ffn_gate.weight', normed_ff)
        self._record(f'{pfx}.ffn_up.weight', normed_ff)
        gate_out = self.act_fn(normed_ff @ gate_fw.T)
        up_out = normed_ff @ up_w.T
        ff_mid = gate_out * up_out
        self._record(f'{pfx}.ffn_down.weight', ff_mid)
        ff_out = ff_mid @ down_w.T
        hidden = hidden + ff_out
    else:
        hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)

    return hidden
|
| 1387 |
-
|
| 1388 |
-
def forward(self, token_ids):
    """Push one chunk of token ids through every block, recording inputs.

    The embedding rows for ``token_ids`` are gathered, each ``blk.N`` layer
    is applied in order through whichever per-layer routine matches the
    runner's mode and the tensors present in the model, and the final
    hidden state is recorded as the input of ``output.weight`` when that
    tensor exists.

    Returns the hidden state after the last layer.

    Raises:
        RuntimeError: if ``token_embd.weight`` cannot be loaded.
    """
    cfg = self.cfg
    seq_len = len(token_ids)

    # Gather the embedding rows, then drop the table reference right away
    # so the (large) embedding matrix is not kept alive during the layers.
    table = self._get_weight('token_embd.weight')
    if table is None:
        raise RuntimeError("Missing token_embd.weight")
    hidden = table[token_ids].copy()  # [seq_len, n_embd]
    del table

    # Rotary tables are only consumed by the softmax-attention paths.
    cos_f, sin_f = rope_freqs(self.head_dim, seq_len, cfg['rope_base'])

    for layer_idx in range(cfg['n_layers']):
        pfx = f"blk.{layer_idx}"

        if self._hpc_lib and self.linear_attn:
            # Whole layer handled by the native HPC library.
            hidden = self._hpc_forward_layer(hidden, layer_idx)
        elif self.linear_attn:
            # Pure-Python linearized attention.
            hidden = self.forward_layer_linear(hidden, layer_idx)
        else:
            # Choose the layer routine from the tensors this block has.
            fused_qkv = f'{pfx}.attn_qkv.weight' in self.model.tensor_infos
            separate_q = f'{pfx}.attn_q.weight' in self.model.tensor_infos
            ssm_qkv = f'{pfx}.ssm_in_qkv.weight' in self.model.tensor_infos

            if fused_qkv:
                hidden = self.forward_qwen35_layer(hidden, layer_idx, cos_f, sin_f)
            elif ssm_qkv and not separate_q:
                hidden = self.forward_linear_attn_layer(hidden, layer_idx)
            else:
                hidden = self.forward_layer(hidden, layer_idx, cos_f, sin_f)

        # NOTE(review): progress print assumed to sit at loop level (applies
        # to every mode) — confirm against the original indentation.
        layers_done = layer_idx + 1
        if self.verbose and layers_done % 4 == 0:
            print(f" Layer {layer_idx + 1}/{cfg['n_layers']}", end='\r')

    # Only the tensor's presence matters here; the weight is never loaded.
    if 'output.weight' in self.model.tensor_infos:
        self._record('output.weight', hidden)

    return hidden
|
| 1433 |
-
|
| 1434 |
-
|
| 1435 |
-
# ─── HPC Cross-Layer Importance Propagation ─────────────────────────────────
|
| 1436 |
-
|
| 1437 |
-
def _block_layer_index(name, n_layers):
    """Return the layer index of a ``blk.<i>.…`` tensor name, else None."""
    parts = name.split('.')
    if len(parts) < 2 or parts[0] != 'blk':
        return None
    try:
        layer_idx = int(parts[1])
    except ValueError:
        return None
    return layer_idx if 0 <= layer_idx < n_layers else None


def hpc_propagate_importance(importance_dict, n_layers, verbose=False):
    """Rescale per-layer importance with neighbor-coupled smoothing.

    Each ``blk.<i>`` layer gets one scalar energy: the average over its
    tensors of ``mean(sum_x2 / count)``, normalized so the largest layer is
    1. Fifty damped iterations then compute a multiplier per layer from its
    own energy plus 0.3x the mean multiplier-weighted energy of its
    adjacent layers; multipliers are renormalized to mean 1 each round.
    Finally every ``blk.<i>`` tensor's ``sum_x2`` is scaled by its layer's
    multiplier.

    Args:
        importance_dict: maps tensor name -> ``(sum_x2, count)``.
        n_layers: number of transformer blocks; names whose index falls
            outside ``[0, n_layers)`` are passed through untouched.
        verbose: print the first multipliers and their range.

    Returns:
        A new dict with rescaled ``blk.*`` entries (other entries are
        carried over as-is). The input dict itself is returned unchanged
        when there is nothing to propagate: ``n_layers <= 0``, all layer
        energies are ~0, or the energies are not finite.
    """
    # np.max() on an empty array raises; no layers means nothing to do.
    if n_layers <= 0:
        return importance_dict

    # Collapse every tensor of a layer into one mean energy per layer.
    layer_energies = np.zeros(n_layers, dtype=np.float64)
    layer_tensor_count = np.zeros(n_layers, dtype=np.int32)

    for name, (sum_x2, count) in importance_dict.items():
        layer_idx = _block_layer_index(name, n_layers)
        if layer_idx is None:
            continue
        layer_energies[layer_idx] += np.mean(sum_x2 / max(count, 1))
        layer_tensor_count[layer_idx] += 1

    for i in range(n_layers):
        if layer_tensor_count[i] > 0:
            layer_energies[i] /= layer_tensor_count[i]

    peak = np.max(layer_energies)
    # A NaN/inf peak would turn every multiplier into NaN and poison all
    # block tensors, so leave the statistics alone in that case too.
    if not np.isfinite(peak) or peak < 1e-30:
        return importance_dict

    layer_energies /= peak

    # BP-inspired iterative smoothing with residual stream coupling
    multipliers = np.ones(n_layers, dtype=np.float64)
    temperature = 0.5

    for _ in range(50):
        new_mult = np.ones(n_layers, dtype=np.float64)
        for i in range(n_layers):
            e_self = layer_energies[i]
            e_nbr = 0.0
            n_nbr = 0
            if i > 0:
                e_nbr += layer_energies[i - 1] * multipliers[i - 1]
                n_nbr += 1
            if i < n_layers - 1:
                e_nbr += layer_energies[i + 1] * multipliers[i + 1]
                n_nbr += 1
            if n_nbr > 0:
                e_nbr /= n_nbr
            # Clamp energy to prevent exponential explosion (max exp(5) ~ 148)
            energy = np.clip((e_self + 0.3 * e_nbr) / temperature, -10, 5)
            new_mult[i] = np.exp(energy)

        mean_m = np.mean(new_mult)
        if mean_m > 1e-30:
            new_mult /= mean_m
        # Damped update: keep 70% of the previous estimate.
        multipliers = 0.7 * multipliers + 0.3 * new_mult

    if verbose:
        print(f"\n HPC layer multipliers (first 8): "
              f"{' '.join(f'{m:.3f}' for m in multipliers[:8])}...")
        print(f" Range: [{np.min(multipliers):.3f}, {np.max(multipliers):.3f}]")

    adjusted = {}
    for name, (sum_x2, count) in importance_dict.items():
        layer_idx = _block_layer_index(name, n_layers)
        if layer_idx is None:
            adjusted[name] = (sum_x2, count)
        else:
            adjusted[name] = (sum_x2 * multipliers[layer_idx], count)

    return adjusted
|
| 1515 |
-
|
| 1516 |
-
|
| 1517 |
-
# ─── iMatrix Output Writer ──────────────────────────────────────────────────
|
| 1518 |
-
|
| 1519 |
-
def write_imatrix(path, importance_dict):
    """Write a llama.cpp-compatible legacy binary imatrix file.

    Layout (all integers are little-endian int32, values are float32)::

        n_entries
        repeat n_entries times, in ascending name order:
            name_len, name (UTF-8 bytes, no terminator)
            ncall    -- the entry's sample count
            nval     -- number of float values that follow
            values[nval]

    ``ncall`` is written before ``nval`` because that is the order in which
    llama.cpp's legacy loader reads the two fields.

    Args:
        path: destination file path (overwritten).
        importance_dict: maps tensor name -> ``(sum_x2, count)``.

    Returns:
        The number of entries written.
    """
    entries = []
    for name in sorted(importance_dict):
        sum_x2, count = importance_dict[name]
        # Flatten and pin the byte order so nval always equals the number
        # of floats actually written, regardless of array rank or host.
        values = np.ascontiguousarray(sum_x2, dtype='<f4').ravel()
        entries.append((name.encode('utf-8'), values, int(count)))

    with open(path, 'wb') as f:
        f.write(struct.pack('<i', len(entries)))
        for name_bytes, values, n_samples in entries:
            f.write(struct.pack('<i', len(name_bytes)))
            f.write(name_bytes)
            f.write(struct.pack('<i', n_samples))    # ncall
            f.write(struct.pack('<i', values.size))  # nval
            f.write(values.tobytes())

    return len(entries)
|
| 1537 |
-
|
| 1538 |
-
|
| 1539 |
-
def load_hf_config(config_path):
    """Load a HuggingFace config.json and extract architecture info.

    When the file has a ``text_config`` section and no top-level
    ``hidden_size``, the fields are read from ``text_config`` instead.

    Maps HF keys to internal generate_imatrix.py keys:
        hidden_size          -> n_embd
        num_hidden_layers    -> n_layers
        num_attention_heads  -> n_head
        num_key_value_heads  -> n_head_kv
        intermediate_size    -> n_ff
        vocab_size           -> vocab_size
        rms_norm_eps         -> rms_eps      (default 1e-6)
        rope_theta           -> rope_base    (default 10000.0; a value in
                                              ``rope_parameters`` wins)
        model_type           -> arch         (default 'unknown')
        num_local_experts / num_experts -> expert_count
        num_experts_per_tok  -> expert_used_count

    Absent integer fields are reported as 0. ``head_dim`` is taken from the
    config when set, otherwise derived as ``n_embd // n_head`` when the head
    count is positive; it is omitted if neither is possible.

    Args:
        config_path: path to the JSON file.

    Returns:
        dict with the internal keys listed above.
    """
    import json

    # JSON interchange is UTF-8; do not depend on the locale's encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    src = raw
    if 'text_config' in raw and 'hidden_size' not in raw:
        src = raw['text_config']

    cfg = {}
    cfg['arch'] = src.get('model_type', raw.get('model_type', 'unknown'))
    cfg['n_embd'] = src.get('hidden_size', 0)
    cfg['n_layers'] = src.get('num_hidden_layers', 0)
    cfg['n_head'] = src.get('num_attention_heads', 0)
    cfg['n_head_kv'] = src.get('num_key_value_heads', 0)
    cfg['n_ff'] = src.get('intermediate_size', 0)
    cfg['vocab_size'] = src.get('vocab_size', 0)
    cfg['rms_eps'] = src.get('rms_norm_eps', 1e-6)

    # "rope_parameters": null (or any non-object) must not crash the lookup.
    rope_params = src.get('rope_parameters')
    if not isinstance(rope_params, dict):
        rope_params = {}
    cfg['rope_base'] = rope_params.get('rope_theta',
                                       src.get('rope_theta', 10000.0))

    cfg['expert_count'] = src.get('num_local_experts', src.get('num_experts', 0))
    cfg['expert_used_count'] = src.get('num_experts_per_tok', 0)

    # head_dim fallback; null head/embedding sizes simply skip the derivation.
    if src.get('head_dim'):
        cfg['head_dim'] = src['head_dim']
    elif (cfg['n_head'] or 0) > 0 and cfg['n_embd'] is not None:
        cfg['head_dim'] = cfg['n_embd'] // cfg['n_head']

    return cfg
|
| 1585 |
-
|
| 1586 |
-
|
| 1587 |
-
# ─── Main ───────────────────────────────────────────────────────────────────
|
| 1588 |
-
|
| 1589 |
-
def main():
    """Command-line entry point: GGUF model + calibration text -> imatrix.

    Steps: parse arguments, open the GGUF model, optionally merge a
    HuggingFace ``config.json`` (explicit ``--config`` or one found next to
    the model), tokenize the calibration text into chunks, run a forward
    pass per chunk to collect statistics, optionally apply cross-layer
    propagation, and write the result to ``--output``.

    A chunk whose forward pass raises is reported and skipped so one bad
    chunk does not discard the statistics already collected. The model is
    closed on every exit path.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='HExState iMatrix Generator — HPC-enhanced importance matrix from GGUF')
    parser.add_argument('model', help='Input GGUF model file')
    parser.add_argument('calibration', help='Calibration text file')
    parser.add_argument('-o', '--output', default='imatrix.dat',
                        help='Output imatrix file (default: imatrix.dat)')
    parser.add_argument('--config', help='Optional HuggingFace config.json')
    parser.add_argument('--chunks', type=int, default=10,
                        help='Number of token chunks to process (default: 10)')
    parser.add_argument('--chunk-size', type=int, default=4096,
                        help='Tokens per chunk (default: 4096)')
    parser.add_argument('--no-hpc', action='store_true',
                        help='Disable HPC cross-layer propagation')
    parser.add_argument('--quadratic-attn', action='store_true',
                        help='Use full O(seq²) attention instead of HPC-linearized O(seq)')
    parser.add_argument('--verbose', action='store_true',
                        help='Per-layer statistics')
    args = parser.parse_args()

    print()
    print(" ╔════════════════════════════════════════════════════════════════╗")
    print(" ║ HExState Importance Matrix Generator ║")
    print(" ║ HPC-Enhanced E[x²] Collection from GGUF ║")
    print(" ╚════════════════════════════════════════════════════════════════╝")
    print()

    start_time = time.time()

    # ── Load model ──
    print(f" Loading model: {args.model}")
    model = GGUFModel(args.model)
    try:
        config = model.get_config()

        # ── Load/Merge config.json ──
        cfg_path = args.config
        if not cfg_path:
            # Auto-lookup in model directory
            model_dir = os.path.dirname(os.path.abspath(args.model))
            potential_cfg = os.path.join(model_dir, 'config.json')
            if os.path.exists(potential_cfg):
                cfg_path = potential_cfg

        if cfg_path:
            print(f" Merging config from: {cfg_path}")
            hf_cfg = load_hf_config(cfg_path)
            # Override GGUF values with HF config values where they exist
            # and are non-zero. load_hf_config reports absent fields as 0
            # (or 'unknown' for the architecture); letting those through
            # would clobber valid GGUF metadata.
            for k, v in hf_cfg.items():
                if v is None or v == 0 or v == 'unknown':
                    continue
                config[k] = v

        print(f" Architecture: {config['arch']}")
        print(f" Layers: {config['n_layers']}")
        print(f" Hidden: {config['n_embd']}")
        print(f" Heads: {config['n_head']} (KV: {config['n_head_kv']})")
        print(f" FFN: {config['n_ff']}")
        print(f" Vocab: {config['vocab_size']}")
        print(f" Tensors: {len(model.tensor_infos)}")
        print()

        # ── Load tokenizer ──
        print(" Loading tokenizer from GGUF metadata...")
        tokenizer = SimpleTokenizer(model)
        print(f" Vocab size: {tokenizer.vocab_size}")
        print()

        # ── Load calibration text ──
        print(f" Loading calibration data: {args.calibration}")
        with open(args.calibration, 'r', encoding='utf-8', errors='replace') as f:
            cal_text = f.read()
        print(f" Text length: {len(cal_text):,} chars")

        # ── Tokenize and chunk ──
        print(f" Tokenizing ({args.chunk_size} tokens/chunk, {args.chunks} chunks max)...")
        chunks = tokenizer.chunk_text(cal_text, args.chunk_size)
        if len(chunks) > args.chunks:
            chunks = chunks[:args.chunks]
        print(f" Prepared {len(chunks)} chunks")
        print()

        # ── Forward pass ──
        print(" Running forward passes...")
        use_linear = not args.quadratic_attn
        runner = TransformerRunner(model, config, verbose=args.verbose, linear_attn=use_linear)
        if use_linear:
            print(f" Attention mode: HPC-linearized O(seq) — chunk_size={args.chunk_size}")
        else:
            print(f" Attention mode: full O(seq²) softmax — chunk_size={args.chunk_size}")

        # ETA is extrapolated from forward-pass time only; model loading
        # and tokenization would otherwise inflate the per-chunk estimate.
        forward_start = time.time()
        processed = 0
        for i, chunk in enumerate(chunks):
            now = time.time()
            elapsed = now - start_time
            eta = (now - forward_start) / i * (len(chunks) - i) if i > 0 else 0
            pct = (i + 1) / len(chunks) * 100
            bw = 40
            filled = int(bw * (i + 1) / len(chunks))
            bar = '█' * filled + '░' * (bw - filled)
            sys.stdout.write(
                f"\r [{bar}] {pct:5.1f}% ({i+1}/{len(chunks)}) "
                f"{elapsed:.0f}s ETA:{eta:.0f}s")
            sys.stdout.flush()

            try:
                runner.forward(chunk)
            except Exception as e:
                # Deliberate best-effort: report the chunk and keep going.
                print(f"\n WARNING: Chunk {i} failed: {e}")
                continue
            processed += 1

        print(f"\n Collected importance for {len(runner.importance)} tensors")
        print()

        # ── HPC propagation ──
        if not args.no_hpc:
            print(" Running HPC cross-layer importance propagation...")
            importance = hpc_propagate_importance(
                runner.importance, config['n_layers'], verbose=args.verbose)
        else:
            importance = runner.importance

        # ── Write output ──
        print(f"\n Writing imatrix: {args.output}")
        n_entries = write_imatrix(args.output, importance)

        elapsed = time.time() - start_time
        out_size = os.path.getsize(args.output)

        print()
        print(" ╔════════════════════════════════════════════════════════════════╗")
        print(" ║ IMATRIX GENERATION COMPLETE ║")
        print(" ╠════════════════════════════════════════════════════════════════╣")
        print(f" ║ Tensor entries: {n_entries:<42d} ║")
        # Only chunks whose forward pass completed are counted.
        print(f" ║ Chunks processed: {processed:<42d} ║")
        print(f" ║ Output size: {out_size:>11,} bytes ({out_size/1024:.1f} KB)"
              f"{' '*(25-len(f'{out_size/1024:.1f}'))}║")
        print(f" ║ Total time: {elapsed:>38.1f} sec ║")
        print(" ╚════════════════════════════════════════════════════════════════╝")
        print()
        print(f" Output: {args.output}")
        print()
    finally:
        # Close the model even when a step above raised.
        model.close()
|
| 1730 |
-
|
| 1731 |
-
|
| 1732 |
-
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|