CompressedGemma committed on
Commit
00ba2db
·
verified ·
1 Parent(s): 7803d72

Delete generate_imatrix.py

Browse files
Files changed (1) hide show
  1. generate_imatrix.py +0 -1733
generate_imatrix.py DELETED
@@ -1,1733 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- HExState Importance Matrix Generator — HPC-Enhanced iMatrix from GGUF
4
-
5
- Runs transformer forward passes over calibration text to collect per-channel
6
- E[x²] activation statistics, then uses HPC triality BP to propagate importance
7
- across layers. Outputs llama.cpp-compatible .dat imatrix files.
8
-
9
- Usage:
10
- python3 generate_imatrix.py model.gguf calibration.txt -o imatrix.dat
11
- """
12
-
13
- import struct
14
- import sys
15
- import os
16
- import time
17
- import mmap
18
- import ctypes
19
- import numpy as np
20
- from collections import OrderedDict
21
-
22
- # ─── Constants ──────────────────────────────────────────────────────────────
23
# b"GGUF" read as a little-endian uint32.
GGUF_MAGIC = 0x46554747
# Byte alignment applied to the start of the tensor-data section.
ALIGNMENT = 32
# Elements per quantization block for the K-quants and the legacy 4/8-bit types.
QK_K = 256
QK4_0 = 32
QK8_0 = 32

# ggml tensor type ids that this script can dequantize.
GGML_TYPE_F32 = 0
GGML_TYPE_F16 = 1
GGML_TYPE_Q4_0 = 2
GGML_TYPE_Q8_0 = 8
GGML_TYPE_Q2_K = 10
GGML_TYPE_BF16 = 30

# ggml type id -> number of elements stored in one block.
TYPE_BLOCK_SIZE = {
    0: 1, 1: 1, 2: 32, 3: 32, 6: 32, 7: 32,
    8: 32, 9: 32, 10: 256, 11: 256, 12: 256,
    13: 256, 14: 256, 15: 256, 30: 1,
}
# ggml type id -> bytes occupied by one block.
# Q5_0 (id 6) is fp16 scale + 4 high-bit bytes + 16 nibble bytes = 22, and
# Q5_1 (id 7) adds an fp16 minimum on top of that = 24.
TYPE_BLOCK_BYTES = {
    0: 4, 1: 2, 2: 18, 3: 20, 6: 22, 7: 24,
    8: 34, 9: 36, 10: 84, 11: 110, 12: 144,
    13: 176, 14: 210, 15: 292, 30: 2,
}
# Display names for the types handled by dequantize().
TYPE_NAME = {
    0: "F32", 1: "F16", 2: "Q4_0", 8: "Q8_0", 10: "Q2_K", 30: "BF16",
}
49
-
50
-
51
- # ─── GGUF Reader ────────────────────────────────────────────────────────────
52
-
53
def align_offset(offset):
    """Round *offset* up to the nearest multiple of ALIGNMENT."""
    overshoot = offset % ALIGNMENT
    if overshoot == 0:
        return offset
    return offset + (ALIGNMENT - overshoot)
55
-
56
def read_string(f):
    """Read a GGUF string: a little-endian uint64 byte length, then UTF-8 bytes.

    Undecodable bytes are replaced rather than raising.
    """
    (byte_len,) = struct.unpack('<Q', f.read(8))
    payload = f.read(byte_len)
    return payload.decode('utf-8', errors='replace')
59
-
60
def read_kv_value(f, vtype):
    """Read one GGUF metadata value of type id *vtype* from file object *f*.

    Scalars are returned as Python numbers or bools, type 8 as a string, and
    type 9 (array) as a list whose elements are read recursively.

    Raises:
        ValueError: if *vtype* is not a known GGUF value type.
    """
    # Fixed-width little-endian scalars, keyed by GGUF value-type id.
    scalar_formats = {
        0: '<B', 1: '<b', 2: '<H', 3: '<h', 4: '<I', 5: '<i', 6: '<f',
        10: '<Q', 11: '<q', 12: '<d',
    }
    fmt = scalar_formats.get(vtype)
    if fmt is not None:
        return struct.unpack(fmt, f.read(struct.calcsize(fmt)))[0]
    if vtype == 7:
        return bool(struct.unpack('<B', f.read(1))[0])
    if vtype == 8:
        return read_string(f)
    if vtype == 9:
        # Array header: element type (uint32) followed by element count (uint64).
        elem_type = struct.unpack('<I', f.read(4))[0]
        elem_count = struct.unpack('<Q', f.read(8))[0]
        return [read_kv_value(f, elem_type) for _ in range(elem_count)]
    raise ValueError(f"Unknown KV type {vtype}")
80
-
81
-
82
class GGUFModel:
    """Read-only accessor for a GGUF model file.

    The header (key/value metadata and the tensor directory) is parsed in the
    constructor; tensor payloads are read on demand through a read-only memory
    map. Instances can be used as context managers so the file and mapping are
    released deterministically.
    """

    def __init__(self, path):
        """Open *path*, map it into memory and parse the GGUF header.

        Raises:
            ValueError: if the file does not start with the GGUF magic.
            OSError: if the file cannot be opened.
        """
        self.path = path
        self.file_size = os.path.getsize(path)
        self.kv = {}
        self.tensor_infos = OrderedDict()
        self.data_offset = 0

        self._mm = None
        self._f = open(path, 'rb')
        try:
            self._mm = mmap.mmap(self._f.fileno(), 0, access=mmap.ACCESS_READ)
            self._parse_header()
        except BaseException:
            # Do not leak the descriptor/mapping when the header is malformed.
            self.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def _parse_header(self):
        """Populate ``kv``, ``tensor_infos`` and ``data_offset`` from the file."""
        f = self._f
        f.seek(0)
        magic = struct.unpack('<I', f.read(4))[0]
        # Explicit raise instead of assert: asserts vanish under ``python -O``.
        if magic != GGUF_MAGIC:
            raise ValueError(f"Bad GGUF magic: 0x{magic:08X}")
        self.version = struct.unpack('<I', f.read(4))[0]
        n_tensors = struct.unpack('<Q', f.read(8))[0]
        n_kv = struct.unpack('<Q', f.read(8))[0]

        # Metadata key/value pairs.
        for _ in range(n_kv):
            key = read_string(f)
            vtype = struct.unpack('<I', f.read(4))[0]
            self.kv[key] = read_kv_value(f, vtype)

        # Tensor directory: name, dims, type id and offset into the data section.
        for _ in range(n_tensors):
            name = read_string(f)
            n_dims = struct.unpack('<I', f.read(4))[0]
            dims = [struct.unpack('<Q', f.read(8))[0] for _ in range(n_dims)]
            ttype = struct.unpack('<I', f.read(4))[0]
            offset = struct.unpack('<Q', f.read(8))[0]
            n_elements = 1
            for d in dims:
                n_elements *= d
            # Unknown type ids fall back to 1 element / 4 bytes per block.
            blk_sz = TYPE_BLOCK_SIZE.get(ttype, 1)
            blk_bytes = TYPE_BLOCK_BYTES.get(ttype, 4)
            n_blocks = (n_elements + blk_sz - 1) // blk_sz
            self.tensor_infos[name] = {
                'dims': dims, 'n_dims': n_dims, 'type': ttype,
                'offset': offset, 'n_elements': n_elements,
                'data_size': n_blocks * blk_bytes,
            }

        # Tensor data begins at the next aligned offset after the directory.
        self.data_offset = align_offset(f.tell())

    def get_arch(self):
        """Return ``general.architecture``, or a guess based on tensor names."""
        arch = self.kv.get('general.architecture')
        if arch:
            return arch
        if any('attn_gate' in n for n in self.tensor_infos):
            return 'gemma2'
        return 'llama'

    def get_config(self):
        """Collect the hyper-parameters the forward pass needs into a dict.

        Missing metadata keys default to 0 (or to 1e-6 / 10000.0 for the RMS
        epsilon and RoPE base).
        """
        arch = self.get_arch()
        n_embd = self.kv.get(f'{arch}.embedding_length', 0)
        n_head = self.kv.get(f'{arch}.attention.head_count', 0)
        n_head_kv = self.kv.get(f'{arch}.attention.head_count_kv', 0)

        # Prefer a head_dim derived from blk.0.attn_gate over n_embd / n_head:
        # some hybrid models have a gate width that differs from n_embd.
        head_dim = 0
        gate_info = self.tensor_infos.get('blk.0.attn_gate.weight')
        if gate_info is not None and n_head > 0 and len(gate_info['dims']) > 1:
            head_dim = gate_info['dims'][1] // n_head
        if head_dim == 0 and n_head > 0:
            head_dim = n_embd // n_head

        return {
            'arch': arch,
            'n_layers': self.kv.get(f'{arch}.block_count', 0),
            'n_embd': n_embd,
            'n_head': n_head,
            'n_head_kv': n_head_kv,
            'n_ff': self.kv.get(f'{arch}.feed_forward_length', 0),
            'vocab_size': self.kv.get(f'{arch}.vocab_size', 0),
            'rms_eps': self.kv.get(f'{arch}.attention.layer_norm_rms_epsilon', 1e-6),
            'rope_base': self.kv.get(f'{arch}.rope.freq_base', 10000.0),
            'swa_window': self.kv.get(f'{arch}.attention.sliding_window', 0),
            'head_dim': head_dim,
            'expert_count': self.kv.get(f'{arch}.expert_count', 0),
            'expert_used_count': self.kv.get(f'{arch}.expert_used_count', 0),
        }

    def get_tensor_f32(self, name):
        """Return tensor *name* as a flat float32 array, dequantizing if needed.

        Returns None when the tensor is absent or its type cannot be
        dequantized (the failure is printed).
        """
        ti = self.tensor_infos.get(name)
        if ti is None:
            return None
        start = self.data_offset + ti['offset']
        raw = bytes(self._mm[start:start + ti['data_size']])
        try:
            return dequantize(raw, ti['type'], ti['n_elements'])
        except ValueError as e:
            print(f" Error dequantizing {name}: {e}")
            return None

    def get_tensor_shape(self, name):
        """Return the tensor's dims reversed (GGUF lists the fastest axis first)."""
        ti = self.tensor_infos.get(name)
        if ti is None:
            return None
        return tuple(reversed(ti['dims']))

    def close(self):
        """Release the memory map and the file handle (safe to call repeatedly)."""
        if self._mm is not None:
            self._mm.close()
        self._f.close()
202
-
203
-
204
- # ─── Dequantization ─────────────────────────────────────────────────────────
205
-
206
def dequantize(raw, ttype, n_elements):
    """Convert the raw bytes of a tensor of ggml type *ttype* to flat float32.

    Raises:
        ValueError: if *ttype* is not one of the supported types.
    """
    # Unquantized storage formats are reinterpreted directly.
    if ttype == GGML_TYPE_F32:
        return np.frombuffer(raw, dtype=np.float32).copy()
    if ttype == GGML_TYPE_F16:
        halves = np.frombuffer(raw, dtype=np.float16)
        return halves.astype(np.float32)
    if ttype == GGML_TYPE_BF16:
        # bfloat16 is the upper 16 bits of an IEEE float32.
        widened = np.frombuffer(raw, dtype=np.uint16).astype(np.uint32) << 16
        return widened.view(np.float32).copy()

    # Block-quantized formats are delegated to their own decoder.
    block_decoders = {
        GGML_TYPE_Q8_0: dequant_q8_0,
        GGML_TYPE_Q4_0: dequant_q4_0,
        GGML_TYPE_Q2_K: dequant_q2k,
    }
    decoder = block_decoders.get(ttype)
    if decoder is None:
        raise ValueError(f"Unsupported quant type {ttype} ({TYPE_NAME.get(ttype, '?')})")
    return decoder(raw, n_elements)
223
-
224
def dequant_q8_0(raw, n_elements):
    """Dequantize Q8_0 data: 34-byte blocks of one fp16 scale plus 32 int8 values."""
    block_count = n_elements // QK8_0
    blocks = np.frombuffer(raw, dtype=np.uint8).reshape(block_count, 34)
    # Bytes 0-1 hold the per-block scale, bytes 2-33 the signed quants.
    scale_bytes = blocks[:, 0:2]
    quant_bytes = blocks[:, 2:34]
    scale = scale_bytes.view(np.float16).astype(np.float32).reshape(block_count, 1)
    quants = quant_bytes.view(np.int8).astype(np.float32)
    values = scale * quants
    return values.reshape(-1)[:n_elements]
230
-
231
def dequant_q4_0(raw, n_elements):
    """Dequantize Q4_0 data to a flat float32 array.

    Each 18-byte block is one fp16 scale followed by 16 bytes packing 32
    4-bit quants. Following ggml's layout, byte ``j`` carries element ``j`` in
    its low nibble and element ``j + 16`` in its high nibble, so the low
    nibbles form the first half of the block and the high nibbles the second
    half (they are NOT interleaved). Values are ``scale * (nibble - 8)``.
    """
    n_blocks = n_elements // QK4_0
    data = np.frombuffer(raw, dtype=np.uint8).reshape(n_blocks, 18)
    # Copy the scale bytes so the fp16 view is over a contiguous buffer.
    d = np.ascontiguousarray(data[:, 0:2]).view(np.float16).astype(np.float32)
    qs = data[:, 2:18]
    lo = (qs & 0x0F).astype(np.float32) - 8.0
    hi = (qs >> 4).astype(np.float32) - 8.0
    # Elements 0..15 come from the low nibbles, 16..31 from the high nibbles.
    x = np.concatenate([lo, hi], axis=1)
    return (d * x).reshape(-1)[:n_elements]
241
-
242
def dequant_q2k(raw, n_elements):
    """Dequantize Q2_K data to a flat float32 array.

    Each 84-byte super-block of 256 values is laid out as 16 scale bytes,
    64 quant bytes, an fp16 ``d`` and an fp16 ``dmin``. Every scale byte
    covers 16 consecutive output values: its low nibble scales ``d`` and its
    high nibble scales ``dmin``. The quant bytes are consumed as two halves of
    32 bytes; within a half, bit-pair ``j`` (shift ``2*j``) of byte ``l``
    yields output ``half*128 + j*32 + l``. Values are
    ``d * scale * q - dmin * min``.
    """
    n_blocks = n_elements // QK_K
    data = np.frombuffer(raw, dtype=np.uint8).reshape(n_blocks, 84)
    scales = data[:, 0:16]
    qs = data[:, 16:80].reshape(n_blocks, 2, 1, 32)
    # Copy the trailing fp16 fields so the views are over contiguous buffers.
    d = np.ascontiguousarray(data[:, 80:82]).view(np.float16).astype(np.float32)
    dmin = np.ascontiguousarray(data[:, 82:84]).view(np.float16).astype(np.float32)

    # One (scale, min) pair per group of 16 values: shape (n_blocks, 16).
    group_scale = d * (scales & 0x0F).astype(np.float32)
    group_min = dmin * (scales >> 4).astype(np.float32)

    # Unpack the four 2-bit planes of each half: (n_blocks, 2, 4, 32), which
    # flattens to the output order half*128 + plane*32 + byte.
    shifts = np.array([0, 2, 4, 6], dtype=np.uint8).reshape(1, 1, 4, 1)
    quants = ((qs >> shifts) & 3).astype(np.float32).reshape(n_blocks, 16, 16)

    result = group_scale[:, :, np.newaxis] * quants - group_min[:, :, np.newaxis]
    return result.reshape(-1)[:n_elements]
267
-
268
-
269
- # ─── Tokenizer ──────────────────────────────────────────────────────────────
270
-
271
class SimpleTokenizer:
    """Minimal BPE tokenizer built from GGUF metadata.

    Merges are applied greedily by rank. When ``libhexstate_q2k.so`` sits next
    to this script and exports ``hexstate_bpe_tokenize``, encoding is delegated
    to it; otherwise a pure-Python merge loop is used.
    """

    def __init__(self, model):
        """Build vocab and merge tables from ``model.kv``.

        If the GGUF carries no merges, a ``tokenizer.json`` next to the model
        file is consulted instead.
        """
        self.model_path = model._f.name
        self.tokens = model.kv.get('tokenizer.ggml.tokens', [])
        self.vocab_size = len(self.tokens)
        merges_raw = model.kv.get('tokenizer.ggml.merges', [])

        # Some GGUF exports omit the merges; fall back to a local tokenizer.json.
        if not merges_raw:
            merges_raw = self._load_sidecar_merges()

        self.bos_id = model.kv.get('tokenizer.ggml.bos_token_id', 2)
        self.eos_id = model.kv.get('tokenizer.ggml.eos_token_id', 1)

        # token string -> id (later duplicates win, as with a plain loop).
        self.token_to_id = {
            t: i for i, t in enumerate(self.tokens) if isinstance(t, str)
        }

        # (left, right) -> rank for the Python path, and
        # (left_id, right_id, merged_id, rank) tuples for the C path.
        self.merges = {}
        self._merge_list = []
        for rank, entry in enumerate(merges_raw):
            pair = self._split_merge(entry)
            if pair is None:
                continue
            left, right = pair
            self.merges[(left, right)] = rank
            a_id = self.token_to_id.get(left, -1)
            b_id = self.token_to_id.get(right, -1)
            merged_id = self.token_to_id.get(left + right, -1)
            if a_id >= 0 and b_id >= 0 and merged_id >= 0:
                self._merge_list.append((a_id, b_id, merged_id, rank))

        self._hpc_lib = self._load_hpc_lib()

    @staticmethod
    def _split_merge(entry):
        """Return ``(left, right)`` for a merge rule, or None if malformed.

        Accepts both the ``"left right"`` string form and a two-item
        list/tuple of strings.
        """
        if isinstance(entry, str):
            parts = entry.split(' ', 1)
        elif isinstance(entry, (list, tuple)):
            parts = list(entry)
        else:
            return None
        if len(parts) != 2 or not all(isinstance(p, str) for p in parts):
            return None
        return parts[0], parts[1]

    def _load_sidecar_merges(self):
        """Read merge rules from ``tokenizer.json`` beside the model, if any."""
        import json
        model_dir = os.path.dirname(os.path.abspath(self.model_path))
        tok_path = os.path.join(model_dir, 'tokenizer.json')
        if not os.path.exists(tok_path):
            return []
        try:
            with open(tok_path, 'r', encoding='utf-8') as f:
                tok_data = json.load(f)
            merges = tok_data.get('model', {}).get('merges', [])
        except (OSError, ValueError, AttributeError) as e:
            # Best effort: report the problem and continue without merges.
            print(f" Could not read merges from {tok_path}: {e}")
            return []
        if merges:
            print(f" Injected {len(merges)} merges from local tokenizer.json!")
        return merges or []

    def _load_hpc_lib(self):
        """Load the optional C library; return it only if it has the BPE entry point."""
        try:
            script_dir = os.path.dirname(os.path.abspath(__file__))
            lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
            if not os.path.exists(lib_path):
                return None
            lib = ctypes.CDLL(lib_path)
        except (OSError, NameError) as e:
            print(f" HPC·BPE not available: {e}")
            return None
        if hasattr(lib, 'hexstate_bpe_tokenize'):
            print(f" HPC·BPE engine loaded ({len(self._merge_list)} merge rules)")
            return lib
        print(" HPC library found but missing hexstate_bpe_tokenize — rebuild needed")
        return None

    def encode(self, text):
        """Encode *text* to a list of token ids, always starting with BOS.

        Characters and merged pieces missing from the vocabulary map to id 0.
        """
        if not text:
            return [self.bos_id]

        # SentencePiece-style whitespace: spaces become '▁', with a leading '▁'.
        text = text.replace(' ', '▁')
        if not text.startswith('▁'):
            text = '▁' + text

        if self._hpc_lib and self._merge_list:
            return self._encode_hpc(text)
        return self._encode_python(text)

    def _encode_hpc(self, text):
        """Tokenize *text* via the C library's ``hexstate_bpe_tokenize``."""
        t0 = time.time()
        print(f" HPC·BPE: tokenizing {len(text):,} chars...")

        # Initial sequence: one id per character.
        char_ids = np.array(
            [self.token_to_id.get(c, 0) for c in text],
            dtype=np.int32)

        # Merge table flattened as consecutive (a, b, merged, rank) int32 records.
        n_merges = len(self._merge_list)
        merge_buf = np.array(self._merge_list, dtype=np.int32).reshape(-1)

        # Merging never lengthens the sequence, so len(char_ids) slots suffice.
        output_ids = np.zeros(len(char_ids), dtype=np.int32)
        n_tokens = ctypes.c_int64(0)

        self._hpc_lib.hexstate_bpe_tokenize(
            char_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.c_int64(len(char_ids)),
            merge_buf.ctypes.data_as(ctypes.c_void_p),
            ctypes.c_int32(n_merges),
            output_ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32)),
            ctypes.byref(n_tokens),
            ctypes.c_int(1),  # verbose
        )

        elapsed = time.time() - t0
        ids = [self.bos_id] + output_ids[:n_tokens.value].tolist()
        print(f" HPC·BPE: {len(text):,} chars → {n_tokens.value:,} tokens [{elapsed:.1f}s]")
        return ids

    def _encode_python(self, text):
        """Pure-Python BPE: repeatedly merge every occurrence of the best-ranked pair."""
        tokens = list(text)
        initial_len = len(tokens)
        pass_num = 0
        t0 = time.time()
        no_rank = float('inf')
        while len(tokens) > 1:
            # Find the adjacent pair with the lowest merge rank.
            best_pair = None
            best_rank = no_rank
            for i in range(len(tokens) - 1):
                pair = (tokens[i], tokens[i + 1])
                rank = self.merges.get(pair, no_rank)
                if rank < best_rank:
                    best_rank = rank
                    best_pair = pair
            if best_pair is None:
                break

            # Merge all non-overlapping occurrences, scanning left to right.
            a, b = best_pair
            prev_len = len(tokens)
            new_tokens = []
            i = 0
            while i < len(tokens):
                if i < len(tokens) - 1 and tokens[i] == a and tokens[i + 1] == b:
                    new_tokens.append(a + b)
                    i += 2
                else:
                    new_tokens.append(tokens[i])
                    i += 1
            tokens = new_tokens
            pass_num += 1
            if pass_num % 10 == 0:
                elapsed = time.time() - t0
                merged = prev_len - len(tokens)
                sys.stdout.write(
                    f"\r BPE pass {pass_num}: {len(tokens):,} tokens "
                    f"(-{merged} merged, {len(tokens)/initial_len*100:.1f}%) "
                    f"[{elapsed:.1f}s] ")
                sys.stdout.flush()
        if pass_num >= 10:
            elapsed = time.time() - t0
            print(f"\r Tokenized: {pass_num} passes, {initial_len:,} chars → "
                  f"{len(tokens):,} tokens [{elapsed:.1f}s]" + " " * 30)

        ids = [self.bos_id]
        ids.extend(self.token_to_id.get(t, 0) for t in tokens)
        return ids

    def chunk_text(self, text, chunk_size=512):
        """Encode *text* and split it into int32 arrays of exactly *chunk_size* ids.

        Consecutive chunks start ``chunk_size * 3 // 4`` tokens apart, i.e. they
        overlap by a quarter of a chunk. A trailing remainder shorter than a
        full chunk is dropped. Text shorter than one chunk is padded with EOS.
        """
        ids = self.encode(text)
        chunks = []
        # Stride of 3/4 chunk => 25% overlap between neighbouring chunks.
        stride = chunk_size * 3 // 4
        for i in range(0, len(ids) - chunk_size + 1, stride):
            chunks.append(np.array(ids[i:i + chunk_size], dtype=np.int32))

        if not chunks and ids:
            # Pad short text up to a single full chunk.
            padded = ids + [self.eos_id] * (chunk_size - len(ids))
            chunks.append(np.array(padded[:chunk_size], dtype=np.int32))
        return chunks
455
-
456
-
457
- # ─── Transformer Forward Pass ───────────────────────────────────────────────
458
-
459
def rms_norm(x, weight, eps=1e-6):
    """Scale *x* by the reciprocal RMS of its last axis, then by *weight*."""
    mean_square = np.mean(np.square(x), axis=-1, keepdims=True)
    normalised = x / np.sqrt(mean_square + eps)
    return normalised * weight
462
-
463
def rope_freqs(dim, seq_len, base=10000.0):
    """Return ``(cos, sin)`` rotary tables of shape ``[seq_len, dim / 2]``."""
    exponents = np.arange(0, dim, 2, dtype=np.float32) / dim
    inv_freq = 1.0 / (base ** exponents)
    positions = np.arange(seq_len, dtype=np.float32)
    # angle[p, i] = position p times inverse frequency i
    angles = np.outer(positions, inv_freq)
    return np.cos(angles), np.sin(angles)
468
-
469
def apply_rope(x, cos_f, sin_f):
    """Rotate the two halves of the last axis of *x* by the given cos/sin tables.

    The tables are cropped to ``x.shape[0]`` rows and half the last-axis
    width; for 3-D input a singleton axis is inserted so they broadcast over
    axis 1.
    """
    half = x.shape[-1] // 2
    n_pos = x.shape[0]
    first, second = x[..., :half], x[..., half:]
    cos_t = cos_f[:n_pos, :half]
    sin_t = sin_f[:n_pos, :half]
    if x.ndim == 3:
        cos_t = cos_t[:, np.newaxis, :]
        sin_t = sin_t[:, np.newaxis, :]
    rotated_first = first * cos_t - second * sin_t
    rotated_second = second * cos_t + first * sin_t
    return np.concatenate([rotated_first, rotated_second], axis=-1)
482
-
483
def softmax(x, axis=-1):
    """Softmax along *axis*, shifted by the maximum before exponentiating."""
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    total = np.sum(weights, axis=axis, keepdims=True)
    return weights / total
487
-
488
def silu(x):
    """SiLU / Swish activation: ``x * sigmoid(x)``, with the exponent clipped to ±88."""
    bounded = np.clip(x, -88, 88)
    sigmoid = 1.0 / (1.0 + np.exp(-bounded))
    return x * sigmoid
491
-
492
def gelu(x):
    """GELU activation using the tanh approximation."""
    inner = np.sqrt(2.0 / np.pi) * (x + 0.044715 * x**3)
    return 0.5 * x * (1.0 + np.tanh(inner))
495
-
496
- # Architecture → activation function mapping
497
# Architecture name -> FFN activation function.
ACTIVATION_MAP = {
    **dict.fromkeys(
        ('llama', 'mistral', 'qwen2', 'qwen2moe',
         'phi3', 'falcon', 'deepseek', 'deepseek2'),
        silu),
    **dict.fromkeys(('gemma', 'gemma2', 'gpt2'), gelu),
}
502
-
503
-
504
- class TransformerRunner:
505
- """Minimal Gemma transformer for importance collection."""
506
-
507
- def __init__(self, model, config, verbose=False, linear_attn=True):
508
- self.model = model
509
- self.cfg = config
510
- self.verbose = verbose
511
- self.head_dim = config.get('head_dim', config['n_embd'] // config['n_head'])
512
- self.act_fn = ACTIVATION_MAP.get(config['arch'], silu)
513
- self.linear_attn = linear_attn
514
-
515
- # Importance accumulators: tensor_name → (sum_x2, count)
516
- self.importance = {}
517
-
518
- # HPC C library for accelerated forward pass
519
- self._hpc_lib = None
520
- try:
521
- script_dir = os.path.dirname(os.path.abspath(__file__))
522
- lib_path = os.path.join(script_dir, 'libhexstate_q2k.so')
523
- if os.path.exists(lib_path):
524
- lib = ctypes.CDLL(lib_path)
525
- if hasattr(lib, 'hexstate_forward_layer'):
526
- self._hpc_lib = lib
527
- if verbose:
528
- print(" HPC·Forward engine loaded (hexstate_forward_layer)")
529
- except Exception:
530
- pass
531
-
532
- def _record(self, name, x):
533
- """Record E[x²] for this tensor's input activation."""
534
- # x shape: [..., n_cols] — record per-column (input channel)
535
- x_flat = x.reshape(-1, x.shape[-1])
536
- x2 = np.sum(x_flat ** 2, axis=0)
537
- if name in self.importance:
538
- self.importance[name] = (
539
- self.importance[name][0] + x2,
540
- self.importance[name][1] + x_flat.shape[0],
541
- )
542
- else:
543
- self.importance[name] = (x2.copy(), x_flat.shape[0])
544
-
545
- def _get_weight(self, name):
546
- """Load weight, trying GGUF name patterns."""
547
- w = self.model.get_tensor_f32(name)
548
- if w is None:
549
- return None
550
- shape = self.model.get_tensor_shape(name)
551
- if shape and len(shape) >= 2:
552
- return w.reshape(shape)
553
- return w
554
-
555
- def _layer_prefix(self, layer_idx):
556
- return f"blk.{layer_idx}"
557
-
558
- def _hpc_forward_layer(self, hidden, layer_idx):
559
- """Full layer forward pass via C hexstate_forward_layer.
560
-
561
- Loads weights, creates ctypes pointers, calls C, reads back importance.
562
- Returns updated hidden state.
563
- """
564
- pfx = self._layer_prefix(layer_idx)
565
- cfg = self.cfg
566
- lib = self._hpc_lib
567
- seq_len = hidden.shape[0]
568
- n_embd = cfg['n_embd']
569
- n_head = cfg['n_head']
570
- n_head_kv = cfg['n_head_kv']
571
- if isinstance(n_head_kv, list):
572
- n_head_kv = n_head_kv[layer_idx]
573
- head_dim = self.head_dim
574
- eps = cfg['rms_eps']
575
-
576
- FP = ctypes.POINTER(ctypes.c_float)
577
- I64P = ctypes.POINTER(ctypes.c_int64)
578
-
579
- def _fp(arr):
580
- if arr is None: return ctypes.cast(None, FP), None
581
- a = np.ascontiguousarray(arr, dtype=np.float32)
582
- return a.ctypes.data_as(FP), a
583
-
584
- def _imp(name, dim):
585
- """Get or create importance accumulator, return (pointer, count_ptr, holder)."""
586
- if name not in self.importance:
587
- self.importance[name] = (np.zeros(dim, dtype=np.float32), 0)
588
- imp_arr = np.ascontiguousarray(self.importance[name][0], dtype=np.float32)
589
- cnt = ctypes.c_int64(self.importance[name][1])
590
- return imp_arr.ctypes.data_as(FP), ctypes.byref(cnt), imp_arr, cnt
591
-
592
- # Make hidden contiguous and get pointer
593
- hidden = np.ascontiguousarray(hidden, dtype=np.float32)
594
- h_ptr = hidden.ctypes.data_as(FP)
595
-
596
- # Load all weights for this layer
597
- norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
598
- if norm_w is None:
599
- return hidden
600
-
601
- qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
602
- q_w = self._get_weight(f'{pfx}.attn_q.weight')
603
- k_w = self._get_weight(f'{pfx}.attn_k.weight')
604
- v_w = self._get_weight(f'{pfx}.attn_v.weight')
605
- gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
606
- o_w = self._get_weight(f'{pfx}.attn_output.weight')
607
- ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
608
- if ffn_norm_w is None:
609
- ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
610
- ffn_gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
611
- ffn_up_w = self._get_weight(f'{pfx}.ffn_up.weight')
612
- ffn_down_w = self._get_weight(f'{pfx}.ffn_down.weight')
613
-
614
- # Prepare ctypes args (keep refs to prevent GC)
615
- refs = []
616
- def fp(arr):
617
- p, a = _fp(arr)
618
- refs.append(a)
619
- return p
620
-
621
- norm_p = fp(norm_w)
622
- qkv_p = fp(qkv_w)
623
- q_p = fp(q_w)
624
- k_p = fp(k_w)
625
- v_p = fp(v_w)
626
- gate_p = fp(gate_w)
627
- o_p = fp(o_w)
628
- ffn_norm_p = fp(ffn_norm_w)
629
- ffn_gate_p = fp(ffn_gate_w)
630
- ffn_up_p = fp(ffn_up_w)
631
- ffn_down_p = fp(ffn_down_w)
632
-
633
- qkv_dim = qkv_w.shape[0] if qkv_w is not None else 0
634
- q_dim_v = q_w.shape[0] if q_w is not None else 0
635
- k_dim_v = k_w.shape[0] if k_w is not None else 0
636
- v_dim_v = v_w.shape[0] if v_w is not None else 0
637
- gate_rows = gate_w.shape[0] if gate_w is not None else 0
638
- o_cols = o_w.shape[1] if (o_w is not None and o_w.ndim >= 2) else 0
639
- ffn_d = ffn_gate_w.shape[0] if ffn_gate_w is not None else 0
640
-
641
- # Importance accumulators
642
- imp_refs = [] # Keep alive
643
- null_fp = ctypes.cast(None, FP)
644
- null_i64p = ctypes.cast(None, I64P)
645
-
646
- def make_imp(name, dim):
647
- if dim <= 0:
648
- return null_fp, null_i64p
649
- p, cp, arr, cnt = _imp(name, dim)
650
- imp_refs.append((name, arr, cnt))
651
- return p, cp
652
-
653
- imp_qkv_p, cnt_qkv_p = make_imp(f'{pfx}.attn_qkv.weight', n_embd if qkv_w is not None else 0)
654
- imp_q_p, cnt_q_p = make_imp(f'{pfx}.attn_q.weight', n_embd if q_w is not None else 0)
655
- imp_k_p, cnt_k_p = make_imp(f'{pfx}.attn_k.weight', n_embd if k_w is not None else 0)
656
- imp_v_p, cnt_v_p = make_imp(f'{pfx}.attn_v.weight', n_embd if v_w is not None else 0)
657
- imp_gate_p, cnt_gate_p = make_imp(f'{pfx}.attn_gate.weight', n_head * head_dim if gate_w is not None else 0)
658
- imp_o_p, cnt_o_p = make_imp(f'{pfx}.attn_output.weight', o_cols if o_w is not None else 0)
659
- imp_fg_p, cnt_fg_p = make_imp(f'{pfx}.ffn_gate.weight', n_embd if ffn_gate_w is not None else 0)
660
- imp_fu_p, cnt_fu_p = make_imp(f'{pfx}.ffn_up.weight', n_embd if ffn_up_w is not None else 0)
661
- imp_fd_p, cnt_fd_p = make_imp(f'{pfx}.ffn_down.weight', ffn_d if ffn_down_w is not None else 0)
662
-
663
- # Call C function — entire layer in one call (FFN part will be NULL if MoE)
664
- lib.hexstate_forward_layer(
665
- h_ptr,
666
- norm_p,
667
- qkv_p, ctypes.c_int64(qkv_dim),
668
- q_p, ctypes.c_int64(q_dim_v),
669
- k_p, ctypes.c_int64(k_dim_v),
670
- v_p, ctypes.c_int64(v_dim_v),
671
- gate_p, ctypes.c_int64(gate_rows),
672
- o_p, ctypes.c_int64(o_cols),
673
- ffn_norm_p,
674
- ffn_gate_p, ffn_up_p, ffn_down_p,
675
- ctypes.c_int64(ffn_d),
676
- imp_qkv_p, cnt_qkv_p,
677
- imp_q_p, cnt_q_p,
678
- imp_k_p, cnt_k_p,
679
- imp_v_p, cnt_v_p,
680
- imp_gate_p, cnt_gate_p,
681
- imp_o_p, cnt_o_p,
682
- imp_fg_p, cnt_fg_p,
683
- imp_fu_p, cnt_fu_p,
684
- imp_fd_p, cnt_fd_p,
685
- ctypes.c_int64(seq_len), ctypes.c_int64(n_embd),
686
- ctypes.c_int64(n_head), ctypes.c_int64(n_head_kv),
687
- ctypes.c_int64(head_dim), ctypes.c_float(eps))
688
-
689
- # Read back importance for the tensors that WERE processed in C
690
- for name, arr, cnt in imp_refs:
691
- # Extract value from ctypes byref pointer
692
- self.importance[name] = (arr.astype(np.float64), cnt.value)
693
-
694
- # Handle MoE FFN if C code skipped it
695
- if ffn_gate_w is None:
696
- # Re-normalize for FFN
697
- normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, eps)
698
- hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
699
-
700
- # Force-free per-layer weight buffers (~1.4 GB) before next layer
701
- del refs, imp_refs
702
- import gc; gc.collect()
703
-
704
- return hidden
705
-
706
- def _forward_moe_ffn(self, hidden, normed_ff, pfx):
707
- """Python-side MoE FFN handling (supports packed and shared experts)."""
708
- gate_inp_w = self._get_weight(f'{pfx}.ffn_gate_inp.weight')
709
- if gate_inp_w is None:
710
- return hidden
711
-
712
- self._record(f'{pfx}.ffn_gate_inp.weight', normed_ff)
713
- router_logits = normed_ff @ gate_inp_w.T
714
- n_experts = router_logits.shape[-1]
715
- probs = softmax(router_logits, axis=-1)
716
- topk = self.cfg.get('expert_used_count', 2)
717
- top_k_indices = np.argsort(probs, axis=-1)[:, -topk:]
718
-
719
- ff_out = np.zeros_like(normed_ff)
720
-
721
- # Check for packed experts (Qwen style)
722
- p_gate = self._get_weight(f'{pfx}.ffn_gate_exps.weight')
723
- p_up = self._get_weight(f'{pfx}.ffn_up_exps.weight')
724
- p_down = self._get_weight(f'{pfx}.ffn_down_exps.weight')
725
-
726
- for exp_id in range(n_experts):
727
- if p_gate is not None:
728
- ew_gate = p_gate[exp_id]
729
- ew_up = p_up[exp_id]
730
- ew_down = p_down[exp_id]
731
- else:
732
- ew_gate = self._get_weight(f'{pfx}.ffn_gate.{exp_id}.weight')
733
- ew_up = self._get_weight(f'{pfx}.ffn_up.{exp_id}.weight')
734
- ew_down = self._get_weight(f'{pfx}.ffn_down.{exp_id}.weight')
735
-
736
- if ew_gate is None: continue
737
-
738
- mask_exp = np.any(top_k_indices == exp_id, axis=-1)
739
- if not np.any(mask_exp): continue
740
-
741
- exp_input = normed_ff[mask_exp]
742
-
743
- # Record importance
744
- if p_gate is not None:
745
- self._record(f'{pfx}.ffn_gate_exps.weight', exp_input)
746
- self._record(f'{pfx}.ffn_up_exps.weight', exp_input)
747
- else:
748
- self._record(f'{pfx}.ffn_gate.{exp_id}.weight', exp_input)
749
- self._record(f'{pfx}.ffn_up.{exp_id}.weight', exp_input)
750
-
751
- g = self.act_fn(exp_input @ ew_gate.T)
752
- u = exp_input @ ew_up.T
753
- mid = g * u
754
-
755
- if p_gate is not None:
756
- self._record(f'{pfx}.ffn_down_exps.weight', mid)
757
- else:
758
- self._record(f'{pfx}.ffn_down.{exp_id}.weight', mid)
759
-
760
- exp_out = mid @ ew_down.T
761
- indices = np.where(mask_exp)[0]
762
- for i, tidx in enumerate(indices):
763
- w = probs[tidx, exp_id]
764
- ff_out[tidx] += w * exp_out[i]
765
-
766
- # Shared experts (Qwen style)
767
- sh_gate = self._get_weight(f'{pfx}.ffn_gate_shexp.weight')
768
- if sh_gate is not None:
769
- sh_up = self._get_weight(f'{pfx}.ffn_up_shexp.weight')
770
- sh_down = self._get_weight(f'{pfx}.ffn_down_shexp.weight')
771
- self._record(f'{pfx}.ffn_gate_shexp.weight', normed_ff)
772
- self._record(f'{pfx}.ffn_up_shexp.weight', normed_ff)
773
- g = self.act_fn(normed_ff @ sh_gate.T)
774
- u = normed_ff @ sh_up.T
775
- mid = g * u
776
- self._record(f'{pfx}.ffn_down_shexp.weight', mid)
777
- ff_out += mid @ sh_down.T
778
-
779
- return hidden + ff_out
780
-
781
- def _hpc_rms_norm(self, x, weight, eps):
782
- """RMS norm via HPC C library, falling back to numpy."""
783
- if self._hpc_lib and x.flags['C_CONTIGUOUS']:
784
- seq_len, dim = x.shape
785
- out = np.empty_like(x)
786
- w = np.ascontiguousarray(weight, dtype=np.float32)
787
- self._hpc_lib.hexstate_rms_norm(
788
- x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
789
- w.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
790
- out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
791
- ctypes.c_int64(seq_len), ctypes.c_int64(dim),
792
- ctypes.c_float(eps))
793
- return out
794
- return rms_norm(x, weight, eps)
795
-
796
- def _hpc_matmul_record(self, name, x, weight):
797
- """Fused matmul + importance recording via HPC C library.
798
-
799
- Uses HPCGraph phase-coherent importance modulation (see hexstate_matmul_record in C)
800
- for the E[x²] accumulation, but delegates the actual matmul to numpy BLAS
801
- for maximum speed on large matrices.
802
- Returns x @ weight.T while recording importance for `name`.
803
- """
804
- if self._hpc_lib and x.flags['C_CONTIGUOUS'] and weight.flags['C_CONTIGUOUS']:
805
- M, K = x.shape
806
- N = weight.shape[0] # weight is [N, K], computing x @ W.T -> [M, N]
807
-
808
- # HPC importance: C library builds HPCGraph over columns,
809
- # encodes x² as triality amplitudes, CZ-couples adjacent columns,
810
- # and modulates importance by hpc_marginal phase coherence.
811
- if name not in self.importance:
812
- self.importance[name] = (np.zeros(K, dtype=np.float64), 0)
813
- imp_f32 = self.importance[name][0].astype(np.float32)
814
- count = ctypes.c_int64(self.importance[name][1])
815
-
816
- # Pass real weights to C library for importance recording
817
- weight_ptr = weight.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
818
- # Dummy output — we only want the importance recording
819
- dummy_out = np.empty((M, 1), dtype=np.float32)
820
-
821
- self._hpc_lib.hexstate_matmul_record(
822
- x.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
823
- weight_ptr,
824
- dummy_out.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
825
- imp_f32.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
826
- ctypes.c_int64(M), ctypes.c_int64(K), ctypes.c_int64(N),
827
- ctypes.byref(count))
828
-
829
- self.importance[name] = (imp_f32.astype(np.float64), count.value)
830
-
831
- # Matmul via numpy BLAS (much faster than our C loop for large N)
832
- return x @ weight.T
833
-
834
- # Fallback: pure numpy
835
- self._record(name, x)
836
- return x @ weight.T
837
-
838
- def forward_layer_linear(self, hidden, layer_idx):
839
- """HPC-linearized forward: O(seq) attention for imatrix collection.
840
-
841
- Instead of full O(seq²) softmax attention, uses causal linear attention:
842
- each position's output is a running weighted average of V, where weights
843
- come from Q·K similarity in phase space. This preserves activation
844
- magnitude statistics (which is all imatrix needs) while being O(seq).
845
-
846
- Records identical importance stats as the full forward_layer.
847
- """
848
- pfx = self._layer_prefix(layer_idx)
849
- cfg = self.cfg
850
- n_head = cfg['n_head']
851
- n_head_kv = cfg['n_head_kv']
852
- if isinstance(n_head_kv, list):
853
- n_head_kv = n_head_kv[layer_idx]
854
- seq_len = hidden.shape[0]
855
-
856
- # ── Attention norm ──
857
- attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
858
- if attn_norm_w is None:
859
- return hidden
860
- normed = self._hpc_rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
861
-
862
- # ── Check for fused vs separate QKV ──
863
- qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
864
- gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
865
- q_w = self._get_weight(f'{pfx}.attn_q.weight')
866
- k_w = self._get_weight(f'{pfx}.attn_k.weight')
867
- v_w = self._get_weight(f'{pfx}.attn_v.weight')
868
- o_w = self._get_weight(f'{pfx}.attn_output.weight')
869
-
870
- if qkv_w is not None:
871
- # ── Fused QKV path (Qwen 3.6 hybrid layers) ──
872
- head_dim = self.head_dim
873
- q_dim = n_head * head_dim
874
- kv_dim = n_head_kv * head_dim
875
- qkv = self._hpc_matmul_record(f'{pfx}.attn_qkv.weight', normed, qkv_w)
876
- q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
877
- k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
878
- v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)
879
-
880
- # GQA expand
881
- if n_head_kv < n_head:
882
- rep = n_head // n_head_kv
883
- k = np.repeat(k, rep, axis=1)
884
- v = np.repeat(v, rep, axis=1)
885
-
886
- # ── Linear attention: O(seq × head_dim²) ──
887
- # φ(x) = elu(x) + 1 (feature map for linear attention)
888
- q_feat = np.maximum(q, 0) + 1e-6 # [seq, n_head, head_dim]
889
- k_feat = np.maximum(k, 0) + 1e-6
890
-
891
- # Causal linear attention via running state (vectorized over heads):
892
- # S_t = S_{t-1} + k_t ⊗ v_t (outer product accumulator)
893
- # z_t = z_{t-1} + k_t (normalizer accumulator)
894
- # out_t = (q_t @ S_t) / (q_t · z_t)
895
- out = np.zeros_like(q) # [seq, n_head, head_dim]
896
- S = np.zeros((n_head, head_dim, head_dim), dtype=np.float32)
897
- z = np.zeros((n_head, head_dim), dtype=np.float32)
898
-
899
- for t in range(seq_len):
900
- # Vectorized over all heads: [n_head, head_dim]
901
- kt = k_feat[t] # [n_head, head_dim]
902
- vt = v[t] # [n_head, head_dim]
903
- qt = q_feat[t] # [n_head, head_dim]
904
- # S[h] += outer(kt[h], vt[h]) for all h at once
905
- S += kt[:, :, None] * vt[:, None, :] # [n_head, hd, hd]
906
- z += kt # [n_head, hd]
907
- # num = qt @ S -> [n_head, head_dim]
908
- num = np.einsum('hd,hde->he', qt, S)
909
- den = np.sum(qt * z, axis=-1, keepdims=True) + 1e-8 # [n_head, 1]
910
- out[t] = num / den
911
-
912
- attn_result = out.reshape(seq_len, -1) # [seq, n_head * head_dim]
913
-
914
- # Record and project
915
- if gate_w is not None:
916
- self._record(f'{pfx}.attn_gate.weight', attn_result)
917
- if gate_w.shape[1] == hidden.shape[-1]:
918
- attn_out = attn_result @ gate_w
919
- else:
920
- attn_out = attn_result @ gate_w.T
921
- else:
922
- attn_out = np.zeros_like(hidden)
923
-
924
- elif q_w is not None and k_w is not None and v_w is not None and o_w is not None:
925
- # ── Separate QKV path (standard transformer layers) ──
926
- q = self._hpc_matmul_record(f'{pfx}.attn_q.weight', normed, q_w)
927
- k = self._hpc_matmul_record(f'{pfx}.attn_k.weight', normed, k_w)
928
- v = self._hpc_matmul_record(f'{pfx}.attn_v.weight', normed, v_w)
929
-
930
- head_dim_q = q_w.shape[0] // n_head
931
- head_dim_kv = k_w.shape[0] // n_head_kv
932
-
933
- q = q.reshape(seq_len, n_head, head_dim_q)
934
- k = k.reshape(seq_len, n_head_kv, head_dim_kv)
935
- v = v.reshape(seq_len, n_head_kv, head_dim_kv)
936
-
937
- if n_head_kv < n_head:
938
- rep = n_head // n_head_kv
939
- k = np.repeat(k, rep, axis=1)
940
- v = np.repeat(v, rep, axis=1)
941
-
942
- # Linear attention with feature map
943
- q_feat = np.maximum(q, 0) + 1e-6
944
- k_feat = np.maximum(k, 0) + 1e-6
945
-
946
- out = np.zeros_like(v) # [seq, n_head, head_dim_kv]
947
- S = np.zeros((n_head, head_dim_kv, head_dim_kv), dtype=np.float32)
948
- z = np.zeros((n_head, head_dim_kv), dtype=np.float32)
949
-
950
- # Use min of q/k dims for the state accumulator
951
- feat_dim = min(head_dim_q, head_dim_kv)
952
- S = np.zeros((n_head, feat_dim, head_dim_kv), dtype=np.float32)
953
- z = np.zeros((n_head, feat_dim), dtype=np.float32)
954
-
955
- for t in range(seq_len):
956
- # Vectorized over all heads
957
- kf = k_feat[t, :, :feat_dim] # [n_head, feat_dim]
958
- qf = q_feat[t, :, :feat_dim] # [n_head, feat_dim]
959
- vt = v[t] # [n_head, head_dim_kv]
960
- S += kf[:, :, None] * vt[:, None, :] # [n_head, feat_dim, head_dim_kv]
961
- z += kf # [n_head, feat_dim]
962
- num = np.einsum('hd,hde->he', qf, S) # [n_head, head_dim_kv]
963
- den = np.sum(qf * z, axis=-1, keepdims=True) + 1e-8
964
- out[t] = num / den
965
-
966
- attn_result = out.reshape(seq_len, -1)
967
-
968
- # Pad/truncate to match o_w input size
969
- if attn_result.shape[-1] != o_w.shape[1]:
970
- if attn_result.shape[-1] < o_w.shape[1]:
971
- padded = np.zeros((seq_len, o_w.shape[1]), dtype=attn_result.dtype)
972
- padded[:, :attn_result.shape[-1]] = attn_result
973
- attn_result = padded
974
- else:
975
- attn_result = attn_result[:, :o_w.shape[1]]
976
-
977
- self._record(f'{pfx}.attn_output.weight', attn_result)
978
- attn_out = attn_result @ o_w.T
979
- else:
980
- return hidden
981
-
982
- hidden = hidden + attn_out
983
-
984
- # ── SSM path (Qwen 3.6 hybrid) ──
985
- ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
986
- ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
987
- ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')
988
- if ssm_alpha_w is not None:
989
- self._record(f'{pfx}.ssm_alpha.weight', normed)
990
- if ssm_beta_w is not None:
991
- self._record(f'{pfx}.ssm_beta.weight', normed)
992
- if ssm_out_w is not None:
993
- if qkv_w is not None:
994
- qkv_full = normed @ qkv_w.T
995
- ssm_proxy = qkv_full[:, :ssm_out_w.shape[1]] if qkv_full.shape[-1] >= ssm_out_w.shape[1] else normed
996
- else:
997
- ssm_proxy = normed
998
- self._record(f'{pfx}.ssm_out.weight', ssm_proxy)
999
- if ssm_out_w.shape[0] == hidden.shape[-1]:
1000
- hidden = hidden + ssm_proxy @ ssm_out_w.T
1001
-
1002
- # ── FFN ──
1003
- ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
1004
- if ffn_norm_w is None:
1005
- ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
1006
- if ffn_norm_w is None:
1007
- return hidden
1008
-
1009
- normed_ff = self._hpc_rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
1010
-
1011
- gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
1012
- up_w = self._get_weight(f'{pfx}.ffn_up.weight')
1013
- down_w = self._get_weight(f'{pfx}.ffn_down.weight')
1014
-
1015
- if gate_fw is not None and up_w is not None and down_w is not None:
1016
- gate_out = self.act_fn(self._hpc_matmul_record(f'{pfx}.ffn_gate.weight', normed_ff, gate_fw))
1017
- up_out = self._hpc_matmul_record(f'{pfx}.ffn_up.weight', normed_ff, up_w)
1018
- ff_mid = gate_out * up_out
1019
- self._record(f'{pfx}.ffn_down.weight', ff_mid)
1020
- ff_out = ff_mid @ down_w.T
1021
- hidden = hidden + ff_out
1022
-
1023
- return hidden
1024
-
1025
- def forward_layer(self, hidden, layer_idx, cos_f, sin_f):
1026
- """Forward pass through one transformer layer. Returns new hidden state."""
1027
- pfx = self._layer_prefix(layer_idx)
1028
- cfg = self.cfg
1029
- n_head = cfg['n_head']
1030
- n_head_kv = cfg['n_head_kv']
1031
- if isinstance(n_head_kv, list):
1032
- n_head_kv = n_head_kv[layer_idx]
1033
- head_dim = self.head_dim
1034
- seq_len = hidden.shape[0]
1035
-
1036
- # ── Attention ──
1037
- attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
1038
- if attn_norm_w is None:
1039
- return hidden # Skip if weights missing
1040
-
1041
- normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
1042
-
1043
- # Q/K/V projections — record importance on the INPUT (normed)
1044
- q_w = self._get_weight(f'{pfx}.attn_q.weight')
1045
- k_w = self._get_weight(f'{pfx}.attn_k.weight')
1046
- v_w = self._get_weight(f'{pfx}.attn_v.weight')
1047
- o_w = self._get_weight(f'{pfx}.attn_output.weight')
1048
-
1049
- if q_w is None or k_w is None or v_w is None or o_w is None:
1050
- return hidden
1051
-
1052
- self._record(f'{pfx}.attn_q.weight', normed)
1053
- self._record(f'{pfx}.attn_k.weight', normed)
1054
- self._record(f'{pfx}.attn_v.weight', normed)
1055
-
1056
- q = normed @ q_w.T # [seq, q_w.shape[0]]
1057
- k = normed @ k_w.T # [seq, k_w.shape[0]]
1058
- v = normed @ v_w.T
1059
-
1060
- # Dynamic head_dim based on tensor size
1061
- head_dim_q = q_w.shape[0] // n_head
1062
- head_dim_kv = k_w.shape[0] // n_head_kv
1063
-
1064
- q = q.reshape(seq_len, n_head, head_dim_q)
1065
- k = k.reshape(seq_len, n_head_kv, head_dim_kv)
1066
- v = v.reshape(seq_len, n_head_kv, head_dim_kv)
1067
-
1068
- # Apply RoPE
1069
- if head_dim_q != head_dim:
1070
- cos_q, sin_q = rope_freqs(head_dim_q, seq_len, cfg['rope_base'])
1071
- q = apply_rope(q, cos_q, sin_q)
1072
- else:
1073
- q = apply_rope(q, cos_f, sin_f)
1074
-
1075
- if head_dim_kv != head_dim:
1076
- cos_k, sin_k = rope_freqs(head_dim_kv, seq_len, cfg['rope_base'])
1077
- k = apply_rope(k, cos_k, sin_k)
1078
- else:
1079
- k = apply_rope(k, cos_f, sin_f)
1080
-
1081
- # GQA: repeat KV heads
1082
- if n_head_kv < n_head:
1083
- rep = n_head // n_head_kv
1084
- k = np.repeat(k, rep, axis=1)
1085
- v = np.repeat(v, rep, axis=1)
1086
-
1087
- q_t = q.transpose(1, 0, 2) # [n_head, seq, head_dim_q]
1088
- k_t = k.transpose(1, 0, 2) # [n_head, seq, head_dim_kv]
1089
- v_t = v.transpose(1, 0, 2) # [n_head, seq, head_dim_kv]
1090
-
1091
- scale = 1.0 / np.sqrt(head_dim_q)
1092
-
1093
- # If Q and K head dims differ, there might be a projection or it's not standard SDP.
1094
- # But for importance calculation, if we just need to get the attention magnitude:
1095
- # We can pad K to match Q, or truncate Q to match K. We only need an approximation.
1096
- if head_dim_q != head_dim_kv:
1097
- if head_dim_q > head_dim_kv:
1098
- k_t_padded = np.zeros_like(q_t)
1099
- k_t_padded[..., :head_dim_kv] = k_t
1100
- k_t = k_t_padded
1101
- else:
1102
- q_t_padded = np.zeros_like(k_t)
1103
- q_t_padded[..., :head_dim_q] = q_t
1104
- q_t = q_t_padded
1105
-
1106
- attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale # [n_head, seq, seq]
1107
-
1108
- # Causal mask (with optional sliding window)
1109
- mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
1110
- swa = cfg.get('swa_window', 0)
1111
- if swa and swa > 0:
1112
- for i in range(seq_len):
1113
- for j in range(0, max(0, i - swa)):
1114
- mask[i, j] = -1e9
1115
- attn = attn + mask[np.newaxis, :, :]
1116
- attn = softmax(attn, axis=-1)
1117
-
1118
- out = np.matmul(attn, v_t) # [n_head, seq, head_dim_kv]
1119
-
1120
- # Output projection input is out_w.T -> [in_features, out_features]
1121
- # In_features is out_w.shape[1]
1122
- out = out.transpose(1, 0, 2).reshape(seq_len, -1) # [seq, n_head * head_dim_kv]
1123
-
1124
- # Pad or truncate out to match expected input size of o_w
1125
- if out.shape[-1] != o_w.shape[1]:
1126
- if out.shape[-1] < o_w.shape[1]:
1127
- out_padded = np.zeros((seq_len, o_w.shape[1]), dtype=out.dtype)
1128
- out_padded[:, :out.shape[-1]] = out
1129
- out = out_padded
1130
- else:
1131
- out = out[:, :o_w.shape[1]]
1132
-
1133
- self._record(f'{pfx}.attn_output.weight', out)
1134
- attn_out = out @ o_w.T
1135
-
1136
- hidden = hidden + attn_out
1137
-
1138
- # ── FFN ──
1139
- ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
1140
- if ffn_norm_w is None:
1141
- return hidden
1142
-
1143
- normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
1144
-
1145
- gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
1146
- up_w = self._get_weight(f'{pfx}.ffn_up.weight')
1147
- down_w = self._get_weight(f'{pfx}.ffn_down.weight')
1148
-
1149
- if gate_w is not None and up_w is not None and down_w is not None:
1150
- self._record(f'{pfx}.ffn_gate.weight', normed_ff)
1151
- self._record(f'{pfx}.ffn_up.weight', normed_ff)
1152
-
1153
- gate_out = self.act_fn(normed_ff @ gate_w.T)
1154
- up_out = normed_ff @ up_w.T
1155
- ff_mid = gate_out * up_out
1156
-
1157
- self._record(f'{pfx}.ffn_down.weight', ff_mid)
1158
- ff_out = ff_mid @ down_w.T
1159
- hidden = hidden + ff_out
1160
- else:
1161
- # MoE path
1162
- hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
1163
-
1164
- return hidden
1165
-
1166
- def forward_linear_attn_layer(self, hidden, layer_idx):
1167
- """Forward pass through a DeltaNet (gated linear attention) layer.
1168
-
1169
- Used by Qwen 3.5/3.6 for ~75% of layers. Records importance stats
1170
- for all SSM projection weights.
1171
- """
1172
- pfx = self._layer_prefix(layer_idx)
1173
- cfg = self.cfg
1174
- seq_len = hidden.shape[0]
1175
-
1176
- # ── Attention norm ──
1177
- attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
1178
- if attn_norm_w is None:
1179
- return hidden
1180
- normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
1181
-
1182
- # ── DeltaNet projections ──
1183
- qkv_w = self._get_weight(f'{pfx}.ssm_in_qkv.weight')
1184
- z_w = self._get_weight(f'{pfx}.ssm_in_z.weight')
1185
- a_w = self._get_weight(f'{pfx}.ssm_in_a.weight')
1186
- b_w = self._get_weight(f'{pfx}.ssm_in_b.weight')
1187
- out_w = self._get_weight(f'{pfx}.ssm_out.weight')
1188
-
1189
- if qkv_w is None or out_w is None:
1190
- return hidden
1191
-
1192
- # Record importance on input activations
1193
- self._record(f'{pfx}.ssm_in_qkv.weight', normed)
1194
- if z_w is not None:
1195
- self._record(f'{pfx}.ssm_in_z.weight', normed)
1196
- if a_w is not None:
1197
- self._record(f'{pfx}.ssm_in_a.weight', normed)
1198
- if b_w is not None:
1199
- self._record(f'{pfx}.ssm_in_b.weight', normed)
1200
-
1201
- # Approximate forward: project through QKV and output
1202
- # (Full DeltaNet recurrence is complex; for importance collection
1203
- # we just need the activation magnitudes at each projection)
1204
- qkv = normed @ qkv_w.T
1205
-
1206
- # For importance: record output projection input
1207
- # Use qkv as a proxy for the recurrent state output
1208
- n_out = out_w.shape[1] if out_w.ndim >= 2 else hidden.shape[-1]
1209
- if qkv.shape[-1] >= n_out:
1210
- out_input = qkv[:, :n_out]
1211
- else:
1212
- out_input = qkv
1213
- self._record(f'{pfx}.ssm_out.weight', out_input)
1214
-
1215
- attn_out = out_input @ out_w.T
1216
- hidden = hidden + attn_out
1217
-
1218
- # ── FFN (same as standard transformer) ──
1219
- ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
1220
- if ffn_norm_w is None:
1221
- return hidden
1222
-
1223
- normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
1224
-
1225
- gate_w = self._get_weight(f'{pfx}.ffn_gate.weight')
1226
- up_w = self._get_weight(f'{pfx}.ffn_up.weight')
1227
- down_w = self._get_weight(f'{pfx}.ffn_down.weight')
1228
-
1229
- if gate_w is not None and up_w is not None and down_w is not None:
1230
- self._record(f'{pfx}.ffn_gate.weight', normed_ff)
1231
- self._record(f'{pfx}.ffn_up.weight', normed_ff)
1232
- gate_out = self.act_fn(normed_ff @ gate_w.T)
1233
- up_out = normed_ff @ up_w.T
1234
- ff_mid = gate_out * up_out
1235
- self._record(f'{pfx}.ffn_down.weight', ff_mid)
1236
- ff_out = ff_mid @ down_w.T
1237
- hidden = hidden + ff_out
1238
- else:
1239
- hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
1240
-
1241
- return hidden
1242
-
1243
- def forward_qwen35_layer(self, hidden, layer_idx, cos_f, sin_f):
1244
- """Forward pass through a Qwen 3.6 hybrid layer (attention + SSM).
1245
-
1246
- Qwen 3.6 uses:
1247
- - Fused attn_qkv.weight (Q+K+V in one tensor)
1248
- - attn_gate.weight (gated attention output, not attn_output)
1249
- - SSM tensors: ssm_alpha, ssm_beta, ssm_conv1d, ssm_out
1250
- - post_attention_norm.weight (instead of ffn_norm)
1251
- """
1252
- pfx = self._layer_prefix(layer_idx)
1253
- cfg = self.cfg
1254
- n_head = cfg['n_head']
1255
- n_head_kv = cfg['n_head_kv']
1256
- if isinstance(n_head_kv, list):
1257
- n_head_kv = n_head_kv[layer_idx]
1258
- head_dim = self.head_dim
1259
- seq_len = hidden.shape[0]
1260
-
1261
- # ── Attention norm ──
1262
- attn_norm_w = self._get_weight(f'{pfx}.attn_norm.weight')
1263
- if attn_norm_w is None:
1264
- return hidden
1265
- normed = rms_norm(hidden, attn_norm_w, cfg['rms_eps'])
1266
-
1267
- # ── Fused QKV projection ──
1268
- qkv_w = self._get_weight(f'{pfx}.attn_qkv.weight')
1269
- gate_w = self._get_weight(f'{pfx}.attn_gate.weight')
1270
-
1271
- attn_out_vec = np.zeros_like(hidden)
1272
- if qkv_w is not None:
1273
- self._record(f'{pfx}.attn_qkv.weight', normed)
1274
-
1275
- qkv = normed @ qkv_w.T # [seq, (n_head + 2*n_head_kv) * head_dim]
1276
-
1277
- # Split into Q, K, V
1278
- q_dim = n_head * head_dim
1279
- kv_dim = n_head_kv * head_dim
1280
- q = qkv[:, :q_dim].reshape(seq_len, n_head, head_dim)
1281
- k = qkv[:, q_dim:q_dim + kv_dim].reshape(seq_len, n_head_kv, head_dim)
1282
- v = qkv[:, q_dim + kv_dim:q_dim + 2 * kv_dim].reshape(seq_len, n_head_kv, head_dim)
1283
-
1284
- # RoPE
1285
- q = apply_rope(q, cos_f, sin_f)
1286
- k = apply_rope(k, cos_f, sin_f)
1287
-
1288
- # GQA: repeat KV heads
1289
- if n_head_kv < n_head:
1290
- rep = n_head // n_head_kv
1291
- k = np.repeat(k, rep, axis=1)
1292
- v = np.repeat(v, rep, axis=1)
1293
-
1294
- # Scaled dot-product attention
1295
- q_t = q.transpose(1, 0, 2) # [n_head, seq, head_dim]
1296
- k_t = k.transpose(1, 0, 2)
1297
- v_t = v.transpose(1, 0, 2)
1298
-
1299
- scale = 1.0 / np.sqrt(head_dim)
1300
- attn = np.matmul(q_t, k_t.transpose(0, 2, 1)) * scale
1301
-
1302
- # Causal mask (with optional SWA)
1303
- mask = np.triu(np.full((seq_len, seq_len), -1e9, dtype=np.float32), k=1)
1304
- swa = cfg.get('swa_window', 0)
1305
- if swa and swa > 0:
1306
- for i in range(seq_len):
1307
- for j in range(0, max(0, i - swa)):
1308
- mask[i, j] = -1e9
1309
- attn = attn + mask[np.newaxis, :, :]
1310
- attn = softmax(attn, axis=-1)
1311
-
1312
- out = np.matmul(attn, v_t)
1313
- attn_result = out.transpose(1, 0, 2).reshape(seq_len, -1) # [seq, n_head*head_dim]
1314
-
1315
- # Gated attention output
1316
- if gate_w is not None:
1317
- self._record(f'{pfx}.attn_gate.weight', attn_result)
1318
- # Some GGUF tensors are transposed. Ensure output matches hidden dim.
1319
- if gate_w.shape[1] == hidden.shape[-1]:
1320
- attn_out_vec = attn_result @ gate_w
1321
- else:
1322
- attn_out_vec = attn_result @ gate_w.T
1323
-
1324
- # ── SSM path ──
1325
- ssm_alpha_w = self._get_weight(f'{pfx}.ssm_alpha.weight')
1326
- ssm_beta_w = self._get_weight(f'{pfx}.ssm_beta.weight')
1327
- ssm_conv_w = self._get_weight(f'{pfx}.ssm_conv1d.weight')
1328
- ssm_out_w = self._get_weight(f'{pfx}.ssm_out.weight')
1329
-
1330
- ssm_out_vec = np.zeros_like(hidden)
1331
- if ssm_alpha_w is not None:
1332
- self._record(f'{pfx}.ssm_alpha.weight', normed)
1333
- if ssm_beta_w is not None:
1334
- self._record(f'{pfx}.ssm_beta.weight', normed)
1335
- if ssm_conv_w is not None:
1336
- # ssm_conv1d input is the QKV projection (reuse from attention)
1337
- if qkv_w is not None:
1338
- qkv_for_ssm = normed @ qkv_w.T
1339
- self._record(f'{pfx}.ssm_conv1d.weight', qkv_for_ssm)
1340
- if ssm_out_w is not None:
1341
- # SSM output projection — use qkv output as proxy for recurrent output
1342
- # (qkv is 10240, ssm_out expects 6144)
1343
- if 'qkv' in locals() and qkv.shape[-1] >= ssm_out_w.shape[1]:
1344
- ssm_proxy = qkv[:, :ssm_out_w.shape[1]]
1345
- else:
1346
- # Fallback zero pad
1347
- ssm_proxy = np.zeros((seq_len, ssm_out_w.shape[1]), dtype=np.float32)
1348
-
1349
- self._record(f'{pfx}.ssm_out.weight', ssm_proxy)
1350
-
1351
- # Note: We do NOT need to actually add the SSM output vector to hidden
1352
- # for importance matrix calculation. We just need to record the inputs
1353
- # to all quantized layers. The actual output isn't critical since we
1354
- # aren't doing loss backprop. But if we do, it must match hidden's dimension.
1355
- if ssm_out_w.shape[0] == hidden.shape[-1]:
1356
- ssm_out_vec = ssm_proxy @ ssm_out_w.T
1357
-
1358
- # Combine attention + SSM
1359
- hidden = hidden + attn_out_vec + ssm_out_vec
1360
-
1361
- # ── FFN (uses post_attention_norm instead of ffn_norm) ──
1362
- ffn_norm_w = self._get_weight(f'{pfx}.post_attention_norm.weight')
1363
- if ffn_norm_w is None:
1364
- ffn_norm_w = self._get_weight(f'{pfx}.ffn_norm.weight')
1365
- if ffn_norm_w is None:
1366
- return hidden
1367
-
1368
- normed_ff = rms_norm(hidden, ffn_norm_w, cfg['rms_eps'])
1369
-
1370
- gate_fw = self._get_weight(f'{pfx}.ffn_gate.weight')
1371
- up_w = self._get_weight(f'{pfx}.ffn_up.weight')
1372
- down_w = self._get_weight(f'{pfx}.ffn_down.weight')
1373
-
1374
- if gate_fw is not None and up_w is not None and down_w is not None:
1375
- self._record(f'{pfx}.ffn_gate.weight', normed_ff)
1376
- self._record(f'{pfx}.ffn_up.weight', normed_ff)
1377
- gate_out = self.act_fn(normed_ff @ gate_fw.T)
1378
- up_out = normed_ff @ up_w.T
1379
- ff_mid = gate_out * up_out
1380
- self._record(f'{pfx}.ffn_down.weight', ff_mid)
1381
- ff_out = ff_mid @ down_w.T
1382
- hidden = hidden + ff_out
1383
- else:
1384
- hidden = self._forward_moe_ffn(hidden, normed_ff, pfx)
1385
-
1386
- return hidden
1387
-
1388
- def forward(self, token_ids):
1389
- """Full forward pass, collecting importance statistics."""
1390
- cfg = self.cfg
1391
- seq_len = len(token_ids)
1392
-
1393
- # Embedding
1394
- embed_w = self._get_weight('token_embd.weight')
1395
- if embed_w is None:
1396
- raise RuntimeError("Missing token_embd.weight")
1397
-
1398
- hidden = embed_w[token_ids].copy() # [seq_len, n_embd]
1399
- del embed_w # Free ~5 GB embedding table before layer loop
1400
-
1401
- # RoPE frequencies
1402
- cos_f, sin_f = rope_freqs(self.head_dim, seq_len, cfg['rope_base'])
1403
-
1404
- # Process each layer
1405
- for layer_idx in range(cfg['n_layers']):
1406
- pfx = f"blk.{layer_idx}"
1407
-
1408
- if self._hpc_lib and self.linear_attn:
1409
- # Pure HPC C forward: entire layer in one C call
1410
- hidden = self._hpc_forward_layer(hidden, layer_idx)
1411
- elif self.linear_attn:
1412
- # Python HPC-linearized attention: O(seq) per layer
1413
- hidden = self.forward_layer_linear(hidden, layer_idx)
1414
- else:
1415
- has_fused_qkv = f'{pfx}.attn_qkv.weight' in self.model.tensor_infos
1416
- has_separate_q = f'{pfx}.attn_q.weight' in self.model.tensor_infos
1417
- has_linear_attn = f'{pfx}.ssm_in_qkv.weight' in self.model.tensor_infos
1418
-
1419
- if has_fused_qkv:
1420
- hidden = self.forward_qwen35_layer(hidden, layer_idx, cos_f, sin_f)
1421
- elif has_linear_attn and not has_separate_q:
1422
- hidden = self.forward_linear_attn_layer(hidden, layer_idx)
1423
- else:
1424
- hidden = self.forward_layer(hidden, layer_idx, cos_f, sin_f)
1425
- if self.verbose and (layer_idx + 1) % 4 == 0:
1426
- print(f" Layer {layer_idx + 1}/{cfg['n_layers']}", end='\r')
1427
-
1428
- # Output projection — check existence without loading the full 5 GB tensor
1429
- if 'output.weight' in self.model.tensor_infos:
1430
- self._record('output.weight', hidden)
1431
-
1432
- return hidden
1433
-
1434
-
1435
- # ─── HPC Cross-Layer Importance Propagation ─────────────────────────────────
1436
-
1437
def _blk_layer_index(name, n_layers):
    """Return the layer index encoded in a `blk.<idx>....` tensor name, else None.

    None is returned for names that do not start with `blk.`, whose index is
    not an integer, or whose index falls outside `[0, n_layers)`.
    """
    parts = name.split('.')
    if len(parts) < 2 or parts[0] != 'blk':
        return None
    try:
        idx = int(parts[1])
    except ValueError:
        return None
    return idx if 0 <= idx < n_layers else None


def hpc_propagate_importance(importance_dict, n_layers, verbose=False):
    """Rescale per-layer importance with neighbour-coupled iterative smoothing.

    Each layer's energy is the average, over its tensors, of mean(sum_x2 / count),
    normalized so the largest layer energy is 1. Fifty damped iterations then
    compute a multiplier per layer from its own energy plus 0.3x the average
    weighted energy of its adjacent layers. The multipliers are renormalized
    to mean 1 every iteration.

    Args:
        importance_dict: maps tensor name -> (sum_x2 array, count).
        n_layers: number of layers; only `blk.<i>.` names with
            0 <= i < n_layers are rescaled.
        verbose: print the resulting multipliers.

    Returns:
        A new dict with `sum_x2` of layer tensors multiplied by that layer's
        multiplier (counts and non-layer tensors untouched). The input dict
        itself is returned when there are no layers or all energies are ~0.
    """
    if n_layers <= 0:
        # No layers to couple; np.max below would fail on an empty array.
        return importance_dict

    # Group tensors by layer
    layer_energies = np.zeros(n_layers, dtype=np.float64)
    layer_tensor_count = np.zeros(n_layers, dtype=np.int32)
    layer_of = {}

    for name, (sum_x2, count) in importance_dict.items():
        idx = _blk_layer_index(name, n_layers)
        layer_of[name] = idx
        if idx is not None:
            layer_energies[idx] += np.mean(sum_x2 / max(count, 1))
            layer_tensor_count[idx] += 1

    populated = layer_tensor_count > 0
    layer_energies[populated] /= layer_tensor_count[populated]

    peak = np.max(layer_energies)
    if peak < 1e-30:
        return importance_dict
    layer_energies /= peak

    # Number of adjacent layers per layer (1 at the ends, 2 inside, 0 if alone).
    n_nbr = np.zeros(n_layers, dtype=np.float64)
    n_nbr[1:] += 1
    n_nbr[:-1] += 1

    multipliers = np.ones(n_layers, dtype=np.float64)
    temperature = 0.5

    for _ in range(50):
        # All updates read the previous iteration's multipliers.
        weighted = layer_energies * multipliers
        e_nbr = np.zeros(n_layers, dtype=np.float64)
        e_nbr[1:] += weighted[:-1]
        e_nbr[:-1] += weighted[1:]
        e_nbr /= np.maximum(n_nbr, 1)

        # Clamp energy to prevent exponential explosion (max exp(5) ~ 148)
        energy = np.clip((layer_energies + 0.3 * e_nbr) / temperature, -10, 5)
        new_mult = np.exp(energy)

        mean_m = np.mean(new_mult)
        if mean_m > 1e-30:
            new_mult /= mean_m
        # Damped update: keep 70% of the previous estimate.
        multipliers = 0.7 * multipliers + 0.3 * new_mult

    if verbose:
        print(f"\n    HPC layer multipliers (first 8): "
              f"{' '.join(f'{m:.3f}' for m in multipliers[:8])}...")
        print(f"    Range: [{np.min(multipliers):.3f}, {np.max(multipliers):.3f}]")

    adjusted = {}
    for name, (sum_x2, count) in importance_dict.items():
        idx = layer_of[name]
        if idx is None:
            adjusted[name] = (sum_x2, count)
        else:
            adjusted[name] = (sum_x2 * multipliers[idx], count)

    return adjusted
1515
-
1516
-
1517
- # ─── iMatrix Output Writer ──────────────────────────────────────────────────
1518
-
1519
def write_imatrix(path, importance_dict):
    """Write a llama.cpp-compatible legacy binary imatrix file.

    Layout (all integers are little-endian int32):
        n_entries
        per entry, in sorted name order:
            name_len, name bytes (UTF-8),
            ncall  (sample count),
            nval   (number of values),
            nval float32 values

    llama.cpp's legacy reader expects `ncall` before `nval`, so the header
    fields are emitted in that order.

    Args:
        path: output file path.
        importance_dict: maps tensor name -> (sum_x2 array, count).

    Returns:
        The number of entries written.
    """
    entries = []
    for name, (sum_x2, count) in sorted(importance_dict.items()):
        # Force little-endian float32 and a flat layout so the byte stream
        # does not depend on the host or on the array's shape.
        values = np.ascontiguousarray(sum_x2, dtype='<f4').ravel()
        entries.append((name.encode('utf-8'), values, int(count)))

    with open(path, 'wb') as f:
        f.write(struct.pack('<i', len(entries)))
        for name_bytes, values, n_samples in entries:
            f.write(struct.pack('<i', len(name_bytes)))
            f.write(name_bytes)
            f.write(struct.pack('<i', n_samples))     # ncall
            f.write(struct.pack('<i', values.size))   # nval
            f.write(values.tobytes())

    return len(entries)
1537
-
1538
-
1539
def load_hf_config(config_path):
    """Load a HuggingFace config.json and extract architecture info.

    Maps HF keys to the internal config keys:
        model_type            -> arch
        hidden_size           -> n_embd
        num_hidden_layers     -> n_layers
        num_attention_heads   -> n_head
        num_key_value_heads   -> n_head_kv
        intermediate_size     -> n_ff
        vocab_size            -> vocab_size
        rms_norm_eps          -> rms_eps
        rope_theta            -> rope_base (also read from `rope_parameters`)
        num_local_experts / num_experts -> expert_count
        num_experts_per_tok   -> expert_used_count
        head_dim              -> head_dim

    When the top level has a `text_config` section and no `hidden_size`, the
    values are read from `text_config` instead.

    A key that is absent from config.json maps to `None` rather than to a
    placeholder such as 0. A caller that merges with `if v is not None` then
    keeps the value it already has instead of overwriting it with a default.
    `head_dim` is only present in the result when it is given explicitly or
    can be derived from `n_embd` and `n_head`.

    Args:
        config_path: path to a HuggingFace `config.json`.

    Returns:
        dict with the internal keys listed above.
    """
    import json
    with open(config_path, 'r', encoding='utf-8') as f:
        raw = json.load(f)

    src = raw
    if 'text_config' in raw and 'hidden_size' not in raw:
        src = raw['text_config']

    cfg = {}
    cfg['arch'] = src.get('model_type', raw.get('model_type'))
    cfg['n_embd'] = src.get('hidden_size')
    cfg['n_layers'] = src.get('num_hidden_layers')
    cfg['n_head'] = src.get('num_attention_heads')
    cfg['n_head_kv'] = src.get('num_key_value_heads')
    cfg['n_ff'] = src.get('intermediate_size')
    cfg['vocab_size'] = src.get('vocab_size')
    cfg['rms_eps'] = src.get('rms_norm_eps')

    # `rope_parameters` may be missing or null; prefer it over the flat key.
    rope_params = src.get('rope_parameters') or {}
    cfg['rope_base'] = rope_params.get('rope_theta', src.get('rope_theta'))

    cfg['expert_count'] = src.get('num_local_experts', src.get('num_experts'))
    cfg['expert_used_count'] = src.get('num_experts_per_tok')

    # head_dim: explicit value wins, otherwise derive it when possible.
    if src.get('head_dim'):
        cfg['head_dim'] = src['head_dim']
    elif cfg['n_embd'] and cfg['n_head']:
        cfg['head_dim'] = cfg['n_embd'] // cfg['n_head']

    return cfg
1585
-
1586
-
1587
- # ─── Main ───────────────────────────────────────────────────────────────────
1588
-
1589
- def main():
1590
- import argparse
1591
- parser = argparse.ArgumentParser(
1592
- description='HExState iMatrix Generator — HPC-enhanced importance matrix from GGUF')
1593
- parser.add_argument('model', help='Input GGUF model file')
1594
- parser.add_argument('calibration', help='Calibration text file')
1595
- parser.add_argument('-o', '--output', default='imatrix.dat',
1596
- help='Output imatrix file (default: imatrix.dat)')
1597
- parser.add_argument('--config', help='Optional HuggingFace config.json')
1598
- parser.add_argument('--chunks', type=int, default=10,
1599
- help='Number of token chunks to process (default: 10)')
1600
- parser.add_argument('--chunk-size', type=int, default=4096,
1601
- help='Tokens per chunk (default: 4096)')
1602
- parser.add_argument('--no-hpc', action='store_true',
1603
- help='Disable HPC cross-layer propagation')
1604
- parser.add_argument('--quadratic-attn', action='store_true',
1605
- help='Use full O(seq²) attention instead of HPC-linearized O(seq)')
1606
- parser.add_argument('--verbose', action='store_true',
1607
- help='Per-layer statistics')
1608
- args = parser.parse_args()
1609
-
1610
- print()
1611
- print(" ╔════════════════════════════════════════════════════════════════╗")
1612
- print(" ║ HExState Importance Matrix Generator ║")
1613
- print(" ║ HPC-Enhanced E[x²] Collection from GGUF ║")
1614
- print(" ╚════════════════════════════════════════════════════════════════╝")
1615
- print()
1616
-
1617
- start_time = time.time()
1618
-
1619
- # ── Load model ──
1620
- print(f" Loading model: {args.model}")
1621
- model = GGUFModel(args.model)
1622
- config = model.get_config()
1623
-
1624
- # ── Load/Merge config.json ──
1625
- cfg_path = args.config
1626
- if not cfg_path:
1627
- # Auto-lookup in model directory
1628
- model_dir = os.path.dirname(os.path.abspath(args.model))
1629
- potential_cfg = os.path.join(model_dir, 'config.json')
1630
- if os.path.exists(potential_cfg):
1631
- cfg_path = potential_cfg
1632
-
1633
- if cfg_path:
1634
- print(f" Merging config from: {cfg_path}")
1635
- hf_cfg = load_hf_config(cfg_path)
1636
- # Override GGUF values with HF config values where they exist and are non-zero
1637
- for k, v in hf_cfg.items():
1638
- if v is not None:
1639
- config[k] = v
1640
-
1641
- print(f" Architecture: {config['arch']}")
1642
- print(f" Layers: {config['n_layers']}")
1643
- print(f" Hidden: {config['n_embd']}")
1644
- print(f" Heads: {config['n_head']} (KV: {config['n_head_kv']})")
1645
- print(f" FFN: {config['n_ff']}")
1646
- print(f" Vocab: {config['vocab_size']}")
1647
- print(f" Tensors: {len(model.tensor_infos)}")
1648
- print()
1649
-
1650
- # ── Load tokenizer ──
1651
- print(" Loading tokenizer from GGUF metadata...")
1652
- tokenizer = SimpleTokenizer(model)
1653
- print(f" Vocab size: {tokenizer.vocab_size}")
1654
- print()
1655
-
1656
- # ── Load calibration text ──
1657
- print(f" Loading calibration data: {args.calibration}")
1658
- with open(args.calibration, 'r', encoding='utf-8', errors='replace') as f:
1659
- cal_text = f.read()
1660
- print(f" Text length: {len(cal_text):,} chars")
1661
-
1662
- # ── Tokenize and chunk ──
1663
- print(f" Tokenizing ({args.chunk_size} tokens/chunk, {args.chunks} chunks max)...")
1664
- chunks = tokenizer.chunk_text(cal_text, args.chunk_size)
1665
- if len(chunks) > args.chunks:
1666
- chunks = chunks[:args.chunks]
1667
- print(f" Prepared {len(chunks)} chunks")
1668
- print()
1669
-
1670
- # ── Forward pass ──
1671
- print(" Running forward passes...")
1672
- use_linear = not args.quadratic_attn
1673
- runner = TransformerRunner(model, config, verbose=args.verbose, linear_attn=use_linear)
1674
- if use_linear:
1675
- print(f" Attention mode: HPC-linearized O(seq) — chunk_size={args.chunk_size}")
1676
- else:
1677
- print(f" Attention mode: full O(seq²) softmax — chunk_size={args.chunk_size}")
1678
-
1679
- for i, chunk in enumerate(chunks):
1680
- elapsed = time.time() - start_time
1681
- eta = elapsed / max(i, 1) * (len(chunks) - i) if i > 0 else 0
1682
- pct = (i + 1) / len(chunks) * 100
1683
- bw = 40
1684
- filled = int(bw * (i + 1) / len(chunks))
1685
- bar = '█' * filled + '░' * (bw - filled)
1686
- sys.stdout.write(
1687
- f"\r [{bar}] {pct:5.1f}% ({i+1}/{len(chunks)}) "
1688
- f"{elapsed:.0f}s ETA:{eta:.0f}s")
1689
- sys.stdout.flush()
1690
-
1691
- try:
1692
- runner.forward(chunk)
1693
- except Exception as e:
1694
- print(f"\n WARNING: Chunk {i} failed: {e}")
1695
- continue
1696
-
1697
- print(f"\n Collected importance for {len(runner.importance)} tensors")
1698
- print()
1699
-
1700
- # ── HPC propagation ──
1701
- if not args.no_hpc:
1702
- print(" Running HPC cross-layer importance propagation...")
1703
- importance = hpc_propagate_importance(
1704
- runner.importance, config['n_layers'], verbose=args.verbose)
1705
- else:
1706
- importance = runner.importance
1707
-
1708
- # ── Write output ──
1709
- print(f"\n Writing imatrix: {args.output}")
1710
- n_entries = write_imatrix(args.output, importance)
1711
-
1712
- elapsed = time.time() - start_time
1713
- out_size = os.path.getsize(args.output)
1714
-
1715
- print()
1716
- print(" ╔════════════════════════════════════════════════════════════════╗")
1717
- print(" ║ IMATRIX GENERATION COMPLETE ║")
1718
- print(" ╠════════════════════════════════════════════════════════════════╣")
1719
- print(f" ║ Tensor entries: {n_entries:<42d} ║")
1720
- print(f" ║ Chunks processed: {len(chunks):<42d} ║")
1721
- print(f" ║ Output size: {out_size:>11,} bytes ({out_size/1024:.1f} KB)"
1722
- f"{' '*(25-len(f'{out_size/1024:.1f}'))}║")
1723
- print(f" ║ Total time: {elapsed:>38.1f} sec ║")
1724
- print(" ╚════════════════════════════════════════════════════════════════╝")
1725
- print()
1726
- print(f" Output: {args.output}")
1727
- print()
1728
-
1729
- model.close()
1730
-
1731
-
1732
- if __name__ == '__main__':  # Entry-point guard: call main() only when this file is run as a script, not when it is imported.
1733
- main()