OpenTransformer committed on
Commit
18b3e9e
·
verified ·
1 Parent(s): 421314d

Harvest fused QKV projection from n1

Browse files
AGILLM-4.md CHANGED
@@ -167,6 +167,20 @@ python /workspace/agillm-4/verify_m_fold_agillm4.py \
167
  --backends manual,sdpa
168
  ```
169
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
  See `N1_HARVEST.md` for the staged port order.
171
 
172
  ## Intelligence per FLOP
 
167
  --backends manual,sdpa
168
  ```
169
 
170
+ Second harvested feature: fused QKV projection. AGILLM-4 now uses one
171
+ `Linear(d, 3d)` and chunks the result instead of running three separate
172
+ `Linear(d, d)` projections. Legacy `q.weight/k.weight/v.weight` checkpoints
173
+ load into the fused `qkv.weight` layout, and matching legacy AdamW moments are
174
+ remapped on full resume.
175
+
176
+ Verification:
177
+
178
+ ```bash
179
+ python /workspace/agillm-4/verify_qkv_agillm4.py \
180
+ --presets pico_1x,micro_3x \
181
+ --backends manual,sdpa,sublinear
182
+ ```
183
+
184
  See `N1_HARVEST.md` for the staged port order.
185
 
186
  ## Intelligence per FLOP
N1_HARVEST.md CHANGED
@@ -34,15 +34,43 @@ python agillm-4/verify_m_fold_agillm4.py \
34
  The verifier checks forward output, loss, input gradients, parameter gradients,
35
  cached append equivalence, cache key width, and metric-cache invalidation.
36
 
37
- ## Next Candidates
38
-
39
  ### 2. Fused QKV Projection
40
 
 
 
41
  n1 fuses separate `q/k/v` linear layers into one `qkv` linear while keeping
42
- checkpoint compatibility by folding old state-dict keys on load. This should be
43
- the next port after M-fold because it reduces three projection GEMMs to one.
 
 
 
 
 
 
44
 
45
- Risk: checkpoint key migration. Keep this separate from the M-fold port.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  ### 3. Combined ALiBi + Mask Cache
48
 
 
34
  The verifier checks forward output, loss, input gradients, parameter gradients,
35
  cached append equivalence, cache key width, and metric-cache invalidation.
36
 
 
 
37
  ### 2. Fused QKV Projection
38
 
39
+ Status: done.
40
+
41
  n1 fuses separate `q/k/v` linear layers into one `qkv` linear while keeping
42
+ checkpoint compatibility by folding old state-dict keys on load. AGILLM-4 now
43
+ does the same. The parameter count and function are unchanged:
44
+
45
+ ```text
46
+ [x Wq.T, x Wk.T, x Wv.T] == split(x [Wq; Wk; Wv].T)
47
+ ```
48
+
49
+ Checkpoint compatibility:
50
 
51
+ - legacy `*.q.weight`, `*.k.weight`, `*.v.weight` triples load into
52
+ `*.qkv.weight`
53
+ - warm-start shape filtering fuses legacy triples before filtering
54
+ - legacy AdamW q/k/v moment tensors are concatenated into qkv optimizer state
55
+ when a full resume can be proven to match the old parameter layout
56
+ - if optimizer remap cannot be proven, model weights still load and optimizer
57
+ state is reset with a warning
58
+
59
+ Verification:
60
+
61
+ ```bash
62
+ python agillm-4/verify_qkv_agillm4.py \
63
+ --presets pico_1x,micro_3x \
64
+ --backends manual,sdpa,sublinear \
65
+ --cached_len 8 \
66
+ --new_len 4
67
+ ```
68
+
69
+ The verifier checks fused-vs-unfused forward output, loss, input gradients,
70
+ parameter gradients, strict legacy state-dict loading, `_safe_load_any`
71
+ warm-start loading, and optimizer-state remap.
72
+
73
+ ## Next Candidates
74
 
75
  ### 3. Combined ALiBi + Mask Cache
76
 
README.md CHANGED
@@ -20,6 +20,7 @@ and extended for:
20
  - AR+SAT every step with sequential backward to reduce peak VRAM
21
  - SDPA and experimental sublinear local+landmark attention backends
22
  - exact M-fold expansion attention harvested from n1.py, with local verifier
 
23
  - profiling tools for memory, throughput, AR cost, SAT cost, and optimizer cost
24
  - synthetic long-context curriculum generation for recall and multi-hop tests
25
 
 
20
  - AR+SAT every step with sequential backward to reduce peak VRAM
21
  - SDPA and experimental sublinear local+landmark attention backends
22
  - exact M-fold expansion attention harvested from n1.py, with local verifier
23
+ - fused QKV projection harvested from n1.py, with legacy checkpoint loading
24
  - profiling tools for memory, throughput, AR cost, SAT cost, and optimizer cost
25
  - synthetic long-context curriculum generation for recall and multi-hop tests
26
 
local_sweep_after_qkv_sdpa.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "alloc_gb": 0.027,
4
+ "amp": false,
5
+ "attn_backend": "sdpa",
6
+ "batch_size": 1,
7
+ "block": 64,
8
+ "elapsed_s": 0.538,
9
+ "error": null,
10
+ "grad_checkpoint": true,
11
+ "loss": 18.4698,
12
+ "ok": true,
13
+ "peak_alloc_gb": 0.031,
14
+ "peak_reserved_gb": 0.057,
15
+ "reserved_gb": 0.057,
16
+ "sublinear_chunk": 128,
17
+ "sublinear_max_anchors": 256,
18
+ "sublinear_stride": 64,
19
+ "sublinear_window": 256,
20
+ "tokens_per_s_synthetic": 119.1
21
+ }
22
+ ]
local_sweep_after_qkv_sublinear.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "alloc_gb": 0.027,
4
+ "amp": false,
5
+ "attn_backend": "sublinear",
6
+ "batch_size": 1,
7
+ "block": 64,
8
+ "elapsed_s": 0.698,
9
+ "error": null,
10
+ "grad_checkpoint": true,
11
+ "loss": 18.251,
12
+ "ok": true,
13
+ "peak_alloc_gb": 0.031,
14
+ "peak_reserved_gb": 0.057,
15
+ "reserved_gb": 0.057,
16
+ "sublinear_chunk": 16,
17
+ "sublinear_max_anchors": 16,
18
+ "sublinear_stride": 8,
19
+ "sublinear_window": 16,
20
+ "tokens_per_s_synthetic": 91.7
21
+ }
22
+ ]
local_verify_m_fold_after_qkv_agillm4.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "backend": "manual",
4
+ "d": 32,
5
+ "dk": 16,
6
+ "expected_k_width": 16,
7
+ "heads": 2,
8
+ "ok": true,
9
+ "preset": "pico_1x",
10
+ "rank": 16,
11
+ "rows": {
12
+ "cache_k_width": 16.0,
13
+ "cache_v_width": 16.0,
14
+ "cached_append_forward": 5.960464477539063e-08,
15
+ "causal_alibi_forward": 0.0,
16
+ "causal_alibi_loss": 0.0,
17
+ "causal_alibi_param_grad": 0.0,
18
+ "causal_alibi_x_grad": 0.0,
19
+ "none_forward": 0.0,
20
+ "none_loss": 0.0,
21
+ "none_param_grad": 0.0,
22
+ "none_x_grad": 0.0,
23
+ "sat_alibi_forward": 0.0,
24
+ "sat_alibi_loss": 0.0,
25
+ "sat_alibi_param_grad": 0.0,
26
+ "sat_alibi_x_grad": 0.0
27
+ },
28
+ "tol": 0.0002
29
+ },
30
+ {
31
+ "backend": "sdpa",
32
+ "d": 32,
33
+ "dk": 16,
34
+ "expected_k_width": 16,
35
+ "heads": 2,
36
+ "ok": true,
37
+ "preset": "pico_1x",
38
+ "rank": 16,
39
+ "rows": {
40
+ "cache_k_width": 16.0,
41
+ "cache_v_width": 16.0,
42
+ "cached_append_forward": 5.960464477539063e-08,
43
+ "causal_alibi_forward": 7.450580596923828e-08,
44
+ "causal_alibi_loss": 0.0,
45
+ "causal_alibi_param_grad": 1.862645149230957e-09,
46
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
47
+ "none_forward": 8.940696716308594e-08,
48
+ "none_loss": 0.0,
49
+ "none_param_grad": 9.313225746154785e-10,
50
+ "none_x_grad": 8.731149137020111e-11,
51
+ "sat_alibi_forward": 1.1920928955078125e-07,
52
+ "sat_alibi_loss": 1.862645149230957e-09,
53
+ "sat_alibi_param_grad": 9.313225746154785e-10,
54
+ "sat_alibi_x_grad": 2.3283064365386963e-10
55
+ },
56
+ "tol": 0.0002
57
+ },
58
+ {
59
+ "backend": "manual",
60
+ "d": 128,
61
+ "dk": 16,
62
+ "expected_k_width": 16,
63
+ "heads": 8,
64
+ "ok": true,
65
+ "preset": "micro_3x",
66
+ "rank": 48,
67
+ "rows": {
68
+ "cache_k_width": 16.0,
69
+ "cache_v_width": 16.0,
70
+ "cached_append_forward": 6.51925802230835e-08,
71
+ "causal_alibi_forward": 5.960464477539063e-08,
72
+ "causal_alibi_loss": 0.0,
73
+ "causal_alibi_param_grad": 4.656612873077393e-10,
74
+ "causal_alibi_x_grad": 5.820766091346741e-11,
75
+ "metric_cache_cleared_on_train": 0.0,
76
+ "metric_cache_reused": 0.0,
77
+ "none_forward": 5.960464477539063e-08,
78
+ "none_loss": 0.0,
79
+ "none_param_grad": 2.3283064365386963e-10,
80
+ "none_x_grad": 1.4551915228366852e-11,
81
+ "sat_alibi_forward": 1.1920928955078125e-07,
82
+ "sat_alibi_loss": 1.862645149230957e-09,
83
+ "sat_alibi_param_grad": 5.820766091346741e-10,
84
+ "sat_alibi_x_grad": 5.820766091346741e-11
85
+ },
86
+ "tol": 0.0002
87
+ },
88
+ {
89
+ "backend": "sdpa",
90
+ "d": 128,
91
+ "dk": 16,
92
+ "expected_k_width": 16,
93
+ "heads": 8,
94
+ "ok": true,
95
+ "preset": "micro_3x",
96
+ "rank": 48,
97
+ "rows": {
98
+ "cache_k_width": 16.0,
99
+ "cache_v_width": 16.0,
100
+ "cached_append_forward": 7.450580596923828e-08,
101
+ "causal_alibi_forward": 1.043081283569336e-07,
102
+ "causal_alibi_loss": 0.0,
103
+ "causal_alibi_param_grad": 4.656612873077393e-10,
104
+ "causal_alibi_x_grad": 8.731149137020111e-11,
105
+ "metric_cache_cleared_on_train": 0.0,
106
+ "metric_cache_reused": 0.0,
107
+ "none_forward": 8.940696716308594e-08,
108
+ "none_loss": 0.0,
109
+ "none_param_grad": 2.3283064365386963e-10,
110
+ "none_x_grad": 1.8189894035458565e-11,
111
+ "sat_alibi_forward": 1.1920928955078125e-07,
112
+ "sat_alibi_loss": 0.0,
113
+ "sat_alibi_param_grad": 6.984919309616089e-10,
114
+ "sat_alibi_x_grad": 7.275957614183426e-11
115
+ },
116
+ "tol": 0.0002
117
+ }
118
+ ]
local_verify_m_fold_after_qkv_fix_agillm4.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "backend": "manual",
4
+ "d": 32,
5
+ "dk": 16,
6
+ "expected_k_width": 16,
7
+ "heads": 2,
8
+ "ok": true,
9
+ "preset": "pico_1x",
10
+ "rank": 16,
11
+ "rows": {
12
+ "cache_k_width": 16.0,
13
+ "cache_v_width": 16.0,
14
+ "cached_append_forward": 5.960464477539063e-08,
15
+ "causal_alibi_forward": 0.0,
16
+ "causal_alibi_loss": 0.0,
17
+ "causal_alibi_param_grad": 0.0,
18
+ "causal_alibi_x_grad": 0.0,
19
+ "none_forward": 0.0,
20
+ "none_loss": 0.0,
21
+ "none_param_grad": 0.0,
22
+ "none_x_grad": 0.0,
23
+ "sat_alibi_forward": 0.0,
24
+ "sat_alibi_loss": 0.0,
25
+ "sat_alibi_param_grad": 0.0,
26
+ "sat_alibi_x_grad": 0.0
27
+ },
28
+ "tol": 0.0002
29
+ },
30
+ {
31
+ "backend": "sdpa",
32
+ "d": 32,
33
+ "dk": 16,
34
+ "expected_k_width": 16,
35
+ "heads": 2,
36
+ "ok": true,
37
+ "preset": "pico_1x",
38
+ "rank": 16,
39
+ "rows": {
40
+ "cache_k_width": 16.0,
41
+ "cache_v_width": 16.0,
42
+ "cached_append_forward": 5.960464477539063e-08,
43
+ "causal_alibi_forward": 7.450580596923828e-08,
44
+ "causal_alibi_loss": 0.0,
45
+ "causal_alibi_param_grad": 1.862645149230957e-09,
46
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
47
+ "none_forward": 8.940696716308594e-08,
48
+ "none_loss": 0.0,
49
+ "none_param_grad": 9.313225746154785e-10,
50
+ "none_x_grad": 8.731149137020111e-11,
51
+ "sat_alibi_forward": 1.1920928955078125e-07,
52
+ "sat_alibi_loss": 1.862645149230957e-09,
53
+ "sat_alibi_param_grad": 9.313225746154785e-10,
54
+ "sat_alibi_x_grad": 2.3283064365386963e-10
55
+ },
56
+ "tol": 0.0002
57
+ },
58
+ {
59
+ "backend": "manual",
60
+ "d": 128,
61
+ "dk": 16,
62
+ "expected_k_width": 16,
63
+ "heads": 8,
64
+ "ok": true,
65
+ "preset": "micro_3x",
66
+ "rank": 48,
67
+ "rows": {
68
+ "cache_k_width": 16.0,
69
+ "cache_v_width": 16.0,
70
+ "cached_append_forward": 6.51925802230835e-08,
71
+ "causal_alibi_forward": 5.960464477539063e-08,
72
+ "causal_alibi_loss": 0.0,
73
+ "causal_alibi_param_grad": 4.656612873077393e-10,
74
+ "causal_alibi_x_grad": 5.820766091346741e-11,
75
+ "metric_cache_cleared_on_train": 0.0,
76
+ "metric_cache_reused": 0.0,
77
+ "none_forward": 5.960464477539063e-08,
78
+ "none_loss": 0.0,
79
+ "none_param_grad": 2.3283064365386963e-10,
80
+ "none_x_grad": 1.4551915228366852e-11,
81
+ "sat_alibi_forward": 1.1920928955078125e-07,
82
+ "sat_alibi_loss": 1.862645149230957e-09,
83
+ "sat_alibi_param_grad": 5.820766091346741e-10,
84
+ "sat_alibi_x_grad": 5.820766091346741e-11
85
+ },
86
+ "tol": 0.0002
87
+ },
88
+ {
89
+ "backend": "sdpa",
90
+ "d": 128,
91
+ "dk": 16,
92
+ "expected_k_width": 16,
93
+ "heads": 8,
94
+ "ok": true,
95
+ "preset": "micro_3x",
96
+ "rank": 48,
97
+ "rows": {
98
+ "cache_k_width": 16.0,
99
+ "cache_v_width": 16.0,
100
+ "cached_append_forward": 7.450580596923828e-08,
101
+ "causal_alibi_forward": 1.043081283569336e-07,
102
+ "causal_alibi_loss": 0.0,
103
+ "causal_alibi_param_grad": 4.656612873077393e-10,
104
+ "causal_alibi_x_grad": 8.731149137020111e-11,
105
+ "metric_cache_cleared_on_train": 0.0,
106
+ "metric_cache_reused": 0.0,
107
+ "none_forward": 8.940696716308594e-08,
108
+ "none_loss": 0.0,
109
+ "none_param_grad": 2.3283064365386963e-10,
110
+ "none_x_grad": 1.8189894035458565e-11,
111
+ "sat_alibi_forward": 1.1920928955078125e-07,
112
+ "sat_alibi_loss": 0.0,
113
+ "sat_alibi_param_grad": 6.984919309616089e-10,
114
+ "sat_alibi_x_grad": 7.275957614183426e-11
115
+ },
116
+ "tol": 0.0002
117
+ }
118
+ ]
local_verify_qkv_agillm4.json ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "backend": "manual",
4
+ "d": 32,
5
+ "dk": 16,
6
+ "heads": 2,
7
+ "ok": true,
8
+ "preset": "pico_1x",
9
+ "rank": 16,
10
+ "rows": {
11
+ "causal_alibi_forward": 0.0,
12
+ "causal_alibi_loss": 0.0,
13
+ "causal_alibi_param_grad": 0.0,
14
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
15
+ "legacy_load_forward": 0.0,
16
+ "legacy_load_missing_unexpected": 0.0,
17
+ "legacy_load_qkv_weight": 0.0,
18
+ "none_forward": 0.0,
19
+ "none_loss": 0.0,
20
+ "none_param_grad": 0.0,
21
+ "none_x_grad": 5.820766091346741e-11,
22
+ "optimizer_remap": 0.0,
23
+ "safe_load_any_loaded": 0.0,
24
+ "safe_load_any_qkv": 0.0,
25
+ "sat_alibi_forward": 0.0,
26
+ "sat_alibi_loss": 0.0,
27
+ "sat_alibi_param_grad": 0.0,
28
+ "sat_alibi_x_grad": 2.3283064365386963e-10
29
+ },
30
+ "tol": 0.0002
31
+ },
32
+ {
33
+ "backend": "sdpa",
34
+ "d": 32,
35
+ "dk": 16,
36
+ "heads": 2,
37
+ "ok": true,
38
+ "preset": "pico_1x",
39
+ "rank": 16,
40
+ "rows": {
41
+ "causal_alibi_forward": 0.0,
42
+ "causal_alibi_loss": 0.0,
43
+ "causal_alibi_param_grad": 0.0,
44
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
45
+ "legacy_load_forward": 0.0,
46
+ "legacy_load_missing_unexpected": 0.0,
47
+ "legacy_load_qkv_weight": 0.0,
48
+ "none_forward": 0.0,
49
+ "none_loss": 0.0,
50
+ "none_param_grad": 0.0,
51
+ "none_x_grad": 5.820766091346741e-11,
52
+ "safe_load_any_loaded": 0.0,
53
+ "safe_load_any_qkv": 0.0,
54
+ "sat_alibi_forward": 0.0,
55
+ "sat_alibi_loss": 0.0,
56
+ "sat_alibi_param_grad": 0.0,
57
+ "sat_alibi_x_grad": 2.3283064365386963e-10
58
+ },
59
+ "tol": 0.0002
60
+ },
61
+ {
62
+ "backend": "manual",
63
+ "d": 128,
64
+ "dk": 16,
65
+ "heads": 8,
66
+ "ok": true,
67
+ "preset": "micro_3x",
68
+ "rank": 48,
69
+ "rows": {
70
+ "causal_alibi_forward": 0.0,
71
+ "causal_alibi_loss": 0.0,
72
+ "causal_alibi_param_grad": 0.0,
73
+ "causal_alibi_x_grad": 5.820766091346741e-11,
74
+ "legacy_load_forward": 0.0,
75
+ "legacy_load_missing_unexpected": 0.0,
76
+ "legacy_load_qkv_weight": 0.0,
77
+ "none_forward": 0.0,
78
+ "none_loss": 0.0,
79
+ "none_param_grad": 0.0,
80
+ "none_x_grad": 1.4551915228366852e-11,
81
+ "safe_load_any_loaded": 0.0,
82
+ "safe_load_any_qkv": 0.0,
83
+ "sat_alibi_forward": 0.0,
84
+ "sat_alibi_loss": 0.0,
85
+ "sat_alibi_param_grad": 0.0,
86
+ "sat_alibi_x_grad": 5.820766091346741e-11
87
+ },
88
+ "tol": 0.0002
89
+ },
90
+ {
91
+ "backend": "sdpa",
92
+ "d": 128,
93
+ "dk": 16,
94
+ "heads": 8,
95
+ "ok": true,
96
+ "preset": "micro_3x",
97
+ "rank": 48,
98
+ "rows": {
99
+ "causal_alibi_forward": 0.0,
100
+ "causal_alibi_loss": 0.0,
101
+ "causal_alibi_param_grad": 0.0,
102
+ "causal_alibi_x_grad": 5.820766091346741e-11,
103
+ "legacy_load_forward": 0.0,
104
+ "legacy_load_missing_unexpected": 0.0,
105
+ "legacy_load_qkv_weight": 0.0,
106
+ "none_forward": 0.0,
107
+ "none_loss": 0.0,
108
+ "none_param_grad": 0.0,
109
+ "none_x_grad": 1.4551915228366852e-11,
110
+ "safe_load_any_loaded": 0.0,
111
+ "safe_load_any_qkv": 0.0,
112
+ "sat_alibi_forward": 0.0,
113
+ "sat_alibi_loss": 0.0,
114
+ "sat_alibi_param_grad": 0.0,
115
+ "sat_alibi_x_grad": 5.820766091346741e-11
116
+ },
117
+ "tol": 0.0002
118
+ }
119
+ ]
local_verify_qkv_all_backends_agillm4.json ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "backend": "manual",
4
+ "d": 32,
5
+ "dk": 16,
6
+ "heads": 2,
7
+ "ok": true,
8
+ "preset": "pico_1x",
9
+ "rank": 16,
10
+ "rows": {
11
+ "causal_alibi_forward": 0.0,
12
+ "causal_alibi_loss": 0.0,
13
+ "causal_alibi_param_grad": 0.0,
14
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
15
+ "legacy_load_forward": 0.0,
16
+ "legacy_load_missing_unexpected": 0.0,
17
+ "legacy_load_qkv_weight": 0.0,
18
+ "none_forward": 0.0,
19
+ "none_loss": 0.0,
20
+ "none_param_grad": 0.0,
21
+ "none_x_grad": 5.820766091346741e-11,
22
+ "optimizer_remap": 0.0,
23
+ "safe_load_any_loaded": 0.0,
24
+ "safe_load_any_qkv": 0.0,
25
+ "sat_alibi_forward": 0.0,
26
+ "sat_alibi_loss": 0.0,
27
+ "sat_alibi_param_grad": 0.0,
28
+ "sat_alibi_x_grad": 2.3283064365386963e-10
29
+ },
30
+ "tol": 0.0002
31
+ },
32
+ {
33
+ "backend": "sdpa",
34
+ "d": 32,
35
+ "dk": 16,
36
+ "heads": 2,
37
+ "ok": true,
38
+ "preset": "pico_1x",
39
+ "rank": 16,
40
+ "rows": {
41
+ "causal_alibi_forward": 0.0,
42
+ "causal_alibi_loss": 0.0,
43
+ "causal_alibi_param_grad": 0.0,
44
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
45
+ "legacy_load_forward": 0.0,
46
+ "legacy_load_missing_unexpected": 0.0,
47
+ "legacy_load_qkv_weight": 0.0,
48
+ "none_forward": 0.0,
49
+ "none_loss": 0.0,
50
+ "none_param_grad": 0.0,
51
+ "none_x_grad": 5.820766091346741e-11,
52
+ "safe_load_any_loaded": 0.0,
53
+ "safe_load_any_qkv": 0.0,
54
+ "sat_alibi_forward": 0.0,
55
+ "sat_alibi_loss": 0.0,
56
+ "sat_alibi_param_grad": 0.0,
57
+ "sat_alibi_x_grad": 2.3283064365386963e-10
58
+ },
59
+ "tol": 0.0002
60
+ },
61
+ {
62
+ "backend": "sublinear",
63
+ "d": 32,
64
+ "dk": 16,
65
+ "heads": 2,
66
+ "ok": true,
67
+ "preset": "pico_1x",
68
+ "rank": 16,
69
+ "rows": {
70
+ "causal_alibi_forward": 0.0,
71
+ "causal_alibi_loss": 0.0,
72
+ "causal_alibi_param_grad": 0.0,
73
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
74
+ "legacy_load_forward": 0.0,
75
+ "legacy_load_missing_unexpected": 0.0,
76
+ "legacy_load_qkv_weight": 0.0,
77
+ "none_forward": 0.0,
78
+ "none_loss": 0.0,
79
+ "none_param_grad": 0.0,
80
+ "none_x_grad": 5.820766091346741e-11,
81
+ "safe_load_any_loaded": 0.0,
82
+ "safe_load_any_qkv": 0.0,
83
+ "sat_alibi_forward": 0.0,
84
+ "sat_alibi_loss": 0.0,
85
+ "sat_alibi_param_grad": 0.0,
86
+ "sat_alibi_x_grad": 2.3283064365386963e-10
87
+ },
88
+ "tol": 0.0002
89
+ },
90
+ {
91
+ "backend": "manual",
92
+ "d": 128,
93
+ "dk": 16,
94
+ "heads": 8,
95
+ "ok": true,
96
+ "preset": "micro_3x",
97
+ "rank": 48,
98
+ "rows": {
99
+ "causal_alibi_forward": 0.0,
100
+ "causal_alibi_loss": 0.0,
101
+ "causal_alibi_param_grad": 0.0,
102
+ "causal_alibi_x_grad": 5.820766091346741e-11,
103
+ "legacy_load_forward": 0.0,
104
+ "legacy_load_missing_unexpected": 0.0,
105
+ "legacy_load_qkv_weight": 0.0,
106
+ "none_forward": 0.0,
107
+ "none_loss": 0.0,
108
+ "none_param_grad": 0.0,
109
+ "none_x_grad": 1.4551915228366852e-11,
110
+ "safe_load_any_loaded": 0.0,
111
+ "safe_load_any_qkv": 0.0,
112
+ "sat_alibi_forward": 0.0,
113
+ "sat_alibi_loss": 0.0,
114
+ "sat_alibi_param_grad": 0.0,
115
+ "sat_alibi_x_grad": 5.820766091346741e-11
116
+ },
117
+ "tol": 0.0002
118
+ },
119
+ {
120
+ "backend": "sdpa",
121
+ "d": 128,
122
+ "dk": 16,
123
+ "heads": 8,
124
+ "ok": true,
125
+ "preset": "micro_3x",
126
+ "rank": 48,
127
+ "rows": {
128
+ "causal_alibi_forward": 0.0,
129
+ "causal_alibi_loss": 0.0,
130
+ "causal_alibi_param_grad": 0.0,
131
+ "causal_alibi_x_grad": 5.820766091346741e-11,
132
+ "legacy_load_forward": 0.0,
133
+ "legacy_load_missing_unexpected": 0.0,
134
+ "legacy_load_qkv_weight": 0.0,
135
+ "none_forward": 0.0,
136
+ "none_loss": 0.0,
137
+ "none_param_grad": 0.0,
138
+ "none_x_grad": 1.4551915228366852e-11,
139
+ "safe_load_any_loaded": 0.0,
140
+ "safe_load_any_qkv": 0.0,
141
+ "sat_alibi_forward": 0.0,
142
+ "sat_alibi_loss": 0.0,
143
+ "sat_alibi_param_grad": 0.0,
144
+ "sat_alibi_x_grad": 5.820766091346741e-11
145
+ },
146
+ "tol": 0.0002
147
+ },
148
+ {
149
+ "backend": "sublinear",
150
+ "d": 128,
151
+ "dk": 16,
152
+ "heads": 8,
153
+ "ok": true,
154
+ "preset": "micro_3x",
155
+ "rank": 48,
156
+ "rows": {
157
+ "causal_alibi_forward": 0.0,
158
+ "causal_alibi_loss": 0.0,
159
+ "causal_alibi_param_grad": 0.0,
160
+ "causal_alibi_x_grad": 5.820766091346741e-11,
161
+ "legacy_load_forward": 0.0,
162
+ "legacy_load_missing_unexpected": 0.0,
163
+ "legacy_load_qkv_weight": 0.0,
164
+ "none_forward": 0.0,
165
+ "none_loss": 0.0,
166
+ "none_param_grad": 0.0,
167
+ "none_x_grad": 1.8189894035458565e-11,
168
+ "safe_load_any_loaded": 0.0,
169
+ "safe_load_any_qkv": 0.0,
170
+ "sat_alibi_forward": 0.0,
171
+ "sat_alibi_loss": 0.0,
172
+ "sat_alibi_param_grad": 0.0,
173
+ "sat_alibi_x_grad": 5.820766091346741e-11
174
+ },
175
+ "tol": 0.0002
176
+ }
177
+ ]
local_verify_qkv_sublinear_agillm4.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "backend": "sublinear",
4
+ "d": 32,
5
+ "dk": 16,
6
+ "heads": 2,
7
+ "ok": true,
8
+ "preset": "pico_1x",
9
+ "rank": 16,
10
+ "rows": {
11
+ "causal_alibi_forward": 0.0,
12
+ "causal_alibi_loss": 0.0,
13
+ "causal_alibi_param_grad": 0.0,
14
+ "causal_alibi_x_grad": 2.3283064365386963e-10,
15
+ "legacy_load_forward": 0.0,
16
+ "legacy_load_missing_unexpected": 0.0,
17
+ "legacy_load_qkv_weight": 0.0,
18
+ "none_forward": 0.0,
19
+ "none_loss": 0.0,
20
+ "none_param_grad": 0.0,
21
+ "none_x_grad": 5.820766091346741e-11,
22
+ "safe_load_any_loaded": 0.0,
23
+ "safe_load_any_qkv": 0.0,
24
+ "sat_alibi_forward": 0.0,
25
+ "sat_alibi_loss": 0.0,
26
+ "sat_alibi_param_grad": 0.0,
27
+ "sat_alibi_x_grad": 2.3283064365386963e-10
28
+ },
29
+ "tol": 0.0002
30
+ }
31
+ ]
nB300_agillm4.py CHANGED
@@ -4,7 +4,7 @@
4
  # Enhanced inference: checkpoint name, tok/s, UK time
5
 
6
  from __future__ import annotations
7
- import argparse, json, math, pathlib, random, time, os, sys, threading, hashlib, re, subprocess
8
  from pathlib import Path
9
  from contextlib import nullcontext
10
  from typing import Dict, Any, List, Optional, Tuple
@@ -1147,9 +1147,10 @@ class TuneableAttentionMHA(nn.Module):
1147
  self.sublinear_stride = max(0, int(sublinear_stride))
1148
  self.sublinear_max_anchors = max(0, int(sublinear_max_anchors))
1149
  self.sublinear_chunk = max(1, int(sublinear_chunk))
1150
- self.q = nn.Linear(d, d, bias=False)
1151
- self.k = nn.Linear(d, d, bias=False)
1152
- self.v = nn.Linear(d, d, bias=False)
 
1153
  self.U = nn.Parameter(torch.randn(self.dk, r))
1154
  nn.init.orthogonal_(self.U)
1155
  self.proj = nn.Linear(h * self.dk, d, bias=False)
@@ -1164,6 +1165,25 @@ class TuneableAttentionMHA(nn.Module):
1164
  self._metric_cache_data_ptr: int = -1
1165
  self._metric_cache_shape: Tuple[int, int] = (-1, -1)
1166
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1167
  def _proj_qk(self, x):
1168
  B, N, _ = x.shape
1169
  return (x.view(B, N, self.h, self.dk).transpose(1, 2) @ self.U)
@@ -1219,9 +1239,10 @@ class TuneableAttentionMHA(nn.Module):
1219
  outputs = []
1220
  scale = 1.0 / math.sqrt(self.dk)
1221
 
1222
- if self.sublinear_stride > 0 and self.sublinear_max_anchors > 0:
 
1223
  anchors = torch.arange(
1224
- self.sublinear_stride - 1,
1225
  k_len,
1226
  self.sublinear_stride,
1227
  device=device,
@@ -1273,9 +1294,8 @@ class TuneableAttentionMHA(nn.Module):
1273
  return torch.cat(outputs, dim=2)
1274
 
1275
  def forward(self, x, mask=None, rel_bias_tokens=None, kv_cache=None, use_cache=False):
1276
- q_lin = self.q(x)
1277
- k_lin = self.k(x)
1278
- v_new = self._reshape_v(self.v(x))
1279
  if self.r > self.dk:
1280
  q = self._reshape_heads(q_lin) @ self._get_metric()
1281
  k_new = self._reshape_heads(k_lin)
@@ -1498,6 +1518,141 @@ def _strip_orig_mod_prefix(state: dict) -> dict:
1498
  for k, v in state.items()
1499
  }
1500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1501
  def save_delta(core, ar_h, sat_h, step: int, seen_tok: int, save_dir: pathlib.Path, phase_name: str):
1502
  """Save weight-only delta in background thread. Non-blocking."""
1503
  global _delta_thread
@@ -1552,7 +1707,7 @@ def load_delta(path: pathlib.Path, core, ar_h, sat_h):
1552
  ck = torch.load(path, map_location="cpu", weights_only=False)
1553
  if not ck.get("delta"):
1554
  raise ValueError(f"{path.name} is not a delta checkpoint")
1555
- core.load_state_dict(_strip_orig_mod_prefix(ck["weights"]["core"]))
1556
  ar_h.load_state_dict(_strip_orig_mod_prefix(ck["weights"]["ar"]))
1557
  sat_h.load_state_dict(_strip_orig_mod_prefix(ck["weights"]["sat"]))
1558
  return ck.get("step", 0), ck.get("seen_tok", 0)
@@ -1586,11 +1741,25 @@ def load_ckpt(path, core, ar_h, sat_h, opt, scaler):
1586
  p = _resolve_ckpt(path) or path
1587
  ck = _try_load(p, map_location="cpu")
1588
  if ck is None: raise FileNotFoundError(f"No valid checkpoint at {p}")
1589
- core.load_state_dict(_strip_orig_mod_prefix(ck["core"]))
1590
  ar_h.load_state_dict(_strip_orig_mod_prefix(ck["ar"]))
1591
  sat_h.load_state_dict(_strip_orig_mod_prefix(ck["sat"]))
1592
- opt.load_state_dict(ck["opt"])
1593
- scaler.load_state_dict(ck["scaler"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1594
  # Restore tokenizer from checkpoint if available
1595
  if "tokenizer_json" in ck:
1596
  try:
@@ -1614,8 +1783,11 @@ def _safe_load_any(path: pathlib.Path, tgt: nn.Module, key: str | None = None):
1614
  sd = ck.get(key, ck) if key else ck
1615
  if isinstance(sd, dict) and "state_dict" in sd: sd = sd["state_dict"]
1616
  sd = _strip_orig_mod_prefix(sd)
 
 
 
1617
  tgt_sd = tgt.state_dict()
1618
- filt = {k: v for k, v in sd.items() if k in tgt_sd and v.shape == tgt_sd[k].shape}
1619
  if filt: tgt.load_state_dict(filt, strict=False)
1620
  return len(filt)
1621
 
 
4
  # Enhanced inference: checkpoint name, tok/s, UK time
5
 
6
  from __future__ import annotations
7
+ import argparse, copy, json, math, pathlib, random, time, os, sys, threading, hashlib, re, subprocess
8
  from pathlib import Path
9
  from contextlib import nullcontext
10
  from typing import Dict, Any, List, Optional, Tuple
 
1147
  self.sublinear_stride = max(0, int(sublinear_stride))
1148
  self.sublinear_max_anchors = max(0, int(sublinear_max_anchors))
1149
  self.sublinear_chunk = max(1, int(sublinear_chunk))
1150
+ # Exact n1 harvest: one fused QKV projection is mathematically the same
1151
+ # as three independent bias-free Linear(d, d) projections with their
1152
+ # weights stacked along out_features.
1153
+ self.qkv = nn.Linear(d, 3 * d, bias=False)
1154
  self.U = nn.Parameter(torch.randn(self.dk, r))
1155
  nn.init.orthogonal_(self.U)
1156
  self.proj = nn.Linear(h * self.dk, d, bias=False)
 
1165
  self._metric_cache_data_ptr: int = -1
1166
  self._metric_cache_shape: Tuple[int, int] = (-1, -1)
1167
 
1168
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
1169
+ missing_keys, unexpected_keys, error_msgs):
1170
+ qkv_key = prefix + "qkv.weight"
1171
+ if qkv_key not in state_dict:
1172
+ qk = prefix + "q.weight"
1173
+ kk = prefix + "k.weight"
1174
+ vk = prefix + "v.weight"
1175
+ if qk in state_dict and kk in state_dict and vk in state_dict:
1176
+ fused = _cat_legacy_weight_blocks([state_dict[qk], state_dict[kk], state_dict[vk]])
1177
+ if fused is not None:
1178
+ state_dict[qkv_key] = fused
1179
+ state_dict.pop(qk)
1180
+ state_dict.pop(kk)
1181
+ state_dict.pop(vk)
1182
+ return super()._load_from_state_dict(
1183
+ state_dict, prefix, local_metadata, strict,
1184
+ missing_keys, unexpected_keys, error_msgs,
1185
+ )
1186
+
1187
  def _proj_qk(self, x):
1188
  B, N, _ = x.shape
1189
  return (x.view(B, N, self.h, self.dk).transpose(1, 2) @ self.U)
 
1239
  outputs = []
1240
  scale = 1.0 / math.sqrt(self.dk)
1241
 
1242
+ anchor_start = self.sublinear_stride - 1
1243
+ if self.sublinear_stride > 0 and self.sublinear_max_anchors > 0 and anchor_start < k_len:
1244
  anchors = torch.arange(
1245
+ anchor_start,
1246
  k_len,
1247
  self.sublinear_stride,
1248
  device=device,
 
1294
  return torch.cat(outputs, dim=2)
1295
 
1296
  def forward(self, x, mask=None, rel_bias_tokens=None, kv_cache=None, use_cache=False):
1297
+ q_lin, k_lin, v_lin = self.qkv(x).chunk(3, dim=-1)
1298
+ v_new = self._reshape_v(v_lin)
 
1299
  if self.r > self.dk:
1300
  q = self._reshape_heads(q_lin) @ self._get_metric()
1301
  k_new = self._reshape_heads(k_lin)
 
1518
  for k, v in state.items()
1519
  }
1520
 
1521
+ def _cat_legacy_weight_blocks(blocks: list) -> Optional[torch.Tensor]:
1522
+ if not blocks or not all(torch.is_tensor(t) for t in blocks):
1523
+ return None
1524
+ first = blocks[0]
1525
+ tail_shape = tuple(first.shape[1:])
1526
+ if any(t.dtype != first.dtype or t.device != first.device for t in blocks):
1527
+ return None
1528
+ if any(t.ndim != first.ndim or tuple(t.shape[1:]) != tail_shape for t in blocks):
1529
+ return None
1530
+ return torch.cat(blocks, dim=0).contiguous()
1531
+
1532
+ def _fuse_qkv_in_state_dict(sd: dict) -> dict:
1533
+ """Fold legacy q/k/v.weight triples into qkv.weight before loading/filtering."""
1534
+ if not isinstance(sd, dict):
1535
+ return sd
1536
+ prefixes = set()
1537
+ for key in list(sd.keys()):
1538
+ for suffix in (".q.weight", ".k.weight", ".v.weight"):
1539
+ if isinstance(key, str) and key.endswith(suffix):
1540
+ prefixes.add(key[: -len(suffix)])
1541
+ for prefix in prefixes:
1542
+ qk, kk, vk = prefix + ".q.weight", prefix + ".k.weight", prefix + ".v.weight"
1543
+ fk = prefix + ".qkv.weight"
1544
+ if qk in sd and kk in sd and vk in sd and fk not in sd:
1545
+ fused = _cat_legacy_weight_blocks([sd[qk], sd[kk], sd[vk]])
1546
+ if fused is not None:
1547
+ sd[fk] = fused
1548
+ sd.pop(qk)
1549
+ sd.pop(kk)
1550
+ sd.pop(vk)
1551
+ return sd
1552
+
1553
+ def _split_qkv_in_state_dict_for_test(sd: dict) -> dict:
1554
+ out = dict(sd)
1555
+ for key in list(out.keys()):
1556
+ if not isinstance(key, str) or not key.endswith(".qkv.weight"):
1557
+ continue
1558
+ base = key[: -len(".qkv.weight")]
1559
+ q, k, v = out.pop(key).chunk(3, dim=0)
1560
+ out[base + ".q.weight"] = q.clone()
1561
+ out[base + ".k.weight"] = k.clone()
1562
+ out[base + ".v.weight"] = v.clone()
1563
+ return out
1564
+
1565
+ def _clone_opt_value(value):
1566
+ if torch.is_tensor(value):
1567
+ return value.detach().clone()
1568
+ return copy.deepcopy(value)
1569
+
1570
+ def _optimizer_param_name_lookup(core, ar_h, sat_h) -> dict[int, str]:
1571
+ out = {}
1572
+ for prefix, module in (("core", core), ("ar", ar_h), ("sat", sat_h)):
1573
+ for name, param in module.named_parameters():
1574
+ out.setdefault(id(param), f"{prefix}.{name}")
1575
+ return out
1576
+
1577
def _optimizer_group_param_names(opt, core, ar_h, sat_h) -> List[List[str]]:
    """Per optimizer param_group, the dotted names of its params in group order."""
    lookup = _optimizer_param_name_lookup(core, ar_h, sat_h)
    groups: List[List[str]] = []
    for group in opt.param_groups:
        names = []
        for param in group["params"]:
            # Unknown params keep a stable placeholder so alignment never shifts.
            names.append(lookup.get(id(param), f"<unknown:{id(param)}>"))
        groups.append(names)
    return groups
1583
+
1584
+ def _legacy_names_for_current_param(name: str) -> List[str]:
1585
+ if name.endswith(".qkv.weight"):
1586
+ base = name[: -len(".qkv.weight")]
1587
+ return [base + ".q.weight", base + ".k.weight", base + ".v.weight"]
1588
+ return [name]
1589
+
1590
+ def _fuse_legacy_optimizer_param_state(states: List[dict]) -> Optional[dict]:
1591
+ if len(states) < 2 or any(not isinstance(state, dict) for state in states):
1592
+ return None
1593
+ common = set(states[0])
1594
+ for state in states[1:]:
1595
+ common &= set(state)
1596
+ out = {}
1597
+ for key in common:
1598
+ vals = [state[key] for state in states]
1599
+ if all(torch.is_tensor(v) for v in vals):
1600
+ shape = vals[0].shape
1601
+ if vals[0].ndim > 0 and all(v.shape == shape for v in vals[1:]):
1602
+ out[key] = torch.cat([v.detach().clone() for v in vals], dim=0).contiguous()
1603
+ else:
1604
+ out[key] = vals[0].detach().clone()
1605
+ else:
1606
+ out[key] = copy.deepcopy(vals[0])
1607
+ return out
1608
+
1609
def _fuse_legacy_qkv_optimizer_state(opt_state: dict, opt, core, ar_h, sat_h) -> Optional[dict]:
    """Remap pre-QKV-fusion AdamW state to the current fused parameter layout.

    The legacy optimizer enumerated q, k, v as three separate parameters; the
    current one has a single qkv parameter per attention module. This aligns
    the legacy state by *position*: current param names are expanded into their
    legacy equivalents, which must match the legacy groups one-to-one.
    Returns a state_dict loadable by ``opt.load_state_dict`` or None when the
    layouts cannot be aligned.
    """
    if not isinstance(opt_state, dict) or "state" not in opt_state or "param_groups" not in opt_state:
        return None
    current_sd = opt.state_dict()
    current_names = _optimizer_group_param_names(opt, core, ar_h, sat_h)
    # For each current group, the legacy parameter names in legacy order
    # (each fused qkv name expands to three consecutive legacy names).
    legacy_names = [
        [legacy for name in group_names for legacy in _legacy_names_for_current_param(name)]
        for group_names in current_names
    ]
    if len(legacy_names) != len(opt_state.get("param_groups", [])):
        return None

    # Positional alignment: legacy param ids were dense indices, so zipping the
    # expanded names with the saved group's ids recovers name -> legacy pid.
    legacy_name_to_pid = {}
    for group_idx, names in enumerate(legacy_names):
        old_params = list(opt_state["param_groups"][group_idx].get("params", []))
        if len(names) != len(old_params):
            # Group sizes disagree -> this is not a pre-fusion checkpoint.
            return None
        for name, pid in zip(names, old_params):
            legacy_name_to_pid[name] = pid

    # Rebuild param_groups: keep legacy hyperparameters (lr, betas, ...) but
    # adopt the current param-id lists (and names, if the optimizer stores them).
    new_groups = []
    for group_idx, current_group in enumerate(current_sd["param_groups"]):
        new_group = copy.deepcopy(opt_state["param_groups"][group_idx])
        new_group["params"] = list(current_group["params"])
        if "param_names" in new_group:
            new_group["param_names"] = list(current_names[group_idx])
        new_groups.append(new_group)

    old_states = opt_state.get("state", {})
    new_states = {}
    for group_names, current_group in zip(current_names, current_sd["param_groups"]):
        for name, new_pid in zip(group_names, current_group["params"]):
            legacy_set = _legacy_names_for_current_param(name)
            if len(legacy_set) > 1:
                # Fused qkv param: merge the three legacy moment states.
                old_pids = [legacy_name_to_pid.get(legacy) for legacy in legacy_set]
                if all(pid in old_states for pid in old_pids):
                    fused = _fuse_legacy_optimizer_param_state([old_states[pid] for pid in old_pids])
                    if fused is not None:
                        new_states[new_pid] = fused
                        continue
                # Fall through: incomplete/unmergeable triple -> try a 1:1 copy
                # under the unexpanded name (usually absent, leaving no state).
            old_pid = legacy_name_to_pid.get(name)
            if old_pid in old_states:
                new_states[new_pid] = {key: _clone_opt_value(value) for key, value in old_states[old_pid].items()}

    return {"state": new_states, "param_groups": new_groups}
1655
+
1656
  def save_delta(core, ar_h, sat_h, step: int, seen_tok: int, save_dir: pathlib.Path, phase_name: str):
1657
  """Save weight-only delta in background thread. Non-blocking."""
1658
  global _delta_thread
 
1707
  ck = torch.load(path, map_location="cpu", weights_only=False)
1708
  if not ck.get("delta"):
1709
  raise ValueError(f"{path.name} is not a delta checkpoint")
1710
+ core.load_state_dict(_fuse_qkv_in_state_dict(_strip_orig_mod_prefix(ck["weights"]["core"])))
1711
  ar_h.load_state_dict(_strip_orig_mod_prefix(ck["weights"]["ar"]))
1712
  sat_h.load_state_dict(_strip_orig_mod_prefix(ck["weights"]["sat"]))
1713
  return ck.get("step", 0), ck.get("seen_tok", 0)
 
1741
  p = _resolve_ckpt(path) or path
1742
  ck = _try_load(p, map_location="cpu")
1743
  if ck is None: raise FileNotFoundError(f"No valid checkpoint at {p}")
1744
+ core.load_state_dict(_fuse_qkv_in_state_dict(_strip_orig_mod_prefix(ck["core"])))
1745
  ar_h.load_state_dict(_strip_orig_mod_prefix(ck["ar"]))
1746
  sat_h.load_state_dict(_strip_orig_mod_prefix(ck["sat"]))
1747
+ try:
1748
+ opt.load_state_dict(ck["opt"])
1749
+ except Exception as exc:
1750
+ fused_opt = _fuse_legacy_qkv_optimizer_state(ck.get("opt"), opt, core, ar_h, sat_h)
1751
+ if fused_opt is not None:
1752
+ try:
1753
+ opt.load_state_dict(fused_opt)
1754
+ print("[ckpt] Converted legacy q/k/v optimizer state to fused qkv layout")
1755
+ except Exception as exc2:
1756
+ print(f"[ckpt] WARNING: optimizer state incompatible; resetting optimizer ({type(exc).__name__}: {exc}; qkv remap failed: {type(exc2).__name__}: {exc2})")
1757
+ else:
1758
+ print(f"[ckpt] WARNING: optimizer state incompatible; resetting optimizer ({type(exc).__name__}: {exc})")
1759
+ try:
1760
+ scaler.load_state_dict(ck["scaler"])
1761
+ except Exception as exc:
1762
+ print(f"[ckpt] WARNING: scaler state incompatible; resetting scaler ({type(exc).__name__}: {exc})")
1763
  # Restore tokenizer from checkpoint if available
1764
  if "tokenizer_json" in ck:
1765
  try:
 
1783
  sd = ck.get(key, ck) if key else ck
1784
  if isinstance(sd, dict) and "state_dict" in sd: sd = sd["state_dict"]
1785
  sd = _strip_orig_mod_prefix(sd)
1786
+ sd = _fuse_qkv_in_state_dict(dict(sd)) if isinstance(sd, dict) else sd
1787
+ if not isinstance(sd, dict):
1788
+ return 0
1789
  tgt_sd = tgt.state_dict()
1790
+ filt = {k: v for k, v in sd.items() if k in tgt_sd and hasattr(v, "shape") and v.shape == tgt_sd[k].shape}
1791
  if filt: tgt.load_state_dict(filt, strict=False)
1792
  return len(filt)
1793
 
verify_m_fold_agillm4.py CHANGED
@@ -24,9 +24,10 @@ def causal_mask_cached(new_len: int, cached_len: int):
24
 
25
  def old_expanded_forward(mha: nb.TuneableAttentionMHA, x: torch.Tensor, mask=None, rel_bias_tokens=None):
26
  bsz, seq, _ = x.shape
27
- q = mha._reshape_heads(mha.q(x)) @ mha.U
28
- k = mha._reshape_heads(mha.k(x)) @ mha.U
29
- v = mha._reshape_v(mha.v(x))
 
30
  att = (q @ k.transpose(-1, -2)) / math.sqrt(mha.dk)
31
  if mha.use_relpos and rel_bias_tokens is not None:
32
  att = att + nb.alibi_bias(mha.h, rel_bias_tokens)[:, :, -seq:, :]
 
24
 
25
  def old_expanded_forward(mha: nb.TuneableAttentionMHA, x: torch.Tensor, mask=None, rel_bias_tokens=None):
26
  bsz, seq, _ = x.shape
27
+ q_lin, k_lin, v_lin = mha.qkv(x).chunk(3, dim=-1)
28
+ q = mha._reshape_heads(q_lin) @ mha.U
29
+ k = mha._reshape_heads(k_lin) @ mha.U
30
+ v = mha._reshape_v(v_lin)
31
  att = (q @ k.transpose(-1, -2)) / math.sqrt(mha.dk)
32
  if mha.use_relpos and rel_bias_tokens is not None:
33
  att = att + nb.alibi_bias(mha.h, rel_bias_tokens)[:, :, -seq:, :]
verify_qkv_agillm4.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import copy
6
+ import json
7
+ import math
8
+ import os
9
+ import tempfile
10
+ from pathlib import Path
11
+ from types import SimpleNamespace
12
+
13
+ os.environ.setdefault("AGILLM_SYNTHETIC_TOKENIZER", "1")
14
+
15
+ import torch
16
+ import torch.nn.functional as F
17
+
18
+ import nB300_agillm4 as nb
19
+
20
+
21
def unfused_reference(mha: nb.TuneableAttentionMHA, x: torch.Tensor, mask=None, rel_bias_tokens=None):
    """Reference forward pass that applies q/k/v projections separately.

    Used to verify the fused qkv.weight path: the fused weight is chunked back
    into the three projection matrices and the rest of the attention math
    mirrors the module's own backends. Assumes x is (batch, seq, d) — the
    unpack below fixes rank 3.
    """
    bsz, seq, _ = x.shape
    # Fused weight layout is [Wq; Wk; Wv] stacked along dim 0.
    wq, wk, wv = mha.qkv.weight.chunk(3, dim=0)
    q_lin = x @ wq.T
    k_lin = x @ wk.T
    v_lin = x @ wv.T
    v = mha._reshape_v(v_lin)
    if mha.r > mha.dk:
        # NOTE(review): presumably the expanded-rank (M-fold) path, where the
        # metric U U^T is folded into q only — confirm against the module.
        q = mha._reshape_heads(q_lin) @ mha._get_metric()
        k = mha._reshape_heads(k_lin)
    else:
        # Low-rank path: both q and k are projected through U.
        q = mha._reshape_heads(q_lin) @ mha.U
        k = mha._reshape_heads(k_lin) @ mha.U

    # Combine ALiBi relative-position bias (last `seq` query rows) with the
    # additive attention mask, when either is active.
    attn_bias = None
    if mha.use_relpos and rel_bias_tokens is not None:
        attn_bias = nb.alibi_bias(mha.h, rel_bias_tokens)[:, :, -seq:, :]
    if mask is not None:
        attn_bias = mask if attn_bias is None else attn_bias + mask

    if mha.attn_backend == "sdpa":
        try:
            # Scale by 1/sqrt(dk) explicitly: q's last dim may differ from dk.
            z = F.scaled_dot_product_attention(
                q, k, v,
                attn_mask=attn_bias,
                dropout_p=0.0,
                scale=1.0 / math.sqrt(mha.dk),
            )
        except TypeError:
            # Older torch without the `scale` kwarg: pre-scale q so the default
            # 1/sqrt(q.size(-1)) scaling nets out to 1/sqrt(dk).
            q_scaled = q * math.sqrt(q.size(-1) / mha.dk)
            z = F.scaled_dot_product_attention(q_scaled, k, v, attn_mask=attn_bias, dropout_p=0.0)
    elif mha.attn_backend == "sublinear":
        z = mha._sublinear_attention(q, k, v, attn_mask=attn_bias)
    else:
        # Manual backend: explicit softmax(QK^T / sqrt(dk) + bias) V.
        att = (q @ k.transpose(-1, -2)) / math.sqrt(mha.dk)
        if attn_bias is not None:
            att = att + attn_bias
        z = att.softmax(-1) @ v
    # (b, h, s, dv) -> (b, s, h*dv), then output projection + dropout.
    z = z.transpose(1, 2).reshape(bsz, seq, -1)
    return mha.drop(mha.proj(z))
61
+
62
+
63
def max_param_grad_diff(module, ref_grads: dict[str, torch.Tensor]) -> float:
    """Largest absolute elementwise gap between module's current grads and *ref_grads*.

    Parameters with no gradient are skipped; returns 0.0 when nothing compares.
    """
    diffs = [
        (param.grad.detach() - ref_grads[name]).abs().max().item()
        for name, param in module.named_parameters()
        if param.grad is not None
    ]
    return max(diffs, default=0.0)
70
+
71
+
72
def optimizer_for(core, ar_h, sat_h):
    """Build the standard AdamW optimizer over the three modules, matching training."""
    opt_args = SimpleNamespace(optimizer="adamw")
    return nb.make_optimizer(opt_args, core, ar_h, sat_h, nb.LR_CORE, nb.LR_HEAD)
75
+
76
+
77
def split_qkv_optimizer_state_for_test(opt_state: dict, group_names: list[list[str]]) -> dict:
    """Invert the qkv fusion on an optimizer state_dict (test fixture builder).

    Every fused qkv.weight slot becomes three consecutive legacy slots; tensor
    states whose dim 0 is divisible by 3 are chunked, everything else is cloned
    into all three. Param ids are re-numbered densely across all groups.
    """
    result = {"state": {}, "param_groups": []}
    pid_counter = 0
    for group, names in zip(opt_state["param_groups"], group_names):
        new_group = copy.deepcopy(group)
        new_params = []
        for old_pid, name in zip(group["params"], names):
            old_state = opt_state.get("state", {}).get(old_pid, {})
            if name.endswith(".qkv.weight"):
                pieces = [{}, {}, {}]
                for key, value in old_state.items():
                    if torch.is_tensor(value) and value.ndim > 0 and value.shape[0] % 3 == 0:
                        # Moment buffers: split back into q, k, v thirds.
                        for piece, chunk in zip(pieces, value.detach().chunk(3, dim=0)):
                            piece[key] = chunk.clone().contiguous()
                    else:
                        # Scalars ("step") and non-tensors replicate per slot.
                        for piece in pieces:
                            piece[key] = nb._clone_opt_value(value)
                emitted = pieces
            else:
                emitted = [{key: nb._clone_opt_value(value) for key, value in old_state.items()}]
            for state in emitted:
                new_params.append(pid_counter)
                # Empty per-param states are omitted, as torch does.
                if state:
                    result["state"][pid_counter] = state
                pid_counter += 1
        new_group["params"] = new_params
        result["param_groups"].append(new_group)
    return result
110
+
111
+
112
def run_optimizer_remap_check(cfg: dict) -> dict:
    """End-to-end check that legacy (split q/k/v) AdamW state remaps onto the fused layout.

    Builds a model, takes one optimizer step so every parameter has AdamW
    state, synthesizes a legacy-layout optimizer state from it, then verifies
    that ``nb._fuse_legacy_qkv_optimizer_state`` yields state loadable into a
    fresh optimizer with correctly shaped fused-qkv moment buffers.

    Fix vs. previous version: the qkv shape check used to overwrite its flag
    on every iteration, so only the LAST fused parameter was actually
    validated; now every qkv parameter is checked and at least one must exist.

    Returns {"optimizer_remap": 0.0} on success, 1.0 on failure.
    """
    core = nb.Encoder(cfg, attn_backend="manual").to(nb.DEV).train()
    ar_h = nb.ARHead(cfg["d"]).to(nb.DEV).train()
    sat_h = nb.SATHead(cfg["d"]).to(nb.DEV).train()
    opt = optimizer_for(core, ar_h, sat_h)
    ids = torch.randint(0, nb.VOCAB, (1, 8), device=nb.DEV)
    loss = ar_h(core(ids, nb.causal_mask(ids.size(1)))).float().square().mean()
    loss.backward()
    opt.step()  # populate exp_avg / exp_avg_sq for every parameter
    opt.zero_grad(set_to_none=True)
    current_names = nb._optimizer_group_param_names(opt, core, ar_h, sat_h)
    legacy_opt = split_qkv_optimizer_state_for_test(opt.state_dict(), current_names)

    fresh = nb.Encoder(cfg, attn_backend="manual").to(nb.DEV)
    fresh_ar = nb.ARHead(cfg["d"]).to(nb.DEV)
    fresh_sat = nb.SATHead(cfg["d"]).to(nb.DEV)
    fresh_opt = optimizer_for(fresh, fresh_ar, fresh_sat)
    fused = nb._fuse_legacy_qkv_optimizer_state(legacy_opt, fresh_opt, fresh, fresh_ar, fresh_sat)
    ok = fused is not None
    if ok:
        fresh_opt.load_state_dict(fused)
        loaded = fresh_opt.state_dict()
        names = nb._optimizer_group_param_names(fresh_opt, fresh, fresh_ar, fresh_sat)
        fresh_params = dict(fresh.named_parameters())
        qkv_checked = 0
        for group, group_names in zip(loaded["param_groups"], names):
            for pid, name in zip(group["params"], group_names):
                if not name.endswith(".qkv.weight") or pid not in loaded["state"]:
                    continue
                qkv_checked += 1
                exp_avg = loaded["state"][pid].get("exp_avg")
                # qkv params live on the core encoder, so strip the "core." prefix.
                expected = fresh_params[name[len("core."):]].shape
                if not (torch.is_tensor(exp_avg) and tuple(exp_avg.shape) == tuple(expected)):
                    ok = False
        # The check is vacuous unless at least one fused qkv param was seen.
        ok = ok and qkv_checked > 0
    return {"optimizer_remap": 0.0 if ok else 1.0}
143
+
144
+
145
def verify_case(args, preset: str, backend: str) -> dict:
    """Run all fused-QKV equivalence checks for one (preset, backend) pair.

    Covers: forward/loss/gradient parity between the fused module and the
    unfused reference; loading legacy split q/k/v state dicts; the
    ``_safe_load_any`` checkpoint path; and (pico_1x/manual only) the
    optimizer-state remap. Returns a result dict with per-check max-abs-diff
    rows; the case passes when every row is <= args.tol.
    """
    torch.manual_seed(args.seed)
    cfg = nb.PRESETS[preset].copy()
    d, h, r = cfg["d"], cfg["heads"], cfg["rank"]
    seq = args.cached_len + args.new_len
    mha = nb.TuneableAttentionMHA(d, h, r, attn_backend=backend).to(nb.DEV).eval()
    rows = {}
    # Three masking regimes: no mask, causal + ALiBi, SAT mask + ALiBi.
    for case_name, mask, rel_tokens in [
        ("none", None, None),
        ("causal_alibi", nb.causal_mask(seq), seq),
        ("sat_alibi", nb.sat_mask(seq), seq),
    ]:
        # Identical inputs on two independent autograd graphs.
        x_fused = torch.randn(2, seq, d, device=nb.DEV, requires_grad=True)
        x_ref = x_fused.detach().clone().requires_grad_(True)
        y_fused = mha(x_fused, mask, rel_bias_tokens=rel_tokens)
        y_ref = unfused_reference(mha, x_ref, mask=mask, rel_bias_tokens=rel_tokens)
        loss_fused = y_fused.square().mean()
        loss_ref = y_ref.square().mean()
        loss_fused.backward()
        fused_x_grad = x_fused.grad.detach().clone()
        fused_param_grads = {
            name: param.grad.detach().clone()
            for name, param in mha.named_parameters()
            if param.grad is not None
        }
        # Zero the module grads so the reference backward accumulates cleanly,
        # then compare the module's (now reference) grads to the fused snapshot.
        mha.zero_grad(set_to_none=True)
        loss_ref.backward()
        ref_x_grad = x_ref.grad.detach().clone()
        rows[f"{case_name}_forward"] = (y_fused - y_ref).abs().max().item()
        rows[f"{case_name}_loss"] = abs(loss_fused.item() - loss_ref.item())
        rows[f"{case_name}_x_grad"] = (fused_x_grad - ref_x_grad).abs().max().item()
        rows[f"{case_name}_param_grad"] = max_param_grad_diff(mha, fused_param_grads)
        mha.zero_grad(set_to_none=True)

    # Legacy state-dict round trip: strict load must succeed with no
    # missing/unexpected keys (the module fuses q/k/v keys on load —
    # presumably via a load hook; confirm in nB300_agillm4).
    legacy_sd = nb._split_qkv_in_state_dict_for_test(mha.state_dict())
    fresh = nb.TuneableAttentionMHA(d, h, r, attn_backend=backend).to(nb.DEV).eval()
    missing, unexpected = fresh.load_state_dict(dict(legacy_sd), strict=True)
    rows["legacy_load_missing_unexpected"] = float(len(missing) + len(unexpected))
    rows["legacy_load_qkv_weight"] = (fresh.qkv.weight.detach() - mha.qkv.weight.detach()).abs().max().item()
    with torch.no_grad():
        x = torch.randn(2, seq, d, device=nb.DEV)
        rows["legacy_load_forward"] = (fresh(x) - mha(x)).abs().max().item()

    # Checkpoint path: save a legacy-layout core state and reload via
    # _safe_load_any, which should fuse and filter it into the new encoder.
    core = nb.Encoder(cfg, attn_backend=backend).to(nb.DEV).eval()
    legacy_core_sd = nb._split_qkv_in_state_dict_for_test(core.state_dict())
    dst = nb.Encoder(cfg, attn_backend=backend).to(nb.DEV).eval()
    with tempfile.TemporaryDirectory() as tmpdir:
        ckpt = Path(tmpdir) / "legacy_core.pt"
        torch.save({"core": legacy_core_sd}, ckpt)
        loaded = nb._safe_load_any(ckpt, dst, key="core")
    rows["safe_load_any_loaded"] = 0.0 if loaded > 0 else 1.0
    # NOTE(review): this max() raises ValueError if the encoder has no
    # .qkv.weight params — assumed impossible for these presets; confirm.
    rows["safe_load_any_qkv"] = max(
        (a - b).abs().max().item()
        for (name, a), b in [
            ((name, param.detach()), dict(dst.named_parameters())[name].detach())
            for name, param in core.named_parameters()
            if name.endswith(".qkv.weight")
        ]
    )
    # The optimizer remap check is expensive; run it once on the smallest case.
    if preset == "pico_1x" and backend == "manual":
        rows.update(run_optimizer_remap_check(cfg))

    ok = all(value <= args.tol for value in rows.values())
    return {
        "preset": preset,
        "backend": backend,
        "d": d,
        "heads": h,
        "rank": r,
        "dk": d // h,
        "ok": ok,
        "tol": args.tol,
        "rows": rows,
    }
219
+
220
+
221
def main() -> int:
    """CLI entry point: verify every (preset, backend) pair; 0 on full success."""
    parser = argparse.ArgumentParser(description="Verify AGILLM-4 fused QKV harvest from n1.py")
    parser.add_argument("--presets", default="pico_1x,micro_3x")
    parser.add_argument("--backends", default="manual,sdpa")
    parser.add_argument("--cached_len", type=int, default=8)
    parser.add_argument("--new_len", type=int, default=4)
    parser.add_argument("--seed", type=int, default=5678)
    parser.add_argument("--tol", type=float, default=2e-4)
    parser.add_argument("--json_out", default="")
    args = parser.parse_args()

    presets = [token.strip() for token in args.presets.split(",") if token.strip()]
    backends = [token.strip() for token in args.backends.split(",") if token.strip()]
    results = []
    all_ok = True
    for preset in presets:
        for backend in backends:
            outcome = verify_case(args, preset, backend)
            results.append(outcome)
            if not outcome["ok"]:
                all_ok = False
            # Stream each result as it completes so long runs show progress.
            print(json.dumps(outcome, sort_keys=True), flush=True)
    if args.json_out:
        Path(args.json_out).write_text(json.dumps(results, indent=2, sort_keys=True), encoding="utf-8")
    return 0 if all_ok else 1
243
+
244
+
245
if __name__ == "__main__":
    # Propagate main()'s exit status to the shell.
    raise SystemExit(main())