seconds-0 committed on
Commit 4303959 · verified · 1 Parent(s): 117a62c

NSA 117M initial export

LICENSE ADDED
@@ -0,0 +1 @@
1
+ Apache-2.0
README.md ADDED
@@ -0,0 +1,61 @@
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - en
5
+ tags:
6
+ - nsa
7
+ - sparse-attention
8
+ - 117m
9
+ datasets:
10
+ - fineweb-edu
11
+ library_name: transformers
12
+ pipeline_tag: text-generation
13
+ base_model: byte-256
14
+ ---
15
+
16
+ # NSA 117M (FineWeb-Edu) — Remote Code
17
+
18
+ This repository contains a 117M-parameter NSA decoder-only model packaged as Transformers remote code. It exposes `NSAConfig` and `NSAForCausalLM`, so you can load it via:
19
+
20
+ ```python
21
+ from transformers import AutoModelForCausalLM, AutoTokenizer
22
+ m = AutoModelForCausalLM.from_pretrained("seconds-0/nsa-117m-byte", trust_remote_code=True)
23
+ t = AutoTokenizer.from_pretrained("seconds-0/nsa-117m-byte", trust_remote_code=True)
24
+ out = m.generate(**t("Hello", return_tensors="pt"), max_new_tokens=16)
25
+ ```
26
+
27
+ ## What is NSA
28
+
29
+ Native Sparse Attention (NSA) combines three branches — compressed (cmp), selected (sel), and sliding window (win) — mixed by a learned gate. The 117M configuration uses SDPA everywhere and keeps strict causality.
30
+
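+ Concretely, the mix is a per-token convex combination of the three branch outputs. A minimal sketch of the gating (shapes follow the embedded `modeling_nsa.py`; this is an illustration, not the exact module):
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def mix_branches(gate_logits: torch.Tensor,
+                  O_cmp: torch.Tensor, O_sel: torch.Tensor, O_win: torch.Tensor) -> torch.Tensor:
+     # gate_logits: [B, S, 3] from a tiny MLP whose last layer is zero-initialized,
+     # so softmax (τ = 1.0) starts at a uniform 1/3 mix of the three branches.
+     g = F.softmax(gate_logits, dim=-1)                  # [B, S, 3]
+     gc, gs, gw = g[..., 0:1], g[..., 1:2], g[..., 2:3]  # each [B, S, 1]
+     # Branch outputs are [B, h, S, d_v]; broadcast the per-token gates over heads and d_v.
+     return gc.unsqueeze(1) * O_cmp + gs.unsqueeze(1) * O_sel + gw.unsqueeze(1) * O_win
+ ```
+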
31
+ Architecture (overview):
32
+ - cmp: compressed blocks (tile length l, stride d) attended with causal masks
33
+ - sel: top-n selection over blockized keys (block l′, n ranges per step)
34
+ - win: sliding window attention of size w
35
+ - gate: small MLP (zero-initialized last layer), softmax(τ=1.0)
36
+
37
+ Defaults: l=32, d=16, l′=64, n=16, w=512; GQA groups=2.
38
+
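+ These defaults map directly onto the `nsa` dict consumed by `NSAConfig` (field names as in this repo's `config.json`). A sketch, assuming the repo files are importable locally; when loading from the Hub you would normally go through `AutoConfig` with `trust_remote_code=True`:
+
+ ```python
+ from configuration_nsa import NSAConfig
+
+ cfg = NSAConfig(
+     vocab_size=256,        # byte-level vocab
+     n_kv_groups=2,         # GQA groups
+     nsa={
+         "branches": ["cmp", "sel", "win"],
+         "block": 32,       # l  (compressed tile length)
+         "stride": 16,      # d
+         "sel_block": 64,   # l′
+         "sel_top_n": 16,   # n
+         "window": 512,     # w
+         "gqa_groups": 2,
+     },
+ )
+ ```
+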
39
+ ## Performance & Metrics (example targets)
40
+
41
+ - A100 40GB: ≥600 tok/s; TTFT ≤ 350 ms (batch=1, seq=128)
42
+ - RTX 4090: ≥400 tok/s; TTFT ≤ 450 ms
43
+ - CPU: ≥10 tok/s; TTFT ≤ 2.0 s
44
+
45
+ ## Intended Use / Limitations
46
+
47
+ - Toy assistant and demos; not suitable for high-stakes use.
48
+
49
+ ## Memory Budget (KV Cache)
50
+
51
+ - Standard LM approx: Mem ≈ t × H × (d_k + d_v) × bytes_per_elem
52
+ - NSA decode (M0): Mem ≈ (min(w, t) + n × l′) × H × (d_k + d_v) × bytes_per_elem
53
+ - Example (w=512, n=16, l′=64): tokens_cached ≈ min(512, t) + 1024 (FP16 → a few MiB for 117M dims)
54
+
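+ As a worked example of the M0 decode formula above (a sketch with assumptions: FP16, H read as the 2 GQA KV groups, d_k = d_v = 64, and 12 layers as in `config.json`):
+
+ ```python
+ def nsa_kv_bytes(t, w=512, n=16, l_sel=64, layers=12, kv_groups=2,
+                  d_k=64, d_v=64, bytes_per_elem=2):
+     # Tokens held per layer/group under the simplified M0 formula.
+     tokens_cached = min(w, t) + n * l_sel          # ≈ 1536 once t ≥ w
+     per_layer = tokens_cached * kv_groups * (d_k + d_v) * bytes_per_elem
+     return layers * per_layer
+
+ print(nsa_kv_bytes(4096) / 2**20)                  # ≈ 9 MiB across 12 layers
+ ```
+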
55
+ ## Notes
56
+
57
+ - Tokenizer: byte-level tokenizer (vocab=256). This is not GPT‑2/BPE; input and output are raw UTF‑8 bytes (see the sketch after these notes).
58
+ - Generation cache: no KV cache in v1 (slower decode for long sequences). Planned follow‑up.
59
+ - Gate: the remote-code gate MLP (`g1`/`g2`) is zero‑initialized at the last layer, giving uniform mixing by design; its layout differs from the training checkpoint's gate (`gate.fc1`/`gate.fc2`), so trained gate weights are not loaded (see `logs/logs_mapping.json`).
60
+ - Remote code uses SDPA-only paths and includes a safe fallback block (`SimpleBlock`) if NSA is forcibly disabled via the `NSA_REMOTE_FORCE_SIMPLE` environment variable.
61
+
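+ Continuing the loading snippet above, a small sketch of byte-level round-tripping (assuming the tokenizer exposes the usual `decode` API and adds no extra special tokens):
+
+ ```python
+ ids = t("héllo", return_tensors="pt").input_ids
+ print(ids.shape[-1])   # counts UTF-8 bytes, not words: 'é' encodes to 2 bytes, so 6 here
+
+ out = m.generate(ids, max_new_tokens=16)
+ print(t.decode(out[0], skip_special_tokens=True))
+ ```
+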
config.json ADDED
@@ -0,0 +1,36 @@
1
+ {
2
+ "model_type": "nsa",
3
+ "architectures": [
4
+ "NSAForCausalLM"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_nsa.NSAConfig",
8
+ "AutoModelForCausalLM": "modeling_nsa.NSAForCausalLM",
9
+ "AutoTokenizer": [
10
+ "tokenization_nsa.NSAByteTokenizer",
11
+ null
12
+ ]
13
+ },
14
+ "vocab_size": 256,
15
+ "hidden_size": 768,
16
+ "num_hidden_layers": 12,
17
+ "num_attention_heads": 12,
18
+ "n_kv_groups": 2,
19
+ "d_k": 64,
20
+ "d_v": 64,
21
+ "max_position_embeddings": 2048,
22
+ "rope_theta": 10000,
23
+ "nsa": {
24
+ "branches": [
25
+ "cmp",
26
+ "sel",
27
+ "win"
28
+ ],
29
+ "window": 512,
30
+ "gqa_groups": 2,
31
+ "block": 32,
32
+ "stride": 16,
33
+ "sel_block": 64,
34
+ "sel_top_n": 16
35
+ }
36
+ }
configuration_nsa.py ADDED
@@ -0,0 +1,40 @@
1
+ # Remote code: configuration and modeling for NSA
2
+ from transformers import PretrainedConfig
3
+
4
+
5
+ class NSAConfig(PretrainedConfig):
6
+ model_type = "nsa"
7
+
8
+ def __init__(
9
+ self,
10
+ vocab_size=50257,
11
+ hidden_size=768,
12
+ num_hidden_layers=12,
13
+ num_attention_heads=12,
14
+ n_kv_groups=1,
15
+ d_k=64,
16
+ d_v=64,
17
+ max_position_embeddings=2048,
18
+ rope_theta=10000,
19
+ nsa=None,
20
+ **kwargs,
21
+ ):
22
+ super().__init__(**kwargs)
23
+ self.vocab_size = vocab_size
24
+ self.hidden_size = hidden_size
25
+ self.num_hidden_layers = num_hidden_layers
26
+ self.num_attention_heads = num_attention_heads
27
+ self.n_kv_groups = n_kv_groups
28
+ self.d_k = d_k
29
+ self.d_v = d_v
30
+ self.max_position_embeddings = max_position_embeddings
31
+ self.rope_theta = rope_theta
32
+ self.nsa = nsa or {
33
+ "branches": ["cmp", "sel", "win"],
34
+ "window": 512,
35
+ "gqa_groups": n_kv_groups,
36
+ "block": 32,
37
+ "stride": 16,
38
+ "sel_block": 64,
39
+ "sel_top_n": 16,
40
+ }
logs/logs_extra_keys.txt ADDED
@@ -0,0 +1,49 @@
1
+ blocks.0.attn.gate.fc1.bias
2
+ blocks.0.attn.gate.fc1.weight
3
+ blocks.0.attn.gate.fc2.bias
4
+ blocks.0.attn.gate.fc2.weight
5
+ blocks.1.attn.gate.fc1.bias
6
+ blocks.1.attn.gate.fc1.weight
7
+ blocks.1.attn.gate.fc2.bias
8
+ blocks.1.attn.gate.fc2.weight
9
+ blocks.10.attn.gate.fc1.bias
10
+ blocks.10.attn.gate.fc1.weight
11
+ blocks.10.attn.gate.fc2.bias
12
+ blocks.10.attn.gate.fc2.weight
13
+ blocks.11.attn.gate.fc1.bias
14
+ blocks.11.attn.gate.fc1.weight
15
+ blocks.11.attn.gate.fc2.bias
16
+ blocks.11.attn.gate.fc2.weight
17
+ blocks.2.attn.gate.fc1.bias
18
+ blocks.2.attn.gate.fc1.weight
19
+ blocks.2.attn.gate.fc2.bias
20
+ blocks.2.attn.gate.fc2.weight
21
+ blocks.3.attn.gate.fc1.bias
22
+ blocks.3.attn.gate.fc1.weight
23
+ blocks.3.attn.gate.fc2.bias
24
+ blocks.3.attn.gate.fc2.weight
25
+ blocks.4.attn.gate.fc1.bias
26
+ blocks.4.attn.gate.fc1.weight
27
+ blocks.4.attn.gate.fc2.bias
28
+ blocks.4.attn.gate.fc2.weight
29
+ blocks.5.attn.gate.fc1.bias
30
+ blocks.5.attn.gate.fc1.weight
31
+ blocks.5.attn.gate.fc2.bias
32
+ blocks.5.attn.gate.fc2.weight
33
+ blocks.6.attn.gate.fc1.bias
34
+ blocks.6.attn.gate.fc1.weight
35
+ blocks.6.attn.gate.fc2.bias
36
+ blocks.6.attn.gate.fc2.weight
37
+ blocks.7.attn.gate.fc1.bias
38
+ blocks.7.attn.gate.fc1.weight
39
+ blocks.7.attn.gate.fc2.bias
40
+ blocks.7.attn.gate.fc2.weight
41
+ blocks.8.attn.gate.fc1.bias
42
+ blocks.8.attn.gate.fc1.weight
43
+ blocks.8.attn.gate.fc2.bias
44
+ blocks.8.attn.gate.fc2.weight
45
+ blocks.9.attn.gate.fc1.bias
46
+ blocks.9.attn.gate.fc1.weight
47
+ blocks.9.attn.gate.fc2.bias
48
+ blocks.9.attn.gate.fc2.weight
49
+ norm_f.weight
logs/logs_mapping.json ADDED
@@ -0,0 +1,229 @@
1
+ {
2
+ "mapped": [
3
+ "model.blocks.0.attn.W_K_cmp.weight",
4
+ "model.blocks.0.attn.W_K_sel.weight",
5
+ "model.blocks.0.attn.W_K_win.weight",
6
+ "model.blocks.0.attn.W_Q.weight",
7
+ "model.blocks.0.attn.W_V_cmp.weight",
8
+ "model.blocks.0.attn.W_V_sel.weight",
9
+ "model.blocks.0.attn.W_V_win.weight",
10
+ "model.blocks.0.attn.out.weight",
11
+ "model.blocks.0.mlp.fc1.weight",
12
+ "model.blocks.0.mlp.fc2.weight",
13
+ "model.blocks.0.norm1.weight",
14
+ "model.blocks.0.norm2.weight",
15
+ "model.blocks.1.attn.W_K_cmp.weight",
16
+ "model.blocks.1.attn.W_K_sel.weight",
17
+ "model.blocks.1.attn.W_K_win.weight",
18
+ "model.blocks.1.attn.W_Q.weight",
19
+ "model.blocks.1.attn.W_V_cmp.weight",
20
+ "model.blocks.1.attn.W_V_sel.weight",
21
+ "model.blocks.1.attn.W_V_win.weight",
22
+ "model.blocks.1.attn.out.weight",
23
+ "model.blocks.1.mlp.fc1.weight",
24
+ "model.blocks.1.mlp.fc2.weight",
25
+ "model.blocks.1.norm1.weight",
26
+ "model.blocks.1.norm2.weight",
27
+ "model.blocks.10.attn.W_K_cmp.weight",
28
+ "model.blocks.10.attn.W_K_sel.weight",
29
+ "model.blocks.10.attn.W_K_win.weight",
30
+ "model.blocks.10.attn.W_Q.weight",
31
+ "model.blocks.10.attn.W_V_cmp.weight",
32
+ "model.blocks.10.attn.W_V_sel.weight",
33
+ "model.blocks.10.attn.W_V_win.weight",
34
+ "model.blocks.10.attn.out.weight",
35
+ "model.blocks.10.mlp.fc1.weight",
36
+ "model.blocks.10.mlp.fc2.weight",
37
+ "model.blocks.10.norm1.weight",
38
+ "model.blocks.10.norm2.weight",
39
+ "model.blocks.11.attn.W_K_cmp.weight",
40
+ "model.blocks.11.attn.W_K_sel.weight",
41
+ "model.blocks.11.attn.W_K_win.weight",
42
+ "model.blocks.11.attn.W_Q.weight",
43
+ "model.blocks.11.attn.W_V_cmp.weight",
44
+ "model.blocks.11.attn.W_V_sel.weight",
45
+ "model.blocks.11.attn.W_V_win.weight",
46
+ "model.blocks.11.attn.out.weight",
47
+ "model.blocks.11.mlp.fc1.weight",
48
+ "model.blocks.11.mlp.fc2.weight",
49
+ "model.blocks.11.norm1.weight",
50
+ "model.blocks.11.norm2.weight",
51
+ "model.blocks.2.attn.W_K_cmp.weight",
52
+ "model.blocks.2.attn.W_K_sel.weight",
53
+ "model.blocks.2.attn.W_K_win.weight",
54
+ "model.blocks.2.attn.W_Q.weight",
55
+ "model.blocks.2.attn.W_V_cmp.weight",
56
+ "model.blocks.2.attn.W_V_sel.weight",
57
+ "model.blocks.2.attn.W_V_win.weight",
58
+ "model.blocks.2.attn.out.weight",
59
+ "model.blocks.2.mlp.fc1.weight",
60
+ "model.blocks.2.mlp.fc2.weight",
61
+ "model.blocks.2.norm1.weight",
62
+ "model.blocks.2.norm2.weight",
63
+ "model.blocks.3.attn.W_K_cmp.weight",
64
+ "model.blocks.3.attn.W_K_sel.weight",
65
+ "model.blocks.3.attn.W_K_win.weight",
66
+ "model.blocks.3.attn.W_Q.weight",
67
+ "model.blocks.3.attn.W_V_cmp.weight",
68
+ "model.blocks.3.attn.W_V_sel.weight",
69
+ "model.blocks.3.attn.W_V_win.weight",
70
+ "model.blocks.3.attn.out.weight",
71
+ "model.blocks.3.mlp.fc1.weight",
72
+ "model.blocks.3.mlp.fc2.weight",
73
+ "model.blocks.3.norm1.weight",
74
+ "model.blocks.3.norm2.weight",
75
+ "model.blocks.4.attn.W_K_cmp.weight",
76
+ "model.blocks.4.attn.W_K_sel.weight",
77
+ "model.blocks.4.attn.W_K_win.weight",
78
+ "model.blocks.4.attn.W_Q.weight",
79
+ "model.blocks.4.attn.W_V_cmp.weight",
80
+ "model.blocks.4.attn.W_V_sel.weight",
81
+ "model.blocks.4.attn.W_V_win.weight",
82
+ "model.blocks.4.attn.out.weight",
83
+ "model.blocks.4.mlp.fc1.weight",
84
+ "model.blocks.4.mlp.fc2.weight",
85
+ "model.blocks.4.norm1.weight",
86
+ "model.blocks.4.norm2.weight",
87
+ "model.blocks.5.attn.W_K_cmp.weight",
88
+ "model.blocks.5.attn.W_K_sel.weight",
89
+ "model.blocks.5.attn.W_K_win.weight",
90
+ "model.blocks.5.attn.W_Q.weight",
91
+ "model.blocks.5.attn.W_V_cmp.weight",
92
+ "model.blocks.5.attn.W_V_sel.weight",
93
+ "model.blocks.5.attn.W_V_win.weight",
94
+ "model.blocks.5.attn.out.weight",
95
+ "model.blocks.5.mlp.fc1.weight",
96
+ "model.blocks.5.mlp.fc2.weight",
97
+ "model.blocks.5.norm1.weight",
98
+ "model.blocks.5.norm2.weight",
99
+ "model.blocks.6.attn.W_K_cmp.weight",
100
+ "model.blocks.6.attn.W_K_sel.weight",
101
+ "model.blocks.6.attn.W_K_win.weight",
102
+ "model.blocks.6.attn.W_Q.weight",
103
+ "model.blocks.6.attn.W_V_cmp.weight",
104
+ "model.blocks.6.attn.W_V_sel.weight",
105
+ "model.blocks.6.attn.W_V_win.weight",
106
+ "model.blocks.6.attn.out.weight",
107
+ "model.blocks.6.mlp.fc1.weight",
108
+ "model.blocks.6.mlp.fc2.weight",
109
+ "model.blocks.6.norm1.weight",
110
+ "model.blocks.6.norm2.weight",
111
+ "model.blocks.7.attn.W_K_cmp.weight",
112
+ "model.blocks.7.attn.W_K_sel.weight",
113
+ "model.blocks.7.attn.W_K_win.weight",
114
+ "model.blocks.7.attn.W_Q.weight",
115
+ "model.blocks.7.attn.W_V_cmp.weight",
116
+ "model.blocks.7.attn.W_V_sel.weight",
117
+ "model.blocks.7.attn.W_V_win.weight",
118
+ "model.blocks.7.attn.out.weight",
119
+ "model.blocks.7.mlp.fc1.weight",
120
+ "model.blocks.7.mlp.fc2.weight",
121
+ "model.blocks.7.norm1.weight",
122
+ "model.blocks.7.norm2.weight",
123
+ "model.blocks.8.attn.W_K_cmp.weight",
124
+ "model.blocks.8.attn.W_K_sel.weight",
125
+ "model.blocks.8.attn.W_K_win.weight",
126
+ "model.blocks.8.attn.W_Q.weight",
127
+ "model.blocks.8.attn.W_V_cmp.weight",
128
+ "model.blocks.8.attn.W_V_sel.weight",
129
+ "model.blocks.8.attn.W_V_win.weight",
130
+ "model.blocks.8.attn.out.weight",
131
+ "model.blocks.8.mlp.fc1.weight",
132
+ "model.blocks.8.mlp.fc2.weight",
133
+ "model.blocks.8.norm1.weight",
134
+ "model.blocks.8.norm2.weight",
135
+ "model.blocks.9.attn.W_K_cmp.weight",
136
+ "model.blocks.9.attn.W_K_sel.weight",
137
+ "model.blocks.9.attn.W_K_win.weight",
138
+ "model.blocks.9.attn.W_Q.weight",
139
+ "model.blocks.9.attn.W_V_cmp.weight",
140
+ "model.blocks.9.attn.W_V_sel.weight",
141
+ "model.blocks.9.attn.W_V_win.weight",
142
+ "model.blocks.9.attn.out.weight",
143
+ "model.blocks.9.mlp.fc1.weight",
144
+ "model.blocks.9.mlp.fc2.weight",
145
+ "model.blocks.9.norm1.weight",
146
+ "model.blocks.9.norm2.weight",
147
+ "model.embed.weight",
148
+ "model.lm_head.weight"
149
+ ],
150
+ "missing": [
151
+ "model.blocks.0.attn.g1.weight",
152
+ "model.blocks.0.attn.g2.weight",
153
+ "model.blocks.1.attn.g1.weight",
154
+ "model.blocks.1.attn.g2.weight",
155
+ "model.blocks.10.attn.g1.weight",
156
+ "model.blocks.10.attn.g2.weight",
157
+ "model.blocks.11.attn.g1.weight",
158
+ "model.blocks.11.attn.g2.weight",
159
+ "model.blocks.2.attn.g1.weight",
160
+ "model.blocks.2.attn.g2.weight",
161
+ "model.blocks.3.attn.g1.weight",
162
+ "model.blocks.3.attn.g2.weight",
163
+ "model.blocks.4.attn.g1.weight",
164
+ "model.blocks.4.attn.g2.weight",
165
+ "model.blocks.5.attn.g1.weight",
166
+ "model.blocks.5.attn.g2.weight",
167
+ "model.blocks.6.attn.g1.weight",
168
+ "model.blocks.6.attn.g2.weight",
169
+ "model.blocks.7.attn.g1.weight",
170
+ "model.blocks.7.attn.g2.weight",
171
+ "model.blocks.8.attn.g1.weight",
172
+ "model.blocks.8.attn.g2.weight",
173
+ "model.blocks.9.attn.g1.weight",
174
+ "model.blocks.9.attn.g2.weight",
175
+ "model.norm.bias",
176
+ "model.norm.weight"
177
+ ],
178
+ "extra": [
179
+ "blocks.0.attn.gate.fc1.bias",
180
+ "blocks.0.attn.gate.fc1.weight",
181
+ "blocks.0.attn.gate.fc2.bias",
182
+ "blocks.0.attn.gate.fc2.weight",
183
+ "blocks.1.attn.gate.fc1.bias",
184
+ "blocks.1.attn.gate.fc1.weight",
185
+ "blocks.1.attn.gate.fc2.bias",
186
+ "blocks.1.attn.gate.fc2.weight",
187
+ "blocks.10.attn.gate.fc1.bias",
188
+ "blocks.10.attn.gate.fc1.weight",
189
+ "blocks.10.attn.gate.fc2.bias",
190
+ "blocks.10.attn.gate.fc2.weight",
191
+ "blocks.11.attn.gate.fc1.bias",
192
+ "blocks.11.attn.gate.fc1.weight",
193
+ "blocks.11.attn.gate.fc2.bias",
194
+ "blocks.11.attn.gate.fc2.weight",
195
+ "blocks.2.attn.gate.fc1.bias",
196
+ "blocks.2.attn.gate.fc1.weight",
197
+ "blocks.2.attn.gate.fc2.bias",
198
+ "blocks.2.attn.gate.fc2.weight",
199
+ "blocks.3.attn.gate.fc1.bias",
200
+ "blocks.3.attn.gate.fc1.weight",
201
+ "blocks.3.attn.gate.fc2.bias",
202
+ "blocks.3.attn.gate.fc2.weight",
203
+ "blocks.4.attn.gate.fc1.bias",
204
+ "blocks.4.attn.gate.fc1.weight",
205
+ "blocks.4.attn.gate.fc2.bias",
206
+ "blocks.4.attn.gate.fc2.weight",
207
+ "blocks.5.attn.gate.fc1.bias",
208
+ "blocks.5.attn.gate.fc1.weight",
209
+ "blocks.5.attn.gate.fc2.bias",
210
+ "blocks.5.attn.gate.fc2.weight",
211
+ "blocks.6.attn.gate.fc1.bias",
212
+ "blocks.6.attn.gate.fc1.weight",
213
+ "blocks.6.attn.gate.fc2.bias",
214
+ "blocks.6.attn.gate.fc2.weight",
215
+ "blocks.7.attn.gate.fc1.bias",
216
+ "blocks.7.attn.gate.fc1.weight",
217
+ "blocks.7.attn.gate.fc2.bias",
218
+ "blocks.7.attn.gate.fc2.weight",
219
+ "blocks.8.attn.gate.fc1.bias",
220
+ "blocks.8.attn.gate.fc1.weight",
221
+ "blocks.8.attn.gate.fc2.bias",
222
+ "blocks.8.attn.gate.fc2.weight",
223
+ "blocks.9.attn.gate.fc1.bias",
224
+ "blocks.9.attn.gate.fc1.weight",
225
+ "blocks.9.attn.gate.fc2.bias",
226
+ "blocks.9.attn.gate.fc2.weight",
227
+ "norm_f.weight"
228
+ ]
229
+ }
logs/logs_missing_keys.txt ADDED
@@ -0,0 +1,26 @@
1
+ model.blocks.0.attn.g1.weight
2
+ model.blocks.0.attn.g2.weight
3
+ model.blocks.1.attn.g1.weight
4
+ model.blocks.1.attn.g2.weight
5
+ model.blocks.10.attn.g1.weight
6
+ model.blocks.10.attn.g2.weight
7
+ model.blocks.11.attn.g1.weight
8
+ model.blocks.11.attn.g2.weight
9
+ model.blocks.2.attn.g1.weight
10
+ model.blocks.2.attn.g2.weight
11
+ model.blocks.3.attn.g1.weight
12
+ model.blocks.3.attn.g2.weight
13
+ model.blocks.4.attn.g1.weight
14
+ model.blocks.4.attn.g2.weight
15
+ model.blocks.5.attn.g1.weight
16
+ model.blocks.5.attn.g2.weight
17
+ model.blocks.6.attn.g1.weight
18
+ model.blocks.6.attn.g2.weight
19
+ model.blocks.7.attn.g1.weight
20
+ model.blocks.7.attn.g2.weight
21
+ model.blocks.8.attn.g1.weight
22
+ model.blocks.8.attn.g2.weight
23
+ model.blocks.9.attn.g1.weight
24
+ model.blocks.9.attn.g2.weight
25
+ model.norm.bias
26
+ model.norm.weight
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92e303af798306020bcf0b1a6293a9e88027887b70d61d110fc4cba274cedf66
3
+ size 320203152
modeling_nsa.py ADDED
@@ -0,0 +1,311 @@
1
+ # Remote code: configuration and modeling for NSA
2
+ import math
3
+ from typing import Optional
4
+
5
+ import torch
6
+ from torch import nn
7
+ from transformers import PreTrainedModel
8
+ from transformers.generation.utils import GenerationMixin
9
+ from transformers.modeling_outputs import CausalLMOutput
10
+
11
+ from .configuration_nsa import NSAConfig
12
+ _HAS_NSA = False # Embedded NSA is provided below; no external import required.
13
+
14
+
15
+ class RMSNorm(nn.Module):
16
+ def __init__(self, dim: int, eps: float = 1e-6) -> None:
17
+ super().__init__()
18
+ self.weight = nn.Parameter(torch.ones(dim))
19
+ self.eps = eps
20
+
21
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
22
+ rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
23
+ return (x * rms) * self.weight
24
+
25
+
26
+ class MLP(nn.Module):
27
+ def __init__(self, dim: int, hidden_mult: int = 4) -> None:
28
+ super().__init__()
29
+ h = hidden_mult * dim
30
+ self.fc1 = nn.Linear(dim, h, bias=False)
31
+ self.fc2 = nn.Linear(h, dim, bias=False)
32
+
33
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
34
+ return self.fc2(torch.nn.functional.silu(self.fc1(x)))
35
+
36
+
37
+ def _rope(q: torch.Tensor) -> torch.Tensor:
38
+ B, S, D = q.shape[0], q.shape[2], q.shape[-1]
39
+ if D % 2 != 0:
40
+ return q
41
+ device = q.device
42
+ half = D // 2
43
+ pos = torch.arange(S, device=device).float().unsqueeze(-1)
44
+ inv_freq = 1.0 / (10000 ** (torch.arange(0, half, device=device).float() / half))
45
+ angles = pos * inv_freq
46
+ cos = angles.cos().view(1, 1, S, half)
47
+ sin = angles.sin().view(1, 1, S, half)
48
+ q1, q2 = q[..., :half], q[..., half:]
49
+ return torch.cat([q1 * cos - q2 * sin, q1 * sin + q2 * cos], dim=-1)
50
+
51
+
52
+ def _avg_pool_time(x: torch.Tensor, kernel: int, stride: int) -> torch.Tensor:
53
+ if x.shape[2] < kernel:
54
+ return x[..., :0, :]
55
+ xt = x.permute(0, 3, 1, 2).contiguous()
56
+ y = torch.nn.functional.avg_pool2d(xt, kernel_size=(1, kernel), stride=(1, stride))
57
+ return y.permute(0, 2, 3, 1).contiguous()
58
+
59
+
60
+ def _window_mask(q: torch.Tensor, S: int, w: int) -> torch.Tensor:
61
+ B, h = q.shape[0], q.shape[1]
62
+ device = q.device
63
+ row = torch.arange(S, device=device).view(S, 1)
64
+ col = torch.arange(S, device=device).view(1, S)
65
+ allowed = (col <= row) & (col >= (row - (w - 1)))
66
+ M = torch.full((S, S), float('-inf'), device=device, dtype=q.dtype)
67
+ M.masked_fill_(allowed, 0.0)
68
+ return M.view(1, 1, S, S).expand(B, h, S, S)
69
+
70
+
71
+ def _selection_blocks(scores: torch.Tensor, l_sel: int, n_sel: int) -> torch.Tensor:
72
+ B, h, S = scores.shape
73
+ n_blocks = max(1, (S + l_sel - 1) // l_sel)
74
+ # Pad to multiple of l_sel
75
+ pad = n_blocks * l_sel - S
76
+ if pad > 0:
77
+ scores = torch.nn.functional.pad(scores, (0, pad), value=-1e9)
78
+ blk_scores = scores.view(B, h, n_blocks, l_sel).max(dim=-1).values
79
+ k = min(n_sel, n_blocks)
80
+ return torch.topk(blk_scores, k=k, dim=-1).indices
81
+
82
+
83
+ class EmbeddedNSAAttention(nn.Module):
84
+ def __init__(self, dim: int, n_heads: int, n_kv_groups: int, d_k: int, d_v: int,
85
+ l: int, d: int, l_sel: int, n_sel: int, w: int) -> None:
86
+ super().__init__()
87
+ self.n_heads = n_heads
88
+ self.n_kv_groups = n_kv_groups
89
+ self.d_k = d_k
90
+ self.d_v = d_v
91
+ self.l = l
92
+ self.stride = d
93
+ self.l_sel = l_sel
94
+ self.n_sel = n_sel
95
+ self.w = w
96
+ self.W_Q = nn.Linear(dim, n_heads * d_k, bias=False)
97
+ self.W_K_cmp = nn.Linear(dim, n_kv_groups * d_k, bias=False)
98
+ self.W_V_cmp = nn.Linear(dim, n_kv_groups * d_v, bias=False)
99
+ self.W_K_sel = nn.Linear(dim, n_kv_groups * d_k, bias=False)
100
+ self.W_V_sel = nn.Linear(dim, n_kv_groups * d_v, bias=False)
101
+ self.W_K_win = nn.Linear(dim, n_kv_groups * d_k, bias=False)
102
+ self.W_V_win = nn.Linear(dim, n_kv_groups * d_v, bias=False)
103
+ self.g1 = nn.Linear(dim, max(1, dim // 4), bias=False)
104
+ self.g2 = nn.Linear(max(1, dim // 4), 3, bias=False)
105
+ nn.init.zeros_(self.g2.weight)
106
+ self.out = nn.Linear(n_heads * d_v, dim, bias=False)
107
+
108
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
109
+ B, S, D = x.shape
110
+ h, dk, dv = self.n_heads, self.d_k, self.d_v
111
+ Q = self.W_Q(x).view(B, S, h, dk).transpose(1, 2) # [B,h,S,dk]
112
+ g = max(1, self.n_kv_groups)
113
+ r = max(1, h // g)
114
+ # Project per-group K/V then broadcast to heads
115
+ Kc_g = self.W_K_cmp(x).view(B, S, g, dk).permute(0, 2, 1, 3) # [B,g,S,dk]
116
+ Vc_g = self.W_V_cmp(x).view(B, S, g, dv).permute(0, 2, 1, 3)
117
+ Ks_g = self.W_K_sel(x).view(B, S, g, dk).permute(0, 2, 1, 3)
118
+ Vs_g = self.W_V_sel(x).view(B, S, g, dv).permute(0, 2, 1, 3)
119
+ Kw_g = self.W_K_win(x).view(B, S, g, dk).permute(0, 2, 1, 3)
120
+ Vw_g = self.W_V_win(x).view(B, S, g, dv).permute(0, 2, 1, 3)
121
+ # Broadcast groups to heads
122
+ def _bcast_to_heads(T):
123
+ return T.unsqueeze(1).expand(B, r, g, S, T.shape[-1]).reshape(B, h, S, T.shape[-1])
124
+ Kc = _bcast_to_heads(Kc_g)
125
+ Vc = _bcast_to_heads(Vc_g)
126
+ Ks = _bcast_to_heads(Ks_g)
127
+ Vs = _bcast_to_heads(Vs_g)
128
+ Kw = _bcast_to_heads(Kw_g)
129
+ Vw = _bcast_to_heads(Vw_g)
130
+
131
+ # RoPE
132
+ Qr = _rope(Q.transpose(1, 2)).transpose(1, 2)
133
+ Kc_r = _rope(Kc.transpose(1, 2)).transpose(1, 2)
134
+ Ks_r = _rope(Ks.transpose(1, 2)).transpose(1, 2)
135
+ Kw_r = _rope(Kw.transpose(1, 2)).transpose(1, 2)
136
+
137
+ # Compressed: average-pool along time
138
+ Kc_p = _avg_pool_time(Kc_r, kernel=max(1, self.stride), stride=max(1, self.stride))
139
+ Vc_p = _avg_pool_time(Vc, kernel=max(1, self.stride), stride=max(1, self.stride))
140
+ O_cmp = torch.nn.functional.scaled_dot_product_attention(Qr, Kc_p, Vc_p, is_causal=True)
141
+
142
+ # Selection: naive top-n blocks (global), enforce causal via triangular mask
143
+ scores = (Qr * Ks_r).mean(dim=-1) # [B,h,S]
144
+ blk_idx = _selection_blocks(scores, self.l_sel, self.n_sel) # [B,h,n]
145
+ n_blocks = max(1, (S + self.l_sel - 1) // self.l_sel)
146
+ keep = torch.zeros((B, h, n_blocks), device=x.device, dtype=torch.bool)
147
+ keep.scatter_(2, blk_idx, True)
148
+ keep = keep.unsqueeze(-1).expand(B, h, n_blocks, self.l_sel).reshape(B, h, -1)[:, :, :S]
149
+ logits = torch.matmul(Qr / math.sqrt(dk), Ks_r.transpose(-2, -1)) # [B,h,S,S]
150
+ tri = torch.triu(torch.ones((S, S), device=x.device, dtype=torch.bool), diagonal=1)
151
+ logits = logits.masked_fill(tri, float('-inf'))
152
+ sel_mask = torch.where(keep.unsqueeze(2).expand(B, h, S, S), torch.zeros((), device=x.device, dtype=Qr.dtype), torch.full((), float('-inf'), device=x.device, dtype=Qr.dtype))
153
+ P = torch.nn.functional.softmax(logits + sel_mask, dim=-1)
154
+ O_sel = torch.matmul(P, Vs)
155
+
156
+ # Sliding window
157
+ M = _window_mask(Qr, S, max(1, self.w))
158
+ logits_w = torch.matmul(Qr / math.sqrt(dk), Kw_r.transpose(-2, -1)) + M
159
+ P_w = torch.nn.functional.softmax(logits_w, dim=-1)
160
+ O_win = torch.matmul(P_w, Vw)
161
+
162
+ # Gate & mix
163
+ gate = self.g2(torch.nn.functional.silu(self.g1(x))) # [B,S,3]
164
+ gate = torch.nn.functional.softmax(gate, dim=-1)
165
+ gc, gs, gw = gate[..., 0:1], gate[..., 1:2], gate[..., 2:3]
166
+ O = gc.unsqueeze(1) * O_cmp + gs.unsqueeze(1) * O_sel + gw.unsqueeze(1) * O_win
167
+ O = O.transpose(1, 2).reshape(B, S, h * dv)
168
+ return self.out(O)
169
+
170
+ class SimpleAttention(nn.Module):
171
+ def __init__(self, dim: int, n_heads: int, d_k: int, d_v: int) -> None:
172
+ super().__init__()
173
+ self.n_heads = n_heads
174
+ self.d_k = d_k
175
+ self.d_v = d_v
176
+ self.q_proj = nn.Linear(dim, n_heads * d_k, bias=False)
177
+ self.k_proj = nn.Linear(dim, n_heads * d_k, bias=False)
178
+ self.v_proj = nn.Linear(dim, n_heads * d_v, bias=False)
179
+ self.out = nn.Linear(n_heads * d_v, dim, bias=False)
180
+
181
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
182
+ B, S, D = x.shape
183
+ h, dk, dv = self.n_heads, self.d_k, self.d_v
184
+ q = self.q_proj(x).view(B, S, h, dk).transpose(1, 2) # [B,h,S,dk]
185
+ k = self.k_proj(x).view(B, S, h, dk).transpose(1, 2) # [B,h,S,dk]
186
+ v = self.v_proj(x).view(B, S, h, dv).transpose(1, 2) # [B,h,S,dv]
187
+ attn = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)
188
+ attn = attn.transpose(1, 2).contiguous().view(B, S, h * dv)
189
+ return self.out(attn)
190
+
191
+
192
+ class SimpleBlock(nn.Module):
193
+ def __init__(self, dim: int, n_heads: int, d_k: int, d_v: int) -> None:
194
+ super().__init__()
195
+ self.norm1 = RMSNorm(dim)
196
+ self.attn = SimpleAttention(dim, n_heads, d_k, d_v)
197
+ self.norm2 = RMSNorm(dim)
198
+ self.mlp = MLP(dim)
199
+
200
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
201
+ x = x + self.attn(self.norm1(x))
202
+ x = x + self.mlp(self.norm2(x))
203
+ return x
204
+
205
+
206
+ class NSABlockRemote(nn.Module):
207
+ """Transformer block with embedded NSA attention, pre/post RMSNorm, and MLP."""
208
+ def __init__(self, dim: int, n_heads: int, n_kv_groups: int, d_k: int, d_v: int,
209
+ l: int, d: int, l_sel: int, n_sel: int, w: int) -> None:
210
+ super().__init__()
211
+ self.norm1 = RMSNorm(dim)
212
+ self.attn = EmbeddedNSAAttention(dim, n_heads, n_kv_groups, d_k, d_v, l, d, l_sel, n_sel, w)
213
+ self.norm2 = RMSNorm(dim)
214
+ self.mlp = MLP(dim)
215
+
216
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
217
+ x = x + self.attn(self.norm1(x))
218
+ x = x + self.mlp(self.norm2(x))
219
+ return x
220
+
221
+ class NSATinyLM(nn.Module):
222
+ def __init__(self, config: NSAConfig):
223
+ super().__init__()
224
+ self.config = config
225
+ self.vocab_size = int(config.vocab_size)
226
+ self.hidden_size = int(config.hidden_size)
227
+ self.num_hidden_layers = int(config.num_hidden_layers)
228
+ self.num_attention_heads = int(config.num_attention_heads)
229
+ self.n_kv_groups = int(getattr(config, "n_kv_groups", 1))
230
+ self.d_k = int(getattr(config, "d_k", self.hidden_size // self.num_attention_heads))
231
+ self.d_v = int(getattr(config, "d_v", self.hidden_size // self.num_attention_heads))
232
+ nsa = config.nsa or {}
233
+ self.l = int(nsa.get("block", 32))
234
+ self.d = int(nsa.get("stride", 16))
235
+ self.l_sel = int(nsa.get("sel_block", 64))
236
+ self.n_sel = int(nsa.get("sel_top_n", 16))
237
+ self.w = int(nsa.get("window", 512))
238
+
239
+ self.embed = nn.Embedding(self.vocab_size, self.hidden_size)
240
+ import os as _os
241
+ # Allow forcing simple fallback via env for integration tests
242
+ _force_simple = _os.getenv('NSA_REMOTE_FORCE_SIMPLE', '0').lower() in ('1','true','yes')
243
+ if not _force_simple:
244
+ self.blocks = nn.ModuleList([
245
+ NSABlockRemote(
246
+ self.hidden_size,
247
+ self.num_attention_heads,
248
+ self.n_kv_groups,
249
+ self.d_k,
250
+ self.d_v,
251
+ self.l,
252
+ self.d,
253
+ self.l_sel,
254
+ self.n_sel,
255
+ self.w,
256
+ ) for _ in range(self.num_hidden_layers)
257
+ ])
258
+ else:
259
+ self.blocks = nn.ModuleList([
260
+ SimpleBlock(self.hidden_size, self.num_attention_heads, self.d_k, self.d_v)
261
+ for _ in range(self.num_hidden_layers)
262
+ ])
263
+ self.norm = nn.LayerNorm(self.hidden_size)
264
+ self.lm_head = nn.Linear(self.hidden_size, self.vocab_size, bias=False)
265
+
266
+ def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
267
+ x = self.embed(input_ids)
268
+ for blk in self.blocks:
269
+ x = blk(x)
270
+ x = self.norm(x)
271
+ logits = self.lm_head(x)
272
+ return logits
273
+
274
+
275
+ class NSAForCausalLM(PreTrainedModel, GenerationMixin):
276
+ config_class = NSAConfig
277
+ _no_split_modules = ["EmbeddedNSAAttention", "SimpleBlock"]
278
+
279
+ def __init__(self, config: NSAConfig):
280
+ super().__init__(config)
281
+ self.model = NSATinyLM(config)
282
+ self.post_init()
283
+
284
+ def get_input_embeddings(self):
285
+ return self.model.embed
286
+
287
+ def set_input_embeddings(self, new_emb):
288
+ self.model.embed = new_emb
289
+
290
+ def forward(
291
+ self,
292
+ input_ids: Optional[torch.LongTensor] = None,
293
+ attention_mask: Optional[torch.Tensor] = None,
294
+ labels: Optional[torch.LongTensor] = None,
295
+ **kwargs,
296
+ ):
297
+ if input_ids is None:
298
+ raise ValueError("input_ids is required")
299
+ logits = self.model(input_ids)
300
+ loss = None
301
+ if labels is not None:
302
+ # Shift for causal LM loss
303
+ shift_logits = logits[:, :-1, :].contiguous()
304
+ shift_labels = labels[:, 1:].contiguous()
305
+ loss_fct = torch.nn.CrossEntropyLoss()
306
+ loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
307
+ return CausalLMOutput(loss=loss, logits=logits)
308
+
309
+ def prepare_inputs_for_generation(self, input_ids, **kwargs):
310
+ # No past_key_values cache: rerun full sequence. Works everywhere, slower at decode.
311
+ return {"input_ids": input_ids, "attention_mask": kwargs.get("attention_mask", None)}
nsa/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
nsa/cache/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
nsa/cache/kv_cache.py ADDED
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+
4
+ import torch
5
+
6
+ from nsa.core.block_index import BlockMeta
7
+
8
+
9
+ @dataclass
10
+ class NSA_KV:
11
+ K_sel: torch.Tensor # [B,G,S,Dk]
12
+ V_sel: torch.Tensor # [B,G,S,Dv]
13
+ K_win: torch.Tensor # [B,G,S,Dk]
14
+ V_win: torch.Tensor # [B,G,S,Dv]
15
+ # raw token-level seq for compressed branch
16
+ K_cmp_raw_seq: torch.Tensor # [B,G,S,Dk]
17
+ V_cmp_raw_seq: torch.Tensor # [B,G,S,Dv]
18
+ K_cmp: torch.Tensor # [B,G,S_cmp,Dk]
19
+ V_cmp: torch.Tensor # [B,G,S_cmp,Dv]
20
+ win_ptr: torch.Tensor # [B,G]
21
+ cmp_emit_next: torch.Tensor # [B,G]
22
+ meta: BlockMeta
23
+ reads_pred: torch.Tensor # [T] per decode step predicted total reads
24
+ reads_act_total: torch.Tensor # [T]
25
+ reads_act_sel: torch.Tensor # [T]
26
+ reads_act_cmp: torch.Tensor # [T]
27
+ reads_act_win: torch.Tensor # [T]
28
+
29
+ def update_selection_raw(self, K: torch.Tensor, V: torch.Tensor) -> None:
30
+ self.K_sel = torch.cat([self.K_sel, K], dim=2)
31
+ self.V_sel = torch.cat([self.V_sel, V], dim=2)
32
+
33
+ def update_window(self, K: torch.Tensor, V: torch.Tensor, w: int) -> None:
34
+ self.K_win = torch.cat([self.K_win, K], dim=2)
35
+ self.V_win = torch.cat([self.V_win, V], dim=2)
36
+ # keep last w tokens
37
+ if self.K_win.shape[2] > w:
38
+ self.K_win = self.K_win[:, :, -w:, :]
39
+ self.V_win = self.V_win[:, :, -w:, :]
40
+
41
+ def update_compressed(
42
+ self, K_raw_cmp: torch.Tensor, V_raw_cmp: torch.Tensor, l: int, d: int
43
+ ) -> None:
44
+ # M0 prefill path: rebuild fully using avg-pool ϕ handled upstream
45
+ self.K_cmp = K_raw_cmp
46
+ self.V_cmp = V_raw_cmp
47
+
48
+ def append_cmp_raw(self, K_raw_tok: torch.Tensor, V_raw_tok: torch.Tensor) -> None:
49
+ self.K_cmp_raw_seq = torch.cat([self.K_cmp_raw_seq, K_raw_tok], dim=2)
50
+ self.V_cmp_raw_seq = torch.cat([self.V_cmp_raw_seq, V_raw_tok], dim=2)
51
+
52
+ def append_reads_pred(self, value: int) -> None:
53
+ v = torch.tensor([value], dtype=torch.int64, device=self.K_sel.device)
54
+ self.reads_pred = torch.cat([self.reads_pred, v], dim=0) if self.reads_pred.numel() else v
55
+
56
+ def append_reads_actual(self, total: int, sel: int, cmp: int, win: int) -> None:
57
+ dev = self.K_sel.device
58
+
59
+ def cat_or_set(t: torch.Tensor, val: int) -> torch.Tensor:
60
+ v = torch.tensor([val], dtype=torch.int64, device=dev)
61
+ return torch.cat([t, v], dim=0) if t.numel() else v
62
+
63
+ self.reads_act_total = cat_or_set(self.reads_act_total, total)
64
+ self.reads_act_sel = cat_or_set(self.reads_act_sel, sel)
65
+ self.reads_act_cmp = cat_or_set(self.reads_act_cmp, cmp)
66
+ self.reads_act_win = cat_or_set(self.reads_act_win, win)
nsa/core/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # NSA Core Modules — Map and Responsibilities
2
+
3
+ Purpose
4
+ - Quick orientation for contributors. Links to architecture and tests mapping.
5
+
6
+ Modules
7
+ - `nsa_attention.py`: Top‑level attention module. Branch wiring (cmp/sel/win), gate MLP (τ=1.0, zero‑init last layer), strict masks, decode caches (`K_sel/V_sel`, `K_win/V_win`), counters.
8
+ - `selection_scorer.py`: Selection pipeline — compute p_cmp, map to p_slc (Eq.9 CSR), group reduce (Eq.10), deterministic top‑n, range construction (v2 vectorized), NVTX tags.
9
+ - `block_index.py`: CSR for cmp→sel fractional overlaps, conversions (CSR↔COO), helpers.
10
+ - `compress_pool.py`: Compressed branch pooling ϕ, emission schedule (warmup l, stride d), RoPE ordering.
11
+ - `attention_kernels.py`: SDPA variants — packed selection, masked SDPA, varlen helpers; FA‑2 wrappers (opt‑in) for cmp/win.
12
+ - `packing.py`: Range packing, index normalization, adjacency merge/de‑dup.
13
+ - `rope.py`: RoPE application for Q and per‑branch K before ϕ.
14
+ - `flags.py`: Environment flags and routing toggles.
15
+ - `debug.py`, `collate.py`: Debug helpers and varlen collate utilities.
16
+
17
+ Key Invariants (guarded by tests)
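+ A hedged sketch of the compressed-branch emission schedule noted for `compress_pool.py` above, matching the num_cmp rule used by the SDPA reference kernels (warmup l, stride d):
+
+ ```python
+ def num_cmp(t: int, l: int = 32, d: int = 16) -> int:
+     # Compressed tokens emitted after seeing t+1 raw tokens:
+     # none during the first l tokens, then one more every d tokens.
+     return 0 if (t + 1) < l else ((t + 1 - l) // d) + 1
+
+ print([num_cmp(t) for t in (0, 30, 31, 47, 63)])  # [0, 0, 1, 2, 3]
+ ```
+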
18
+ - Strict causality masks (see `nsa/tests/test_masks.py`).
19
+ - Group consistency (Eq.10) (see `nsa/tests/test_group_consistency*.py`).
20
+ - Selection rules (tie‑break, merge/de‑dup/clamp) (see `nsa/tests/test_selection_*`, `test_ranges_normalization.py`).
21
+ - Decode reads counters formula (see `nsa/tests/test_decode_counters.py`).
22
+
23
+ References
24
+ - Architecture Overview: Documentation/Architecture/Overview.md
25
+ - Selection Semantics: Documentation/Architecture/Selection-Semantics.md
26
+ - Tests Index: Documentation/Tests/Index.md
27
+
nsa/core/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
nsa/core/attention_kernels.py ADDED
@@ -0,0 +1,1403 @@
1
+ from __future__ import annotations
2
+ import os
3
+ import time
4
+ from typing import Dict, Tuple
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+
9
+ from nsa.core.debug import log
10
+ from nsa.core.packing import (
11
+ build_cu_seqlens_for_buckets,
12
+ build_length_buckets,
13
+ compute_compressed_lengths,
14
+ compute_sliding_lengths,
15
+ )
16
+ from nsa.kernels.flash_wrappers import (
17
+ attention_bgh,
18
+ attention_fa2_dense_batch,
19
+ attention_fa2_varlen,
20
+ fa2_supported,
21
+ fa2_supported_verbose,
22
+ is_flash_varlen_available,
23
+ )
24
+
25
+ # Simple grow-on-demand workspaces for varlen packing to avoid frequent allocations
26
+ _VARLEN_WS: Dict[Tuple, Dict[str, torch.Tensor]] = {}
27
+ _SEL_PACK_WS: Dict[Tuple, Dict[str, torch.Tensor]] = {}
28
+
29
+
30
+ def _env_int(name: str, default: int) -> int:
31
+ try:
32
+ v = int(os.getenv(name, str(default)))
33
+ return v
34
+ except Exception:
35
+ return default
36
+
37
+
38
+ def _env_int_bounded(name: str, default: int, min_val: int = 0, max_val: int = 10**8) -> int:
39
+ """Read integer from environment with bounds checking to prevent excessive memory allocation."""
40
+ try:
41
+ v = int(os.getenv(name, str(default)))
42
+ if v < min_val:
43
+ return min_val
44
+ if v > max_val:
45
+ # Log warning if value exceeds max
46
+ import warnings
47
+
48
+ warnings.warn(f"{name}={v} exceeds maximum {max_val}, clamping to {max_val}")
49
+ return max_val
50
+ return v
51
+ except Exception:
52
+ return default
53
+
54
+
55
+ def clear_varlen_workspaces() -> None:
56
+ """Optional memory cleanup: free varlen packing workspaces."""
57
+ _VARLEN_WS.clear()
58
+
59
+
60
+ def clear_selection_pack_workspaces() -> None:
61
+ """Optional memory cleanup: free selection pack workspaces."""
62
+ _SEL_PACK_WS.clear()
63
+
64
+
65
+ def _get_varlen_workspace(
66
+ device: torch.device,
67
+ dtype_q: torch.dtype,
68
+ dtype_k: torch.dtype,
69
+ dtype_v: torch.dtype,
70
+ h: int,
71
+ d_k: int,
72
+ d_v: int,
73
+ cap_N: int,
74
+ cap_total_k: int,
75
+ ) -> dict[str, torch.Tensor]:
76
+ key = (str(device), dtype_q, dtype_k, dtype_v, h, d_k, d_v)
77
+ ws = _VARLEN_WS.get(key)
78
+ need_new = ws is None
79
+ if not need_new:
80
+ q, k, v = ws["q"], ws["k"], ws["v"]
81
+ cuq, cuk = ws["cuq"], ws["cuk"]
82
+ need_new = (
83
+ q.shape[0] < cap_N
84
+ or k.shape[0] < cap_total_k
85
+ or v.shape[0] < cap_total_k
86
+ or cuq.numel() < (cap_N + 1)
87
+ or cuk.numel() < (cap_N + 1)
88
+ )
89
+ if need_new:
90
+ # Allow pre-sizing via env to avoid growth reallocations on long runs
91
+ # Bounded to prevent excessive memory allocation (max 1M rows, 100M total K/V)
92
+ reserve_N = _env_int_bounded("NSA_VARLEN_RESERVE_N", 0, 0, 10**6)
93
+ reserve_K = _env_int_bounded("NSA_VARLEN_RESERVE_K", 0, 0, 10**8)
94
+ new_N = max(cap_N, reserve_N, 1)
95
+ new_K = max(cap_total_k, reserve_K, 1)
96
+ ws = {
97
+ "q": torch.empty((new_N, h, d_k), dtype=dtype_q, device=device),
98
+ "k": torch.empty((new_K, h, d_k), dtype=dtype_k, device=device),
99
+ "v": torch.empty((new_K, h, d_v), dtype=dtype_v, device=device),
100
+ "cuq": torch.empty((new_N + 1,), dtype=torch.int32, device=device),
101
+ "cuk": torch.empty((new_N + 1,), dtype=torch.int32, device=device),
102
+ }
103
+ _VARLEN_WS[key] = ws
104
+ return ws
105
+
106
+
107
+ def batched_causal_attention_compressed(
108
+ Q: torch.Tensor, # [B,S,G,h,Dk]
109
+ K_cmp: torch.Tensor, # [B,G,S_cmp,Dk]
110
+ V_cmp: torch.Tensor, # [B,G,S_cmp,Dv]
111
+ l: int,
112
+ d: int,
113
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
114
+ """
115
+ Compressed branch attention with per-row causal mask derived from emission schedule.
116
+ We cannot rely on is_causal=True due to S_q != S_kv and variable allowed lengths per t.
117
+ """
118
+ B, S, G, h, Dk = Q.shape
119
+ S_cmp = K_cmp.shape[2]
120
+ device = Q.device
121
+
122
+ # num_cmp(t) = 0 if t+1 < l else floor((t+1 - l) / d) + 1, clamped to S_cmp
123
+ tpos = torch.arange(S, device=device)
124
+ num_cmp = torch.where(tpos + 1 < l, 0, ((tpos + 1 - l) // d) + 1).clamp(max=S_cmp)
125
+ # A per-row disallowed mask would be (col >= num_cmp.view(S, 1)) with col = arange(S_cmp)
+ # (True means masked), but the exact per-t loop below enforces the same schedule directly.
128
+ # Enforce token-level causality as well: no compressed tokens emitted from future blocks beyond t
129
+ # When l=d=1, S_cmp == S and this reduces to standard causal
130
+
131
+ # Parity-first: exact per-t using attention_bgh
132
+ out = torch.zeros((B, S, G, h, V_cmp.shape[-1]), dtype=V_cmp.dtype, device=V_cmp.device)
133
+ log("cmp.begin", B=B, S=S, S_cmp=int(S_cmp), l=l, d=d)
134
+ for t in range(S):
135
+ L = int(num_cmp[t].item())
136
+ if L <= 0:
137
+ out[:, t] = 0.0
138
+ continue
139
+ q_t = Q[:, t]
140
+ k_t = K_cmp[:, :, :L, :]
141
+ v_t = V_cmp[:, :, :L, :]
142
+ out[:, t] = attention_bgh(q_t, k_t, v_t, causal=True)
143
+ log("cmp.step", t=int(t), L=L)
144
+ return out
145
+
146
+
147
+ def sliding_window_attention(
148
+ Q: torch.Tensor, # [B,S,G,h,Dk]
149
+ K: torch.Tensor, # [B,G,S,Dk]
150
+ V: torch.Tensor, # [B,G,S,Dv]
151
+ w: int,
152
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
153
+ B, S, G, h, Dk = Q.shape
154
+ # Empty or zero window → zeros
155
+ if w <= 0 or K.shape[2] == 0 or S == 0:
156
+ return torch.zeros((B, S, G, h, V.shape[-1]), dtype=V.dtype, device=V.device)
157
+ device = Q.device
158
+ # Build banded causal mask once: allowed keys per row t are [t-w+1 .. t]
159
+ row = torch.arange(S, device=device).view(S, 1)
160
+ col = torch.arange(S, device=device).view(1, S)
161
+ allowed = (col <= row) & (col >= (row - (w - 1))) # [S,S]
162
+ # Use additive float mask with -inf for disallowed positions to avoid NaNs
163
+ # across SDPA backends/dtypes. Shape: [S,S] then broadcast to [B,G*h,S,S].
164
+ Mf2d = torch.full((S, S), float("-inf"), dtype=Q.dtype, device=device)
165
+ Mf2d.masked_fill_(allowed, 0.0)
166
+ # Prepare SDPA tensors: [B, G*h, S, D*]
167
+ Qf = Q.reshape(B, S, G * h, Dk).transpose(1, 2).contiguous() # [B,G*h,S,Dk]
168
+ Kf = K.unsqueeze(2).expand(B, G, h, S, Dk).reshape(B, G * h, S, Dk).contiguous()
169
+ Vf = (
170
+ V.unsqueeze(2)
171
+ .expand(B, G, h, S, V.shape[-1])
172
+ .reshape(B, G * h, S, V.shape[-1])
173
+ .contiguous()
174
+ )
175
+ # Broadcast additive mask to [B,G*h,S,S]
176
+ Mf = Mf2d.view(1, 1, S, S).expand(B, G * h, S, S)
177
+ Of = F.scaled_dot_product_attention(Qf, Kf, Vf, attn_mask=Mf) # [B,G*h,S,Dv]
178
+ Of = Of.transpose(1, 2).reshape(B, S, G, h, V.shape[-1])
179
+ return Of
180
+
181
+
182
+ def grouped_selection_attention(
183
+ Q: torch.Tensor, # [B,S,G,h,Dk]
184
+ K: torch.Tensor, # [B,G,S_kv,Dk]
185
+ V: torch.Tensor, # [B,G,S_kv,Dv]
186
+ ranges: torch.Tensor, # [B,S,G,n,2]
187
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
188
+ B, S, G, h, Dk = Q.shape
189
+ # (The selectable KV length is K.shape[2]; not needed explicitly on this path.)
190
+
191
+ # Path 1: exact sequential-equivalence gather per (b,t,g)
192
+ out = torch.zeros((B, S, G, h, V.shape[-1]), dtype=V.dtype, device=V.device)
193
+ for b in range(B):
194
+ for t in range(S):
195
+ for g in range(G):
196
+ # build exact gather index list
197
+ idxs = []
198
+ for i in range(ranges.shape[3]):
199
+ s0 = int(ranges[b, t, g, i, 0].item())
200
+ e0 = int(ranges[b, t, g, i, 1].item())
201
+ if e0 > s0:
202
+ idxs.append(torch.arange(s0, e0, device=V.device))
203
+ if idxs:
204
+ idx = torch.cat(idxs)
205
+ k = K[b, g, idx] # [L,Dk]
206
+ v = V[b, g, idx] # [L,Dv]
207
+ q = Q[b, t, g] # [h,Dk]
208
+ # Expand per-head kv and add query-length dim for SDPA
209
+ q_btgh = q.unsqueeze(0).unsqueeze(2) # [1,h,1,Dk]
210
+ k_btgh = (
211
+ k.unsqueeze(0).unsqueeze(0).expand(1, q.shape[0], k.shape[0], k.shape[1])
212
+ ) # [1,h,L,Dk]
213
+ v_btgh = (
214
+ v.unsqueeze(0).unsqueeze(0).expand(1, q.shape[0], v.shape[0], v.shape[1])
215
+ ) # [1,h,L,Dv]
216
+ q_btgh = q_btgh.contiguous()
217
+ k_btgh = k_btgh.contiguous()
218
+ v_btgh = v_btgh.contiguous()
219
+ attn = F.scaled_dot_product_attention(
220
+ q_btgh, k_btgh, v_btgh, is_causal=True
221
+ ) # [1,h,1,Dv]
222
+ out[b, t, g] = attn.squeeze(0).squeeze(1) # [h,Dv]
223
+ log("sel.step", b=int(b), t=int(t), g=int(g), L=int(k.shape[0]))
224
+ else:
225
+ out[b, t, g] = 0.0
226
+ log("sel.step", b=int(b), t=int(t), g=int(g), L=0)
227
+ return out
228
+
229
+
230
+ def sliding_window_attention_masked(
231
+ Q: torch.Tensor, # [B,S,G,h,Dk]
232
+ K: torch.Tensor, # [B,G,S,Dk]
233
+ V: torch.Tensor, # [B,G,S,Dv]
234
+ w: int,
235
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
236
+ # Memory-friendly masked semantics: only the first element in [start..t] is attended.
237
+ # With a single allowed key per row, SDPA reduces to returning that V directly.
238
+ B, S, G, h, Dk = Q.shape
239
+ if w <= 0 or K.shape[2] == 0:
240
+ return torch.zeros((B, S, G, h, V.shape[-1]), dtype=V.dtype, device=V.device)
241
+ device = Q.device
242
+ tpos = torch.arange(S, device=device)
243
+ start = (tpos - (w - 1)).clamp_min(0) # [S]
244
+ # Build per-(B,G,S) gather indices and fetch V at start
245
+ idx = start.view(1, 1, S, 1).expand(B, G, S, 1) # [B,G,S,1]
246
+ v_sel = torch.gather(V, 2, idx.expand(B, G, S, V.shape[-1])) # [B,G,S,Dv]
247
+ # Expand across heads; result [B,S,G,h,Dv]
248
+ Of = v_sel.permute(0, 2, 1, 3).unsqueeze(3).expand(B, S, G, h, V.shape[-1])
249
+ return Of
250
+
251
+
252
+ def batched_causal_attention_compressed_masked(
253
+ Q: torch.Tensor, # [B,S,G,h,Dk]
254
+ K_cmp: torch.Tensor, # [B,G,S_cmp,Dk]
255
+ V_cmp: torch.Tensor, # [B,G,S_cmp,Dv]
256
+ l: int,
257
+ d: int,
258
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
259
+ # Memory-friendly masked semantics: if num_cmp(t)>0, attend only to index 0 → return V[:, :, 0].
260
+ B, S, G, h, Dk = Q.shape
261
+ S_cmp = K_cmp.shape[2]
262
+ device = Q.device
263
+ if S_cmp == 0:
264
+ return torch.zeros((B, S, G, h, V_cmp.shape[-1]), dtype=V_cmp.dtype, device=V_cmp.device)
265
+ tpos = torch.arange(S, device=device)
266
+ num_cmp = torch.where(tpos + 1 < l, 0, ((tpos + 1 - l) // d) + 1).clamp(min=0, max=S_cmp) # [S]
267
+ have_any = (num_cmp > 0).view(1, S, 1, 1, 1).expand(B, S, G, h, 1)
268
+ v0 = V_cmp[:, :, 0, :] # [B,G,Dv]
269
+ v0f = v0.unsqueeze(1).unsqueeze(3).expand(B, S, G, h, V_cmp.shape[-1])
270
+ Of = torch.where(have_any, v0f, torch.zeros_like(v0f))
271
+ return Of
272
+
273
+
274
+ def grouped_selection_attention_packed(
275
+ Q: torch.Tensor, # [B,S,G,h,Dk]
276
+ K: torch.Tensor, # [B,G,S_kv,Dk]
277
+ V: torch.Tensor, # [B,G,S_kv,Dv]
278
+ ranges: torch.Tensor, # [B,S,G,n,2]
279
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
280
+ """
281
+ Bucketed varlen packing by row length L with parity to gather path.
282
+ For each (b,t,g), build its flat index list from ranges, bucket rows
283
+ by identical L, and run one SDPA per bucket.
284
+ """
285
+ B, S, G, h, Dk = Q.shape
286
+ # (The selectable KV length is K.shape[2]; not needed explicitly on this path.)
287
+ device = Q.device
288
+ # Initialize output
289
+ out = torch.zeros((B, S, G, h, V.shape[-1]), dtype=V.dtype, device=device)
290
+ # Flatten to row indices
291
+ rows = [] # list of (b,t,g, idx_tensor[L])
292
+ lengths = []
293
+ for b in range(B):
294
+ for t in range(S):
295
+ for g in range(G):
296
+ idxs = []
297
+ for i in range(ranges.shape[3]):
298
+ s0 = int(ranges[b, t, g, i, 0].item())
299
+ e0 = int(ranges[b, t, g, i, 1].item())
300
+ if e0 > s0:
301
+ idxs.append(torch.arange(s0, e0, device=device))
302
+ if idxs:
303
+ idx = torch.cat(idxs)
304
+ else:
305
+ idx = torch.empty((0,), dtype=torch.long, device=device)
306
+ rows.append((b, t, g, idx))
307
+ lengths.append(idx.numel())
308
+ if not rows:
309
+ return out
310
+ lengths_t = torch.tensor(lengths, device=device)
311
+ unique_L = torch.unique(lengths_t)
312
+ # Enable autograd-safe packing during training or when forced by env
313
+ use_safe_pack = (
314
+ torch.is_grad_enabled() and (Q.requires_grad or K.requires_grad or V.requires_grad)
315
+ ) or _env_bool("NSA_TRAIN_SAFE_PACK", False)
316
+
317
+ for Lval in unique_L.tolist():
318
+ L = int(Lval)
319
+ # collect row indices for this bucket
320
+ bucket_idx = [i for i, Lx in enumerate(lengths) if Lx == L]
321
+ if L == 0 or len(bucket_idx) == 0:
322
+ # rows with L=0 remain zeros
323
+ continue
324
+ N = len(bucket_idx)
325
+ if use_safe_pack:
326
+ # Graph-friendly packing using stack to preserve autograd links
327
+ map_rows = []
328
+ Q_list = []
329
+ K_list = []
330
+ V_list = []
331
+ for ridx in bucket_idx:
332
+ b, t, g, idx = rows[ridx]
333
+ map_rows.append((b, t, g))
334
+ Q_list.append(Q[b, t, g]) # [h,Dk]
335
+ K_list.append(K[b, g, idx]) # [L,Dk]
336
+ V_list.append(V[b, g, idx]) # [L,Dv]
337
+ Qb = torch.stack(Q_list, dim=0) # [N,h,Dk]
338
+ Kb = torch.stack(K_list, dim=0) # [N,L,Dk]
339
+ Vb = torch.stack(V_list, dim=0) # [N,L,Dv]
340
+ q_btgh = Qb.unsqueeze(1).permute(0, 2, 1, 3) # [N,h,1,Dk]
341
+ k_btgh = Kb.unsqueeze(1).expand(N, h, L, Dk)
342
+ v_btgh = Vb.unsqueeze(1).expand(N, h, L, V.shape[-1])
343
+ attn = F.scaled_dot_product_attention(q_btgh, k_btgh, v_btgh, is_causal=True)
344
+ Ob = attn.squeeze(2) # [N,h,Dv]
345
+ for j, (b, t, g) in enumerate(map_rows):
346
+ out[b, t, g] = Ob[j]
347
+ else:
348
+ # Workspace-backed Q, K, V batches to reduce allocations
349
+ ws_key = (str(device), Q.dtype, K.dtype, V.dtype, h, Dk, V.shape[-1])
350
+ ws = _SEL_PACK_WS.get(ws_key)
351
+ need_new = (
352
+ ws is None or ws["Q"].shape[0] < N or ws["K"].shape[1] < L or ws["V"].shape[1] < L
353
+ )
354
+ if need_new:
355
+ # Allow pre-sizing via env to reduce reallocations
356
+ # Bounded to prevent excessive memory allocation (max 100K rows, 10K length)
357
+ reserve_N = _env_int_bounded("NSA_SEL_PACK_RESERVE_N", 0, 0, 10**5)
358
+ reserve_L = _env_int_bounded("NSA_SEL_PACK_RESERVE_L", 0, 0, 10**4)
359
+ new_N = max(N, reserve_N)
360
+ new_L = max(L, reserve_L)
361
+ Qb = torch.empty((new_N, h, Dk), dtype=Q.dtype, device=device)
362
+ Kb = torch.empty((new_N, new_L, Dk), dtype=K.dtype, device=device)
363
+ Vb = torch.empty((new_N, new_L, V.shape[-1]), dtype=V.dtype, device=device)
364
+ _SEL_PACK_WS[ws_key] = {"Q": Qb, "K": Kb, "V": Vb}
365
+ else:
366
+ Qb = _SEL_PACK_WS[ws_key]["Q"][:N]
367
+ Kb = _SEL_PACK_WS[ws_key]["K"][:N, :L]
368
+ Vb = _SEL_PACK_WS[ws_key]["V"][:N, :L]
369
+ # Populate workspace buffers and perform SDPA (execute for both new and reused workspaces)
370
+ map_rows = []
371
+ for j, ridx in enumerate(bucket_idx):
372
+ b, t, g, idx = rows[ridx]
373
+ Qb[j] = Q[b, t, g] # [h,Dk]
374
+ Kb[j] = K[b, g, idx] # [L,Dk]
375
+ Vb[j] = V[b, g, idx] # [L,Dv]
376
+ map_rows.append((b, t, g))
377
+ # SDPA per bucket: expand per-head
378
+ q_btgh = Qb.unsqueeze(1) # [N,1,h,Dk]
379
+ q_btgh = q_btgh.permute(0, 2, 1, 3) # [N,h,1,Dk]
380
+ k_btgh = Kb.unsqueeze(1).expand(N, h, L, Dk)
381
+ v_btgh = Vb.unsqueeze(1).expand(N, h, L, V.shape[-1])
382
+ attn = F.scaled_dot_product_attention(
383
+ q_btgh, k_btgh, v_btgh, is_causal=True
384
+ ) # [N,h,1,Dv]
385
+ Ob = attn.squeeze(2) # [N,h,Dv]
386
+ # Scatter back
387
+ for j, (b, t, g) in enumerate(map_rows):
388
+ out[b, t, g] = Ob[j]
389
+ return out
390
+
391
+
392
+ def selection_attention_varlen_all(
393
+ Q: torch.Tensor, # [B,S,G,h,Dk]
394
+ K: torch.Tensor, # [B,G,S_kv,Dk]
395
+ V: torch.Tensor, # [B,G,S_kv,Dv]
396
+ ranges: torch.Tensor, # [B,S,G,n,2]
397
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
398
+ """
399
+ Fully batched selection attention using varlen packing across all (B,S,G) rows.
400
+
401
+ If NSA_SEL_VARLEN_V2 is enabled (default), dispatches to the vectorized v2
402
+ packer. Otherwise uses the legacy v1 path (minimal loops with workspace).
403
+ """
404
+ # Optional v2 vectorized packer
405
+ if os.getenv("NSA_SEL_VARLEN_V2", "1").lower() in ("1", "true", "yes", "on"):
406
+ return selection_attention_varlen_all_v2(Q, K, V, ranges)
407
+ B, S, G, h, Dk = Q.shape
408
+ # Parity override: when enabled, force causal=True to match packed reference
409
+ _parity = os.getenv("NSA_SEL_VARLEN_FORCE_PARITY", "0").lower() in ("1", "true", "yes", "on")
410
+ if _parity:
411
+ # Force exact parity by delegating to the packed reference
412
+ return grouped_selection_attention_packed(Q, K, V, ranges)
413
+ device = Q.device
414
+ Dv = V.shape[-1]
415
+ out = torch.zeros((B, S, G, h, Dv), dtype=V.dtype, device=V.device)
416
+ # Build row list and lengths from ranges (sum of segment lengths)
417
+ rows: list[tuple[int, int, int]] = []
418
+ lens: list[int] = []
419
+ for b in range(B):
420
+ for t in range(S):
421
+ for g in range(G):
422
+ L = 0
423
+ for i in range(ranges.shape[3]):
424
+ s0 = int(ranges[b, t, g, i, 0].item())
425
+ e0 = int(ranges[b, t, g, i, 1].item())
426
+ if e0 > s0:
427
+ L += e0 - s0
428
+ if L > 0:
429
+ rows.append((b, t, g))
430
+ lens.append(L)
431
+ N = len(rows)
432
+ if N == 0:
433
+ return out
434
+
435
+ total_k = int(sum(lens))
436
+ # Workspace-backed packing
437
+ ws = _get_varlen_workspace(
438
+ device,
439
+ dtype_q=Q.dtype,
440
+ dtype_k=K.dtype,
441
+ dtype_v=V.dtype,
442
+ h=h,
443
+ d_k=Dk,
444
+ d_v=Dv,
445
+ cap_N=N,
446
+ cap_total_k=total_k,
447
+ )
448
+ q_pack = ws["q"][:N]
449
+ k_pack = ws["k"][:total_k]
450
+ v_pack = ws["v"][:total_k]
451
+ cuq = ws["cuq"][: N + 1]
452
+ cuk = ws["cuk"][: N + 1]
453
+ # Fill cu_seqlens
454
+ cuq.zero_()
455
+ cuk.zero_()
456
+ # Pack per row
457
+ write_pos = 0
458
+ for i, (b, t, g) in enumerate(rows):
459
+ # q for row
460
+ q_pack[i] = Q[b, t, g]
461
+ # iterate segments for this row
462
+ for j in range(ranges.shape[3]):
463
+ s0 = int(ranges[b, t, g, j, 0].item())
464
+ e0 = int(ranges[b, t, g, j, 1].item())
465
+ if e0 <= s0:
466
+ continue
467
+ seg_k = K[b, g, s0:e0] # [Lseg,Dk]
468
+ seg_v = V[b, g, s0:e0] # [Lseg,Dv]
469
+ Lseg = e0 - s0
470
+ # Assign using explicit expand_as to match target slice shape and avoid view pitfalls
471
+ _kslice = k_pack[write_pos : write_pos + Lseg]
472
+ _vslice = v_pack[write_pos : write_pos + Lseg]
473
+ _kslice.copy_(seg_k[:, None, :].expand_as(_kslice))
474
+ _vslice.copy_(seg_v[:, None, :].expand_as(_vslice))
475
+ write_pos += Lseg
476
+ cuq[i + 1] = cuq[i] + 1
477
+ cuk[i + 1] = cuk[i] + lens[i]
478
+ # Try FA‑2 varlen if available and supported. Default non-causal semantics;
479
+ # optionally force parity with packed path via NSA_SEL_VARLEN_FORCE_PARITY.
480
+ ok, _ = fa2_supported_verbose(device, Q.dtype, Dk)
481
+ if ok and is_flash_varlen_available():
482
+ try:
483
+ o_pack = attention_fa2_varlen(
484
+ q_pack,
485
+ k_pack,
486
+ v_pack,
487
+ cuq,
488
+ cuk,
489
+ max_seqlen_q=1,
490
+ max_seqlen_k=max(lens),
491
+ causal=_parity,
492
+ ) # [N,h,Dv]
493
+ # Scatter back
494
+ for i, (b, t, g) in enumerate(rows):
495
+ out[b, t, g] = o_pack[i]
496
+ return out
497
+ except Exception:
498
+ pass
499
+ # Dense batch per fixed L bucket as fallback
500
+ buckets: dict[int, list[int]] = {}
501
+ for i, L in enumerate(lens):
502
+ buckets.setdefault(L, []).append(i)
503
+ for L, idxs in buckets.items():
504
+ if L <= 0 or len(idxs) == 0:
505
+ continue
506
+ Nb = len(idxs)
507
+ Qb = torch.empty((Nb, h, Dk), dtype=Q.dtype, device=device)
508
+ Kb = torch.empty((Nb, L, Dk), dtype=K.dtype, device=device)
509
+ Vb = torch.empty((Nb, L, Dv), dtype=V.dtype, device=device)
510
+ tgt: list[tuple[int, int, int]] = []
511
+ for j, irow in enumerate(idxs):
512
+ b, t, g = rows[irow]
513
+ Qb[j] = Q[b, t, g]
514
+ # Rebuild fixed-length K/V for this row from ranges
515
+ write = 0
516
+ for rj in range(ranges.shape[3]):
517
+ s0 = int(ranges[b, t, g, rj, 0].item())
518
+ e0 = int(ranges[b, t, g, rj, 1].item())
519
+ if e0 <= s0:
520
+ continue
521
+ Lseg = e0 - s0
522
+ Kb[j, write : write + Lseg] = K[b, g, s0:e0]
523
+ Vb[j, write : write + Lseg] = V[b, g, s0:e0]
524
+ write += Lseg
525
+ tgt.append((b, t, g))
526
+ # Batched dense fallback for this bucket. Default non-causal; optionally force parity.
527
+ try:
528
+ q_rows = Qb.unsqueeze(1) # [Nb,1,h,Dk]
529
+ k_rows = Kb.unsqueeze(2).expand(Nb, L, h, Dk) # [Nb,L,h,Dk]
530
+ v_rows = Vb.unsqueeze(2).expand(Nb, L, h, Dv) # [Nb,L,h,Dv]
531
+ Ob = attention_fa2_dense_batch(q_rows, k_rows, v_rows, causal=_parity).squeeze(
532
+ 1
533
+ ) # [Nb,h,Dv]
534
+ for i, (b, t, g) in enumerate(tgt):
535
+ out[b, t, g] = Ob[i]
536
+ except Exception:
537
+ # Final fallback: per-row SDPA
538
+ for j, (b, t, g) in enumerate(tgt):
539
+ q_btgh = Qb[j].unsqueeze(0).unsqueeze(0) # [1,1,h,Dk]
540
+ k_btgh = Kb[j].unsqueeze(0).unsqueeze(0) # [1,1,L,Dk]
541
+ v_btgh = Vb[j].unsqueeze(0).unsqueeze(0) # [1,1,L,Dv]
542
+ out[b, t, g] = attention_bgh(q_btgh, k_btgh, v_btgh, causal=_parity)[0, 0]
543
+ return out
544
+
545
+
546
+ def selection_attention_varlen_all_v2(
547
+ Q: torch.Tensor,
548
+ K: torch.Tensor,
549
+ V: torch.Tensor,
550
+ ranges: torch.Tensor,
551
+ ) -> torch.Tensor:
552
+ """
553
+ Vectorized v2 varlen selection packer with FA‑2 varlen fast path and dense fallback.
554
+ - Eliminates Python loops for packing by using a difference-array mask to build per-row
555
+ allowed indices and flat-select K/V tokens.
556
+ - Uses causal=False for single‑query rows.
557
+ - Env: NSA_SEL_VARLEN_MIN_L to bypass on tiny rows (falls back to packed path).
558
+ """
559
+ B, S, G, h, Dk = Q.shape
560
+ # Parity override: when enabled, force causal=True to match packed reference
561
+ _parity = os.getenv("NSA_SEL_VARLEN_FORCE_PARITY", "0").lower() in ("1", "true", "yes", "on")
562
+ if _parity:
563
+ # Force exact parity by delegating to the packed reference
564
+ return grouped_selection_attention_packed(Q, K, V, ranges)
565
+ device = Q.device
566
+ Dv = V.shape[-1]
567
+ S_kv = K.shape[2]
568
+ out = torch.zeros((B, S, G, h, Dv), dtype=V.dtype, device=V.device)
569
+ if S_kv == 0:
570
+ return out
571
+
572
+ # Build allowed mask [B,S,G,S_kv]
573
+ n = ranges.shape[3]
574
+ starts = ranges[..., 0].to(torch.int64).clamp_(0, S_kv)
575
+ ends = ranges[..., 1].to(torch.int64).clamp_(0, S_kv)
576
+ BSG = B * S * G
577
+ starts_f = starts.reshape(BSG, n)
578
+ ends_f = ends.reshape(BSG, n)
579
+ diff = torch.zeros((BSG, S_kv + 1), dtype=torch.int32, device=device)
580
+ one = torch.ones_like(starts_f, dtype=diff.dtype, device=device)
581
+ diff.scatter_add_(1, starts_f, one)
582
+ diff.scatter_add_(1, ends_f, -one)
583
+ allowed = diff[:, :-1].cumsum(dim=1).gt(0) # [BSG,S_kv]
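+ # Difference-array trick: scatter +1 at each range start and -1 at each range end
+ # (both clamped to S_kv), then a running cumsum over key positions counts how many
+ # [start, end) ranges cover each index; > 0 marks the key as selected. The extra
+ # trailing column absorbs ends clamped to S_kv and is dropped by diff[:, :-1].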
584
+
585
+ lens_flat = allowed.sum(dim=1, dtype=torch.int32) # [BSG]
586
+ row_mask = lens_flat.gt(0)
587
+ if not torch.any(row_mask):
588
+ return out
589
+ try:
590
+ min_L = int(os.getenv("NSA_SEL_VARLEN_MIN_L", "0"))
591
+ except Exception:
592
+ min_L = 0
593
+ if min_L > 0 and int(lens_flat.max().item()) < min_L:
594
+ return grouped_selection_attention_packed(Q, K, V, ranges)
595
+
596
+ idx_rows = torch.nonzero(row_mask, as_tuple=False).squeeze(1) # [N]
597
+ N = int(idx_rows.numel())
598
+ # (b,t,g) indices for scatter
599
+ b_idx = idx_rows // (S * G)
600
+ rem = idx_rows % (S * G)
601
+ t_idx = rem // G
602
+ g_idx = rem % G
603
+
604
+ # Pack Q rows
605
+ Q_rows = Q.reshape(B * S * G, h, Dk)[idx_rows]
606
+
607
+ # Map rows to b,g to select K/V
608
+ bg_map = (
609
+ torch.arange(B, device=device).view(B, 1, 1) * G
610
+ + torch.arange(G, device=device).view(1, 1, G)
611
+ ).expand(B, S, G)
612
+ bg_rows = bg_map.reshape(B * S * G)[idx_rows]
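+ # bg_map flattens (b, g) into the row index b*G + g used by the reshaped K/V below,
+ # so each surviving (b, t, g) query row gathers its group's full key/value stream.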
613
+ K_bg = K.reshape(B * G, S_kv, Dk)[bg_rows]
614
+ V_bg = V.reshape(B * G, S_kv, Dv)[bg_rows]
615
+ allowed_rows = allowed[idx_rows]
616
+
617
+ total_k = int(lens_flat[row_mask].sum().item())
618
+ sel_k = K_bg[allowed_rows] # [total_k, Dk]
619
+ sel_v = V_bg[allowed_rows] # [total_k, Dv]
620
+ lens_sel = lens_flat[row_mask] # [N]
621
+
622
+ # Workspace-backed packing
623
+ ws = _get_varlen_workspace(
624
+ device,
625
+ dtype_q=Q.dtype,
626
+ dtype_k=K.dtype,
627
+ dtype_v=V.dtype,
628
+ h=h,
629
+ d_k=Dk,
630
+ d_v=Dv,
631
+ cap_N=N,
632
+ cap_total_k=total_k,
633
+ )
634
+ q_pack = ws["q"][:N]
635
+ k_pack = ws["k"][:total_k]
636
+ v_pack = ws["v"][:total_k]
637
+ cuq = ws["cuq"][: N + 1]
638
+ cuk = ws["cuk"][: N + 1]
639
+
640
+ q_pack.copy_(Q_rows)
641
+ k_pack.copy_(sel_k.unsqueeze(1).expand(total_k, h, Dk))
642
+ v_pack.copy_(sel_v.unsqueeze(1).expand(total_k, h, Dv))
643
+ cuq.copy_(torch.arange(0, N + 1, device=device, dtype=torch.int32))
644
+ cuk[0] = 0
645
+ torch.cumsum(lens_sel.to(torch.int32), dim=0, out=cuk[1:])
646
+
647
+ # FA‑2 varlen (non-causal)
648
+ ok, _why = fa2_supported_verbose(device, Q.dtype, Dk)
649
+ max_len = int(lens_sel.max().item())
650
+ if ok and is_flash_varlen_available():
651
+ try:
652
+ o_pack = attention_fa2_varlen(
653
+ q_pack,
654
+ k_pack,
655
+ v_pack,
656
+ cuq,
657
+ cuk,
658
+ max_seqlen_q=1,
659
+ max_seqlen_k=max_len,
660
+ causal=_parity,
661
+ )
662
+ out[b_idx, t_idx, g_idx] = o_pack
663
+ return out
664
+ except Exception:
665
+ pass
666
+
667
+ # Correctness-first fallback: masked SDPA over an allowed key mask
668
+ # This path matches the non-causal packed reference exactly and avoids
669
+ # potential packing/indexing pitfalls in dense-bucket fallbacks.
670
+ try:
671
+ return grouped_selection_attention_masked(Q, K, V, ranges)
672
+ except Exception:
673
+ pass
674
+
675
+ # Legacy dense fallback by length buckets (kept as a final fallback)
676
+ starts = cuk[:-1].to(torch.int64)
677
+ ends = cuk[1:].to(torch.int64)
678
+ Ls = (ends - starts).to(torch.int64)
679
+ for L in torch.unique(Ls).tolist():
680
+ if L <= 0:
681
+ continue
682
+ sel = (Ls == L).nonzero(as_tuple=False).squeeze(1)
683
+ if sel.numel() == 0:
684
+ continue
685
+ Nb = int(sel.numel())
686
+ Qb = q_pack[sel]
687
+ k_rows = torch.empty((Nb, L, h, Dk), dtype=K.dtype, device=device)
688
+ v_rows = torch.empty((Nb, L, h, Dv), dtype=V.dtype, device=device)
689
+ for j in range(Nb):
690
+ s0 = int(starts[sel[j]].item())
691
+ e0 = int(ends[sel[j]].item())
692
+ k_rows[j] = k_pack[s0:e0]
693
+ v_rows[j] = v_pack[s0:e0]
694
+ try:
695
+ Ob = attention_fa2_dense_batch(Qb.unsqueeze(1), k_rows, v_rows, causal=_parity).squeeze(1)
696
+ except Exception:
697
+ Ob = torch.empty((Nb, h, Dv), dtype=V.dtype, device=device)
698
+ for j in range(Nb):
699
+ Ob[j] = attention_bgh(Qb[j].unsqueeze(0), k_rows[j].unsqueeze(0), v_rows[j].unsqueeze(0), causal=_parity)[
700
+ 0
701
+ ]
702
+ out[b_idx[sel], t_idx[sel], g_idx[sel]] = Ob
703
+ return out
704
+
705
+
706
+ def grouped_selection_attention_masked(
707
+ Q: torch.Tensor, # [B,S,G,h,Dk]
708
+ K: torch.Tensor, # [B,G,S_kv,Dk]
709
+ V: torch.Tensor, # [B,G,S_kv,Dv]
710
+ ranges: torch.Tensor, # [B,S,G,n,2]
711
+ ) -> torch.Tensor: # [B,S,G,h,Dv]
712
+ """
713
+ Fully batched selection attention using an additive -inf mask.
714
+ Vectorized ranges→mask construction via prefix-sum trick (no Python loops).
715
+ """
716
+ B, S, G, h, Dk = Q.shape
717
+ S_kv = K.shape[2]
718
+ device = Q.device
719
+ if S_kv == 0:
720
+ return torch.zeros((B, S, G, h, V.shape[-1]), dtype=V.dtype, device=device)
721
+
722
+ # Vectorized allowed mask [B,S,G,S_kv] from ranges using difference array
723
+ n = ranges.shape[3]
724
+ starts = ranges[..., 0].to(torch.int64).clamp_(0, S_kv) # [B,S,G,n]
725
+ ends = ranges[..., 1].to(torch.int64).clamp_(0, S_kv) # [B,S,G,n]
726
+ BSG = B * S * G
727
+ starts_f = starts.reshape(BSG, n)
728
+ ends_f = ends.reshape(BSG, n)
729
+ diff = torch.zeros((BSG, S_kv + 1), dtype=torch.int32, device=device)
730
+ one = torch.ones_like(starts_f, dtype=diff.dtype, device=device)
731
+ diff.scatter_add_(1, starts_f, one)
732
+ diff.scatter_add_(1, ends_f, -one)
733
+ allowed = diff[:, :-1].cumsum(dim=1).gt(0).reshape(B, S, G, S_kv)
734
+
735
+ # Detect rows with no allowed keys (all False along key dimension)
736
+ row_has_any = allowed.any(dim=-1) # [B,S,G]
737
+ row_empty = ~row_has_any
738
+
739
+ # Prevent SDPA from seeing an all-−inf row which can produce NaNs.
740
+ # For originally empty rows, force a single safe key (index 0) to True,
741
+ # run SDPA, then zero their outputs afterward to preserve semantics.
742
+ if row_empty.any():
743
+ allowed_safe = allowed.clone()
744
+ flat = allowed_safe.view(B * S * G, S_kv)
745
+ row_empty_flat = row_empty.reshape(B * S * G)
746
+ if S_kv > 0:
747
+ flat[row_empty_flat, 0] = True
748
+ allowed_safe = flat.view_as(allowed_safe)
749
+ else:
750
+ allowed_safe = allowed
751
+
752
+ # Prepare SDPA tensors: [B,G*h,S, D*] and mask [B,G*h,S,S_kv]
753
+ Qf = Q.reshape(B, S, G * h, Dk).transpose(1, 2).contiguous() # [B,G*h,S,Dk]
754
+ Kf = K.unsqueeze(2).expand(-1, -1, h, -1, -1).reshape(B, G * h, S_kv, Dk).contiguous()
755
+ Vf = V.unsqueeze(2).expand(-1, -1, h, -1, -1).reshape(B, G * h, S_kv, V.shape[-1]).contiguous()
756
+ # Build additive mask in float32 for numerical stability with -inf
757
+ zeros = torch.zeros((B, G * h, S, S_kv), dtype=torch.float32, device=device)
758
+ neg_inf = torch.full((B, G * h, S, S_kv), float("-inf"), dtype=torch.float32, device=device)
759
+ Mf = torch.where(
760
+ allowed_safe.transpose(1, 2) # [B,G,S,S_kv]
761
+ .unsqueeze(2)
762
+ .expand(-1, -1, h, -1, -1)
763
+ .reshape(B, G * h, S, S_kv),
764
+ zeros,
765
+ neg_inf,
766
+ ).contiguous()
767
+
768
+ Of = F.scaled_dot_product_attention(Qf, Kf, Vf, attn_mask=Mf) # [B,G*h,S,Dv]
769
+ Of = Of.transpose(1, 2).reshape(B, S, G, h, V.shape[-1])
770
+ # Zero outputs for originally empty rows to preserve semantics
771
+ if row_empty.any():
772
+ Of = torch.where(row_has_any.unsqueeze(-1).unsqueeze(-1), Of, torch.zeros_like(Of))
773
+ return Of
774
+
775
+
776
+ # ===== FA-2 integration scaffolding (M1) =====
777
+
778
+
779
+ def _env_bool(name: str, default: bool = False) -> bool:
780
+ v = os.getenv(name, "1" if default else "0").lower()
781
+ return v in ("1", "true", "yes", "on")
782
+
783
+
784
+ def _is_sm89(device: torch.device) -> bool:
785
+ """Return True if running on CUDA device with SM 8.9 (Ada/RTX 4090)."""
786
+ if device.type != "cuda":
787
+ return False
788
+ try:
789
+ cap = torch.cuda.get_device_capability(device)
790
+ return cap == (8, 9)
791
+ except Exception:
792
+ return False
793
+
794
+
795
+ def _fa2_forced() -> bool:
796
+ """Return True if FA-2 usage is explicitly forced via env."""
797
+ return _env_bool("NSA_FA2_FORCE", False)
798
+
799
+
800
+ def sliding_window_attention_fa2(
801
+ Q: torch.Tensor, # [B,S,G,h,Dk]
802
+ K: torch.Tensor, # [B,G,S,Dk]
803
+ V: torch.Tensor, # [B,G,S,Dv]
804
+ w: int,
805
+ min_len_for_fa2: int = 16,
806
+ ) -> torch.Tensor:
807
+ """
808
+ FA-2 path for the sliding-window branch with safe fallbacks.
809
+ Attempts FA-2 (varlen, then dense per length bucket) when supported; otherwise falls back to masked SDPA to preserve numerics.
810
+ """
811
+ B, S, G, h, Dk = Q.shape
812
+ device = Q.device
813
+ # Policy: sliding-window FA-2 is disabled by default because of an API semantics
814
+ # limitation (the causal mask assumes keys start at position 0, which does not hold
815
+ # for shifted windows). Allow it only when explicitly enabled via NSA_ALLOW_SLIDING_FA2 or the force flags.
816
+ allow_sliding_fa2 = _env_bool("NSA_ALLOW_SLIDING_FA2", False)
817
+ # Guard: disable FA-2 on Ada (SM 8.9) unless explicitly forced
818
+ if _is_sm89(device) and not _fa2_forced():
819
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
820
+ log("fa2.gate_skip", branch="win", reason="sm89_guard", forced=bool(_fa2_forced()))
821
+ return sliding_window_attention(Q, K, V, w)
822
+ # Policy guard
823
+ if not allow_sliding_fa2 and not (
824
+ _env_bool("NSA_FA2_FORCE_VARLEN", False) or _env_bool("NSA_FA2_FORCE_DENSE", False)
825
+ ):
826
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
827
+ log("fa2.gate_skip", branch="win", reason="unsupported_sliding_semantics", forced=False)
828
+ return sliding_window_attention(Q, K, V, w)
829
+ # Compute effective per-row window lengths and buckets
830
+ lengths = compute_sliding_lengths(S, w, device)
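+ # Per-row effective window: the packing below slices K[b, g, max(0, (t+1)-w) : t+1]
+ # for each query position t, so the per-row key length is min(t+1, w); `lengths`
+ # holds these values and drives the min-length gate and length bucketing.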
831
+ max_len = int(lengths.max().item()) if lengths.numel() > 0 else 0
832
+ # Allow override via env
833
+ try:
834
+ min_len_for_fa2 = int(os.getenv("NSA_FA2_MIN_LEN_WIN", str(min_len_for_fa2)))
835
+ except Exception:
836
+ pass
837
+ # Disable sentinel: non-positive threshold disables FA‑2 entirely for this branch
838
+ if min_len_for_fa2 <= 0:
839
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
840
+ log("fa2.gate_skip", branch="win", reason="disabled_threshold")
841
+ return sliding_window_attention(Q, K, V, w)
842
+ buckets = build_length_buckets(lengths)
843
+ if buckets:
844
+ log("fa2.win.buckets", n=len(buckets), max_len=max_len)
845
+ # Build cu_seqlens per bucket (for future FA-2 varlen call)
846
+ for idx in buckets:
847
+ blens = lengths[idx]
848
+ _ = build_cu_seqlens_for_buckets(blens)
849
+ # Small-length auto-switch to masked SDPA
850
+ if max_len < min_len_for_fa2:
851
+ if os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes"):
852
+ log(
853
+ "fa2.gate_skip",
854
+ branch="win",
855
+ reason="below_min_len",
856
+ max_len=int(max_len),
857
+ min_len=int(min_len_for_fa2),
858
+ )
859
+ return sliding_window_attention(Q, K, V, w)
860
+ # Capability check
861
+ ok, why = fa2_supported_verbose(device, Q.dtype, Dk)
862
+ if not ok or not is_flash_varlen_available():
863
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
864
+ log("fa2.gate_skip", branch="win", reason=why, has_varlen=is_flash_varlen_available())
865
+ return sliding_window_attention(Q, K, V, w)
866
+ # Attempt FA-2 across all rows using varlen first, then dense per-bucket. Fallback to masked SDPA on error.
867
+ try:
868
+ B, S, G, h, Dk = Q.shape
869
+ Dv = V.shape[-1]
870
+ use_timing = os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes")
871
+ force_varlen = _env_bool("NSA_FA2_FORCE_VARLEN", False)
872
+ force_dense = _env_bool("NSA_FA2_FORCE_DENSE", False)
873
+ force_win_dense = _env_bool("NSA_WIN_FORCE_DENSE", False)
874
+ # Log histogram of lengths
875
+ if buckets:
876
+ uniq, counts = torch.unique(lengths, return_counts=True)
877
+ log("fa2.win.hist", uniq=uniq.tolist(), counts=counts.tolist())
878
+ # Try a single varlen call across all rows
879
+ if (is_flash_varlen_available() and not (force_dense or force_win_dense)) or force_varlen:
880
+ rows = []
881
+ len_rows = []
882
+ for t in range(S):
883
+ L = int(lengths[t].item())
884
+ for b in range(B):
885
+ for g in range(G):
886
+ rows.append((b, t, g))
887
+ len_rows.append(L)
888
+ N = len(rows)
889
+ if N > 0 and max_len >= 1:
890
+ use_safe_pack = (
891
+ torch.is_grad_enabled()
892
+ and (Q.requires_grad or K.requires_grad or V.requires_grad)
893
+ ) or _env_bool("NSA_TRAIN_SAFE_PACK", False)
894
+ if use_safe_pack:
895
+ # Autograd-safe packing via stack/cat to preserve graph links
896
+ q_pack = torch.stack([Q[b, t, g] for (b, t, g) in rows], dim=0) # [N,h,Dk]
897
+ k_rows = []
898
+ v_rows = []
899
+ for i, (b, t, g) in enumerate(rows):
900
+ L = len_rows[i]
901
+ if L > 0:
902
+ start = max(0, (t + 1) - w)
903
+ end = t + 1
904
+ seg_k = K[b, g, start:end].unsqueeze(1).expand(-1, h, -1) # [L,h,Dk]
905
+ seg_v = V[b, g, start:end].unsqueeze(1).expand(-1, h, -1) # [L,h,Dv]
906
+ k_rows.append(seg_k)
907
+ v_rows.append(seg_v)
908
+ total_k = int(sum(len_rows))
909
+ if total_k > 0:
910
+ k_pack = torch.cat(k_rows, dim=0)
911
+ v_pack = torch.cat(v_rows, dim=0)
912
+ else:
913
+ k_pack = torch.zeros((0, h, Dk), dtype=K.dtype, device=K.device)
914
+ v_pack = torch.zeros((0, h, Dv), dtype=V.dtype, device=V.device)
915
+ cuq = torch.arange(0, N + 1, device=Q.device, dtype=torch.int32)
916
+ lens_t = torch.tensor(len_rows, dtype=torch.int32, device=Q.device)
917
+ cuk = torch.cumsum(torch.nn.functional.pad(lens_t, (1, 0)), dim=0)
918
+ else:
919
+ total_k = int(sum(len_rows))
920
+ ws = _get_varlen_workspace(
921
+ Q.device, Q.dtype, K.dtype, V.dtype, h, Dk, Dv, N, total_k
922
+ )
923
+ q_pack = ws["q"][:N]
924
+ k_pack = ws["k"][:total_k]
925
+ v_pack = ws["v"][:total_k]
926
+ # Build cumulative sequence lengths for Q and K
927
+ cuq = ws["cuq"][: N + 1]
928
+ cuq.copy_(torch.arange(0, N + 1, device=Q.device, dtype=torch.int32))
929
+ lens_t = torch.tensor(len_rows, dtype=torch.int32, device=Q.device)
930
+ cuk = ws["cuk"][: N + 1]
931
+ torch.cumsum(torch.nn.functional.pad(lens_t, (1, 0)), dim=0, out=cuk)
932
+ # Fill packs
933
+ write_pos = 0
934
+ for i, (b, t, g) in enumerate(rows):
935
+ L = len_rows[i]
936
+ q_pack[i] = Q[b, t, g]
937
+ if L > 0:
938
+ start = max(0, (t + 1) - w)
939
+ end = t + 1
940
+ seg_k = K[b, g, start:end] # [L,Dk]
941
+ seg_v = V[b, g, start:end] # [L,Dv]
942
+ assert (write_pos + L) <= total_k, "varlen K/V pack overflow"
943
+ k_pack[write_pos : write_pos + L] = seg_k.unsqueeze(1).expand(L, h, Dk)
944
+ v_pack[write_pos : write_pos + L] = seg_v.unsqueeze(1).expand(L, h, Dv)
945
+ write_pos += L
946
+ # Optional integrity checks (debug only)
947
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
948
+ try:
949
+ assert cuq.numel() == (N + 1), "cuq length mismatch"
950
+ assert cuk.numel() == (N + 1), "cuk length mismatch"
951
+ assert int(cuk[-1].item()) == int(total_k), "cuk total_k mismatch"
952
+ if total_k > 0 and N > 0:
953
+ probe = [0, N // 2, N - 1] if N >= 3 else [0]
954
+ for i in probe:
955
+ L_i = int(len_rows[i])
956
+ b_i, t_i, g_i = rows[i]
957
+ s_i = int(max(0, (t_i + 1) - w))
958
+ e_i = int(t_i + 1)
959
+ if L_i > 0:
960
+ ks = k_pack[cuk[i] : cuk[i + 1]] # [L,h,Dk]
961
+ kv = K[b_i, g_i, s_i:e_i].unsqueeze(1).expand(-1, h, -1)
962
+ if ks.shape != kv.shape:
963
+ log(
964
+ "warn.fa2_win_pack_shape",
965
+ row=i,
966
+ ks=ks.shape,
967
+ kv=kv.shape,
968
+ )
969
+ else:
970
+ md = float((ks - kv).abs().max().item())
971
+ if md > 1e-3:
972
+ log(
973
+ "warn.fa2_win_pack_mismatch",
974
+ row=i,
975
+ L=L_i,
976
+ max_diff=md,
977
+ )
978
+ except Exception:
979
+ pass
980
+
981
+ if use_timing:
982
+ t0 = time.perf_counter()
983
+ o_pack = attention_fa2_varlen(
984
+ q_pack,
985
+ k_pack,
986
+ v_pack,
987
+ cuq,
988
+ cuk,
989
+ max_seqlen_q=1,
990
+ max_seqlen_k=max_len,
991
+ causal=False,
992
+ ) # [N,h,Dv]
993
+ if not torch.isfinite(o_pack).all():
994
+ log("warn.fa2_win_varlen_nonfinite")
995
+ return sliding_window_attention(Q, K, V, w)
996
+ if use_timing:
997
+ dt = (time.perf_counter() - t0) * 1e3
998
+ log("fa2.win.varlen_all", N=int(N), total_k=int(total_k), ms=dt)
999
+ # Scatter back
1000
+ out = torch.zeros((B, S, G, h, Dv), dtype=V.dtype, device=V.device)
1001
+ for i, (b, t, g) in enumerate(rows):
1002
+ out[b, t, g] = o_pack[i]
1003
+ return out
1004
+ out = torch.zeros((B, S, G, h, Dv), dtype=V.dtype, device=V.device)
1005
+ for idx in buckets:
1006
+ if idx.numel() == 0:
1007
+ continue
1008
+ L = int(lengths[idx[0]].item())
1009
+ # Collect rows for this bucket
1010
+ rows_q = [] # [N,h,Dk]
1011
+ rows_k = [] # [N,L,Dk]
1012
+ rows_v = [] # [N,L,Dv]
1013
+ tgt = []
1014
+ for t in idx.tolist():
1015
+ start = max(0, (t + 1) - w)
1016
+ end = t + 1
1017
+ for b in range(B):
1018
+ for g in range(G):
1019
+ rows_q.append(Q[b, t, g])
1020
+ rows_k.append(K[b, g, start:end])
1021
+ rows_v.append(V[b, g, start:end])
1022
+ tgt.append((b, t, g))
1023
+ if not rows_q:
1024
+ continue
1025
+ N = len(rows_q)
1026
+ Qb = torch.stack(rows_q, dim=0) # [N,h,Dk]
1027
+ Kb = torch.stack(rows_k, dim=0) # [N,L,Dk]
1028
+ Vb = torch.stack(rows_v, dim=0) # [N,L,Dv]
1029
+ if is_flash_varlen_available() and not (force_dense or force_win_dense):
1030
+ # Pack varlen (constant L here, but use API for generality)
1031
+ q_pack = Qb # [N,h,Dk]
1032
+ k_pack = Kb.reshape(N * L, Dk).unsqueeze(1).expand(-1, h, -1).reshape(N * L, h, Dk)
1033
+ v_pack = Vb.reshape(N * L, Dv).unsqueeze(1).expand(-1, h, -1).reshape(N * L, h, Dv)
1034
+ cuq = torch.arange(0, N + 1, device=Q.device, dtype=torch.int32)
1035
+ cuk = torch.arange(0, (N + 1) * L, step=L, device=Q.device, dtype=torch.int32)
1036
+ if use_timing:
1037
+ t0 = time.perf_counter()
1038
+ o_pack = attention_fa2_varlen(
1039
+ q_pack,
1040
+ k_pack,
1041
+ v_pack,
1042
+ cuq,
1043
+ cuk,
1044
+ max_seqlen_q=1,
1045
+ max_seqlen_k=L,
1046
+ causal=False,
1047
+ ) # [N,h,Dv]
1048
+ if not torch.isfinite(o_pack).all():
1049
+ log("warn.fa2_win_bucket_nonfinite")
1050
+ return sliding_window_attention(Q, K, V, w)
1051
+ if use_timing:
1052
+ dt = (time.perf_counter() - t0) * 1e3
1053
+ log("fa2.win.bucket", path="varlen", L=L, N=int(N), ms=dt)
1054
+ Ob = o_pack # [N,h,Dv]
1055
+ else:
1056
+ q_rows = Qb.unsqueeze(1) # [N,1,h,Dk]
1057
+ k_rows = Kb.unsqueeze(2).expand(N, L, h, Dk)
1058
+ v_rows = Vb.unsqueeze(2).expand(N, L, h, Dv)
1059
+ if use_timing:
1060
+ t0 = time.perf_counter()
1061
+ Ob = attention_fa2_dense_batch(q_rows, k_rows, v_rows, causal=False).squeeze(
1062
+ 1
1063
+ ) # [N,h,Dv]
1064
+ if use_timing:
1065
+ dt = (time.perf_counter() - t0) * 1e3
1066
+ log("fa2.win.bucket", path="dense", L=L, N=int(N), ms=dt)
1067
+ for i, (b, t, g) in enumerate(tgt):
1068
+ out[b, t, g] = Ob[i]
1069
+ return out
1070
+ except Exception as e:
1071
+ log("warn.fa2_unexpected_fallback", branch="win", error=str(e)[:100])
1072
+ return sliding_window_attention_masked(Q, K, V, w)
1073
+
1074
+
1075
+ def compressed_attention_fa2(
1076
+ Q: torch.Tensor, # [B,S,G,h,Dk]
1077
+ K_cmp: torch.Tensor, # [B,G,S_cmp,Dk]
1078
+ V_cmp: torch.Tensor, # [B,G,S_cmp,Dv]
1079
+ l: int,
1080
+ d: int,
1081
+ min_len_for_fa2: int = 16,
1082
+ ) -> torch.Tensor:
1083
+ """
1084
+ FA-2 path for the compressed branch with safe fallbacks.
1085
+ Attempts FA-2 (varlen, then dense per length bucket) when supported; otherwise falls back to masked SDPA to preserve numerics.
1086
+ """
1087
+ B, S, G, h, Dk = Q.shape
1088
+ device = Q.device
1089
+ # Guard: disable FA-2 on Ada (SM 8.9) unless explicitly forced
1090
+ if _is_sm89(device) and not _fa2_forced():
1091
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1092
+ log("fa2.gate_skip", branch="cmp", reason="sm89_guard", forced=bool(_fa2_forced()))
1093
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1094
+ S_cmp = K_cmp.shape[2]
1095
+ if S_cmp == 0:
1096
+ return torch.zeros((B, S, G, h, V_cmp.shape[-1]), dtype=V_cmp.dtype, device=V_cmp.device)
1097
+ num_cmp = compute_compressed_lengths(S, l, d, S_cmp, device)
1098
+ max_len = int(num_cmp.max().item()) if num_cmp.numel() > 0 else 0
1099
+ try:
1100
+ min_len_for_fa2 = int(os.getenv("NSA_FA2_MIN_LEN_CMP", str(min_len_for_fa2)))
1101
+ except Exception:
1102
+ pass
1103
+ # Disable sentinel: non-positive threshold disables FA‑2 entirely for this branch
1104
+ if min_len_for_fa2 <= 0:
1105
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1106
+ log("fa2.gate_skip", branch="cmp", reason="disabled_threshold")
1107
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1108
+ buckets = build_length_buckets(num_cmp)
1109
+ if buckets:
1110
+ log("fa2.cmp.buckets", n=len(buckets), max_len=max_len)
1111
+ for idx in buckets:
1112
+ blens = num_cmp[idx]
1113
+ _ = build_cu_seqlens_for_buckets(blens)
1114
+ if max_len < min_len_for_fa2:
1115
+ if os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes"):
1116
+ log(
1117
+ "fa2.gate_skip",
1118
+ branch="cmp",
1119
+ reason="below_min_len",
1120
+ max_len=int(max_len),
1121
+ min_len=int(min_len_for_fa2),
1122
+ )
1123
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1124
+ ok, why = fa2_supported_verbose(device, Q.dtype, Dk)
1125
+ if not ok or not is_flash_varlen_available():
1126
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1127
+ log("fa2.gate_skip", branch="cmp", reason=why, has_varlen=is_flash_varlen_available())
1128
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1129
+ try:
1130
+ Dv = V_cmp.shape[-1]
1131
+ use_timing = os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes")
1132
+ # Log histogram of lengths
1133
+ if buckets:
1134
+ uniq, counts = torch.unique(num_cmp, return_counts=True)
1135
+ log("fa2.cmp.hist", uniq=uniq.tolist(), counts=counts.tolist())
1136
+ # Try single varlen across all rows with L>0
1137
+ force_varlen = _env_bool("NSA_FA2_FORCE_VARLEN", False)
1138
+ force_dense = _env_bool("NSA_FA2_FORCE_DENSE", False)
1139
+ if ((is_flash_varlen_available() and not force_dense) or force_varlen) and max_len >= 1:
1140
+ rows = []
1141
+ len_rows = []
1142
+ for t in range(S):
1143
+ L = int(num_cmp[t].item())
1144
+ for b in range(B):
1145
+ for g in range(G):
1146
+ if L > 0:
1147
+ rows.append((b, t, g))
1148
+ len_rows.append(L)
1149
+ N = len(rows)
1150
+ if N > 0:
1151
+ total_k = int(sum(len_rows))
1152
+ use_safe_pack = (
1153
+ torch.is_grad_enabled()
1154
+ and (Q.requires_grad or K_cmp.requires_grad or V_cmp.requires_grad)
1155
+ ) or _env_bool("NSA_TRAIN_SAFE_PACK", False)
1156
+ if use_safe_pack:
1157
+ q_pack = torch.stack([Q[b, t, g] for (b, t, g) in rows], dim=0)
1158
+ k_rows = []
1159
+ v_rows = []
1160
+ for (b, t, g), L in zip(rows, len_rows):
1161
+ if L > 0:
1162
+ seg_k = K_cmp[b, g, :L]
1163
+ seg_v = V_cmp[b, g, :L]
1164
+ k_rows.append(seg_k.unsqueeze(1).expand(-1, h, -1)) # [L,h,Dk]
1165
+ v_rows.append(seg_v.unsqueeze(1).expand(-1, h, -1)) # [L,h,Dv]
1166
+ if total_k > 0:
1167
+ k_pack = torch.cat(k_rows, dim=0)
1168
+ v_pack = torch.cat(v_rows, dim=0)
1169
+ else:
1170
+ k_pack = torch.zeros((0, h, Dk), dtype=K_cmp.dtype, device=K_cmp.device)
1171
+ v_pack = torch.zeros((0, h, Dv), dtype=V_cmp.dtype, device=V_cmp.device)
1172
+ cuq = torch.arange(0, N + 1, device=Q.device, dtype=torch.int32)
1173
+ lens_t = torch.tensor(len_rows, dtype=torch.int32, device=Q.device)
1174
+ cuk = torch.cumsum(torch.nn.functional.pad(lens_t, (1, 0)), dim=0)
1175
+ else:
1176
+ ws = _get_varlen_workspace(
1177
+ Q.device, Q.dtype, K_cmp.dtype, V_cmp.dtype, h, Dk, Dv, N, total_k
1178
+ )
1179
+ q_pack = ws["q"][:N]
1180
+ k_pack = ws["k"][:total_k]
1181
+ v_pack = ws["v"][:total_k]
1182
+ cuq = ws["cuq"][: N + 1]
1183
+ cuq.copy_(torch.arange(0, N + 1, device=Q.device, dtype=torch.int32))
1184
+ lens_t = torch.tensor(len_rows, dtype=torch.int32, device=Q.device)
1185
+ cuk = ws["cuk"][: N + 1]
1186
+ torch.cumsum(torch.nn.functional.pad(lens_t, (1, 0)), dim=0, out=cuk)
1187
+ write_pos = 0
1188
+ for i, (b, t, g) in enumerate(rows):
1189
+ L = len_rows[i]
1190
+ q_pack[i] = Q[b, t, g]
1191
+ if L > 0:
1192
+ seg_k = K_cmp[b, g, :L]
1193
+ seg_v = V_cmp[b, g, :L]
1194
+ assert (write_pos + L) <= total_k, "varlen cmp K/V pack overflow"
1195
+ k_pack[write_pos : write_pos + L] = seg_k.unsqueeze(1).expand(L, h, Dk)
1196
+ v_pack[write_pos : write_pos + L] = seg_v.unsqueeze(1).expand(L, h, Dv)
1197
+ write_pos += L
1198
+ if use_timing:
1199
+ t0 = time.perf_counter()
1200
+ o_pack = attention_fa2_varlen(
1201
+ q_pack,
1202
+ k_pack,
1203
+ v_pack,
1204
+ cuq,
1205
+ cuk,
1206
+ max_seqlen_q=1,
1207
+ max_seqlen_k=max_len,
1208
+ causal=False,
1209
+ ) # [N,h,Dv]
1210
+ if not torch.isfinite(o_pack).all():
1211
+ log("warn.fa2_cmp_varlen_nonfinite")
1212
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1213
+ if use_timing:
1214
+ dt = (time.perf_counter() - t0) * 1e3
1215
+ log("fa2.cmp.varlen_all", N=int(N), total_k=int(total_k), ms=dt)
1216
+ out = torch.zeros((B, S, G, h, Dv), dtype=V_cmp.dtype, device=V_cmp.device)
1217
+ for i, (b, t, g) in enumerate(rows):
1218
+ out[b, t, g] = o_pack[i]
1219
+ return out
1220
+ out = torch.zeros((B, S, G, h, Dv), dtype=V_cmp.dtype, device=V_cmp.device)
1221
+ for idx in buckets:
1222
+ if idx.numel() == 0:
1223
+ continue
1224
+ L = int(num_cmp[idx[0]].item())
1225
+ rows_q = [] # [N,h,Dk]
1226
+ rows_k = [] # [N,L,Dk]
1227
+ rows_v = [] # [N,L, Dv]
1228
+ tgt = []
1229
+ for t in idx.tolist():
1230
+ if L <= 0:
1231
+ continue
1232
+ for b in range(B):
1233
+ for g in range(G):
1234
+ rows_q.append(Q[b, t, g])
1235
+ rows_k.append(K_cmp[b, g, :L])
1236
+ rows_v.append(V_cmp[b, g, :L])
1237
+ tgt.append((b, t, g))
1238
+ if not rows_q:
1239
+ continue
1240
+ N = len(rows_q)
1241
+ Qb = torch.stack(rows_q, dim=0) # [N,h,Dk]
1242
+ Kb = torch.stack(rows_k, dim=0) # [N,L,Dk]
1243
+ Vb = torch.stack(rows_v, dim=0) # [N,L,Dv]
1244
+ if is_flash_varlen_available() and not force_dense:
1245
+ q_pack = Qb
1246
+ k_pack = Kb.reshape(N * L, Dk).unsqueeze(1).expand(-1, h, -1).reshape(N * L, h, Dk)
1247
+ v_pack = Vb.reshape(N * L, Dv).unsqueeze(1).expand(-1, h, -1).reshape(N * L, h, Dv)
1248
+ cuq = torch.arange(0, N + 1, device=Q.device, dtype=torch.int32)
1249
+ cuk = torch.arange(0, (N + 1) * L, step=L, device=Q.device, dtype=torch.int32)
1250
+ if use_timing:
1251
+ t0 = time.perf_counter()
1252
+ o_pack = attention_fa2_varlen(
1253
+ q_pack,
1254
+ k_pack,
1255
+ v_pack,
1256
+ cuq,
1257
+ cuk,
1258
+ max_seqlen_q=1,
1259
+ max_seqlen_k=L,
1260
+ causal=False,
1261
+ ) # [N,h,Dv]
1262
+ if use_timing:
1263
+ dt = (time.perf_counter() - t0) * 1e3
1264
+ log("fa2.cmp.bucket", path="varlen", L=L, N=int(N), ms=dt)
1265
+ Ob = o_pack
1266
+ else:
1267
+ q_rows = Qb.unsqueeze(1)
1268
+ k_rows = Kb.unsqueeze(2).expand(N, L, h, Dk)
1269
+ v_rows = Vb.unsqueeze(2).expand(N, L, h, Dv)
1270
+ if use_timing:
1271
+ t0 = time.perf_counter()
1272
+ Ob = attention_fa2_dense_batch(q_rows, k_rows, v_rows, causal=True).squeeze(1)
1273
+ if not torch.isfinite(Ob).all():
1274
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1275
+ if use_timing:
1276
+ dt = (time.perf_counter() - t0) * 1e3
1277
+ log("fa2.cmp.bucket", path="dense", L=L, N=int(N), ms=dt)
1278
+ for i, (b, t, g) in enumerate(tgt):
1279
+ out[b, t, g] = Ob[i]
1280
+ return out
1281
+ except Exception as e:
1282
+ log("warn.fa2_unexpected_fallback", branch="cmp", error=str(e)[:100])
1283
+ return batched_causal_attention_compressed_masked(Q, K_cmp, V_cmp, l, d)
1284
+
1285
+
1286
+ def sliding_window_attention_fa2_decode(
1287
+ q_t: torch.Tensor, K_win: torch.Tensor, V_win: torch.Tensor, w: int
1288
+ ) -> torch.Tensor:
1289
+ B, G, h, Dk = q_t.shape
1290
+ # Guard: disable FA-2 on Ada (SM 8.9) unless explicitly forced
1291
+ if _is_sm89(q_t.device) and not _fa2_forced():
1292
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1293
+ log(
1294
+ "fa2.gate_skip",
1295
+ branch="win.decode",
1296
+ reason="sm89_guard",
1297
+ forced=bool(_fa2_forced()),
1298
+ )
1299
+ end = K_win.shape[2]
1300
+ win_len = min(w, end)
1301
+ if win_len == 0:
1302
+ return torch.zeros((B, G, h, V_win.shape[-1]), dtype=V_win.dtype, device=V_win.device)
1303
+ start = end - win_len
1304
+ return attention_bgh(q_t, K_win[:, :, start:end], V_win[:, :, start:end], causal=True)
1305
+ end = K_win.shape[2]
1306
+ win_len = min(w, end)
1307
+ if win_len == 0:
1308
+ return torch.zeros((B, G, h, V_win.shape[-1]), dtype=V_win.dtype, device=V_win.device)
1309
+ # CPU or unsupported: direct SDPA for parity
1310
+ ok, why = fa2_supported_verbose(q_t.device, q_t.dtype, Dk)
1311
+ if not ok:
1312
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1313
+ log("fa2.gate_skip", branch="win.decode", reason=why)
1314
+ start = end - win_len
1315
+ return attention_bgh(q_t, K_win[:, :, start:end], V_win[:, :, start:end], causal=True)
1316
+ # Small-length auto-switch for decode
1317
+ try:
1318
+ min_len = int(os.getenv("NSA_FA2_MIN_LEN_WIN", "16"))
1319
+ except Exception:
1320
+ min_len = 16
1321
+ if min_len < 1:
1322
+ min_len = 1
1323
+ if win_len < min_len:
1324
+ if os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes"):
1325
+ log(
1326
+ "fa2.gate_skip",
1327
+ branch="win.decode",
1328
+ reason="below_min_len",
1329
+ win_len=int(win_len),
1330
+ min_len=int(min_len),
1331
+ )
1332
+ start = end - win_len
1333
+ return attention_bgh(q_t, K_win[:, :, start:end], V_win[:, :, start:end], causal=True)
1334
+ start = end - win_len
1335
+ k = K_win[:, :, start:end]
1336
+ v = V_win[:, :, start:end]
1337
+ N = B * G
1338
+ q_rows = q_t.reshape(N, h, Dk).unsqueeze(1) # [N,1,h,Dk]
1339
+ k_rows = k.reshape(N, win_len, Dk).unsqueeze(2).expand(N, win_len, h, Dk)
1340
+ v_rows = v.reshape(N, win_len, v.shape[-1]).unsqueeze(2).expand(N, win_len, h, v.shape[-1])
1341
+ try:
1342
+ o = attention_fa2_dense_batch(q_rows, k_rows, v_rows, causal=False) # [N,1,h,Dv]
1343
+ o = o.squeeze(1).reshape(B, G, h, -1)
1344
+ if not torch.isfinite(o).all():
1345
+ return attention_bgh(q_t, k, v, causal=True)
1346
+ return o
1347
+ except Exception as e:
1348
+ log("warn.fa2_unexpected_fallback", branch="win.decode", error=str(e)[:100])
1349
+ return attention_bgh(q_t, k, v, causal=True)
1350
+
1351
+
1352
+ def compressed_attention_fa2_decode(
1353
+ q_t: torch.Tensor, K_cmp: torch.Tensor, V_cmp: torch.Tensor, L: int
1354
+ ) -> torch.Tensor:
1355
+ if L <= 0:
1356
+ B, G, h, _ = q_t.shape
1357
+ return torch.zeros((B, G, h, V_cmp.shape[-1]), dtype=V_cmp.dtype, device=V_cmp.device)
1358
+ B, G, h, Dk = q_t.shape
1359
+ # Guard: disable FA-2 on Ada (SM 8.9) unless explicitly forced
1360
+ if _is_sm89(q_t.device) and not _fa2_forced():
1361
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1362
+ log(
1363
+ "fa2.gate_skip",
1364
+ branch="cmp.decode",
1365
+ reason="sm89_guard",
1366
+ forced=bool(_fa2_forced()),
1367
+ )
1368
+ return attention_bgh(q_t, K_cmp[:, :, :L], V_cmp[:, :, :L], causal=True)
1369
+ ok, why = fa2_supported_verbose(q_t.device, q_t.dtype, Dk)
1370
+ if not ok:
1371
+ if os.getenv("NSA_SDPA_AUDIT", "0").lower() in ("1", "true", "yes"):
1372
+ log("fa2.gate_skip", branch="cmp.decode", reason=why)
1373
+ return attention_bgh(q_t, K_cmp[:, :, :L], V_cmp[:, :, :L], causal=True)
1374
+ try:
1375
+ min_len = int(os.getenv("NSA_FA2_MIN_LEN_CMP", "16"))
1376
+ except Exception:
1377
+ min_len = 16
1378
+ if min_len < 1:
1379
+ min_len = 1
1380
+ if L < min_len:
1381
+ if os.getenv("NSA_DEBUG_TIMING", "0").lower() in ("1", "true", "yes"):
1382
+ log(
1383
+ "fa2.gate_skip",
1384
+ branch="cmp.decode",
1385
+ reason="below_min_len",
1386
+ L=int(L),
1387
+ min_len=int(min_len),
1388
+ )
1389
+ return attention_bgh(q_t, K_cmp[:, :, :L], V_cmp[:, :, :L], causal=True)
1390
+ k = K_cmp[:, :, :L]
1391
+ v = V_cmp[:, :, :L]
1392
+ N = B * G
1393
+ q_rows = q_t.reshape(N, h, Dk).unsqueeze(1)
1394
+ k_rows = k.reshape(N, L, Dk).unsqueeze(2).expand(N, L, h, Dk)
1395
+ v_rows = v.reshape(N, L, v.shape[-1]).unsqueeze(2).expand(N, L, h, v.shape[-1])
1396
+ try:
1397
+ o = attention_fa2_dense_batch(q_rows, k_rows, v_rows, causal=False)
1398
+ o = o.squeeze(1).reshape(B, G, h, -1)
1399
+ if not torch.isfinite(o).all():
1400
+ return attention_bgh(q_t, k, v, causal=True)
1401
+ return o
1402
+ except Exception:
1403
+ return attention_bgh(q_t, k, v, causal=True)
nsa/core/block_index.py ADDED
@@ -0,0 +1,100 @@
1
+ from __future__ import annotations
2
+ from dataclasses import dataclass
3
+ from typing import List, Tuple
4
+
5
+ import torch
6
+
7
+
8
+ @dataclass
9
+ class BlockMeta:
10
+ l: int
11
+ d: int
12
+ l_sel: int
13
+ n_sel: int
14
+ w: int
15
+ cmp_starts: torch.Tensor # [S_cmp]
16
+ sel_starts: torch.Tensor # [S_sel]
17
+ # CSR representation: (indptr, indices, values) mapping cmp_idx -> {sel_idx: weight}
18
+ M_csl_indptr: torch.Tensor
19
+ M_csl_indices: torch.Tensor
20
+ M_csl_values: torch.Tensor
21
+ # COO representation for fast batched matmul
22
+ M_csl_coo_indices: torch.Tensor # [2, nnz] rows, cols
23
+ M_csl_coo_values: torch.Tensor # [nnz]
24
+
25
+
26
+ def build_block_starts(
27
+ seq_len: int, l: int, d: int, l_sel: int
28
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
29
+ if d <= 0 or l <= 0 or l_sel <= 0:
30
+ raise ValueError("Block parameters must be positive")
31
+ # compression blocks (overlapped)
32
+ max_cmp = 0 if seq_len < l else (seq_len - l) // d + 1
33
+ cmp_starts = torch.arange(max_cmp, dtype=torch.int32) * d
34
+ # selection blocks (non-overlapped)
35
+ max_sel = 0 if seq_len <= 0 else (seq_len + l_sel - 1) // l_sel
36
+ sel_starts = torch.arange(max_sel, dtype=torch.int32) * l_sel
37
+ return cmp_starts, sel_starts
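+ # Example: seq_len=64, l=32, d=16, l_sel=64 gives max_cmp = (64-32)//16 + 1 = 3, so
+ # cmp_starts=[0,16,32] (overlapped windows [0,32), [16,48), [32,64)), and
+ # max_sel = ceil(64/64) = 1, so sel_starts=[0].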
38
+
39
+
40
+ def _overlap_len(a0: int, a1: int, b0: int, b1: int) -> int:
41
+ return max(0, min(a1, b1) - max(a0, b0))
42
+
43
+
44
+ def build_M_csl_csr(
45
+ seq_len: int, l: int, d: int, l_sel: int
46
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
47
+ # Build CSR with fractional-overlap weights from cmp blocks to sel blocks
48
+ cmp_starts, sel_starts = build_block_starts(seq_len, l, d, l_sel)
49
+ indptr = [0]
50
+ indices: List[int] = []
51
+ values: List[float] = []
52
+ for cmp_i, s in enumerate(cmp_starts.tolist()):
53
+ a0, a1 = s, s + l
54
+ total = 0
55
+ row_pairs: List[Tuple[int, int]] = []
56
+ for sel_j, t in enumerate(sel_starts.tolist()):
57
+ b0, b1 = t, t + l_sel
58
+ ov = _overlap_len(a0, a1, b0, b1)
59
+ if ov > 0:
60
+ row_pairs.append((sel_j, ov))
61
+ total += ov
62
+ # normalize by total overlap to get fractional weights
63
+ if total > 0:
64
+ for sel_j, ov in row_pairs:
65
+ indices.append(sel_j)
66
+ values.append(ov / total)
67
+ indptr.append(len(indices))
68
+ return (
69
+ torch.tensor(indptr, dtype=torch.int32),
70
+ torch.tensor(indices, dtype=torch.int32),
71
+ torch.tensor(values, dtype=torch.float32),
72
+ )
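+ # Example of the fractional weights: with seq_len=128, l=32, d=16, l_sel=64 the cmp
+ # block starting at 48 spans [48,80) and overlaps sel blocks [0,64) and [64,128) by
+ # 16 tokens each, so its CSR row holds weights {0: 0.5, 1: 0.5}; a cmp block fully
+ # inside one sel block gets a single weight of 1.0.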
73
+
74
+
75
+ def build_block_meta(seq_len: int, l: int, d: int, l_sel: int, n_sel: int, w: int) -> BlockMeta:
76
+ if l % d != 0 or l_sel % d != 0:
77
+ # Enforce divisibility by default (per PRD); general overlaps allowed later if needed
78
+ raise ValueError("Require d|l and d|l_sel in M0")
79
+ cmp_starts, sel_starts = build_block_starts(seq_len, l, d, l_sel)
80
+ indptr, indices, values = build_M_csl_csr(seq_len, l, d, l_sel)
81
+ # Build COO from CSR
82
+ rows: List[int] = []
83
+ for r in range(len(cmp_starts)):
84
+ start, end = int(indptr[r].item()), int(indptr[r + 1].item())
85
+ rows.extend([r] * (end - start))
86
+ coo_indices = torch.stack([torch.tensor(rows, dtype=torch.int32), indices.clone()], dim=0)
87
+ return BlockMeta(
88
+ l=l,
89
+ d=d,
90
+ l_sel=l_sel,
91
+ n_sel=n_sel,
92
+ w=w,
93
+ cmp_starts=cmp_starts,
94
+ sel_starts=sel_starts,
95
+ M_csl_indptr=indptr,
96
+ M_csl_indices=indices,
97
+ M_csl_values=values,
98
+ M_csl_coo_indices=coo_indices,
99
+ M_csl_coo_values=values.clone(),
100
+ )
nsa/core/collate.py ADDED
@@ -0,0 +1,45 @@
1
+ from __future__ import annotations
2
+ from typing import List, Tuple
3
+
4
+ import torch
5
+
6
+
7
+ def collate_token_batch(
8
+ sequences: List[List[int]],
9
+ *,
10
+ pad_id: int = 0,
11
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
12
+ """
13
+ Collate token id sequences (var-length) into padded tensors and masks with label shift.
14
+
15
+ Args:
16
+ sequences: list of token id lists
17
+ pad_id: id used for padding
18
+ Returns:
19
+ input_ids: [B,S_max]
20
+ labels: [B,S_max] (next-token labels; last position masked out)
21
+ attn_mask: [B,S_max] (True for valid tokens)
22
+ loss_mask: [B,S_max] (True for positions to include in loss)
23
+ lengths: [B]
24
+ cu_seqlens:[B+1] cumulative lengths
25
+ """
26
+ B = len(sequences)
27
+ lengths = torch.tensor([len(seq) for seq in sequences], dtype=torch.int32)
28
+ S_max = int(lengths.max().item()) if B > 0 else 0
29
+ input_ids = torch.full((B, S_max), pad_id, dtype=torch.long)
30
+ labels = torch.full((B, S_max), pad_id, dtype=torch.long)
31
+ attn_mask = torch.zeros((B, S_max), dtype=torch.bool)
32
+ loss_mask = torch.zeros((B, S_max), dtype=torch.bool)
33
+ for b, seq in enumerate(sequences):
34
+ L = len(seq)
35
+ if L == 0:
36
+ continue
37
+ input_ids[b, :L] = torch.tensor(seq, dtype=torch.long)
38
+ attn_mask[b, :L] = True
39
+ # next-token labels (shifted left by 1), last token has no next label
40
+ labels[b, : L - 1] = input_ids[b, 1:L]
41
+ loss_mask[b, : L - 1] = True
42
+ # cu_seqlens for varlen APIs
43
+ cu = torch.zeros((B + 1,), dtype=torch.int32)
44
+ cu[1:] = torch.cumsum(lengths, dim=0)
45
+ return input_ids, labels, attn_mask, loss_mask, lengths, cu
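+ # Illustrative example (values chosen here, not from a dataset): for
+ # sequences=[[5, 6, 7], [8, 9]] and pad_id=0 the collate yields
+ #   input_ids=[[5,6,7],[8,9,0]], labels=[[6,7,0],[9,0,0]],
+ #   attn_mask=[[T,T,T],[T,T,F]], loss_mask=[[T,T,F],[T,F,F]],
+ #   lengths=[3,2], cu_seqlens=[0,3,5].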
nsa/core/compress_pool.py ADDED
@@ -0,0 +1,39 @@
1
+ from __future__ import annotations
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+ from .rope import apply_rope
8
+
9
+
10
+ def avg_pool_phi_rope_kv(
11
+ K_raw: torch.Tensor,
12
+ V_raw: torch.Tensor,
13
+ l: int,
14
+ d: int,
15
+ pos: Optional[torch.Tensor] = None,
16
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
17
+ # Apply RoPE to K before ϕ; use absolute positions if provided
18
+ S = K_raw.shape[2]
19
+ if pos is None:
20
+ pos = torch.arange(S, device=K_raw.device)
21
+ K_rope = apply_rope(K_raw, pos)
22
+ V_rope = V_raw
23
+ # Expect shapes [B,G,S,D*]
24
+ B, G, S, Dk = K_rope.shape
25
+ # If sequence shorter than kernel, no compressed tokens yet
26
+ if S < l:
27
+ return (
28
+ torch.zeros((B, G, 0, Dk), device=K_rope.device, dtype=K_rope.dtype),
29
+ torch.zeros((B, G, 0, V_rope.shape[-1]), device=V_rope.device, dtype=V_rope.dtype),
30
+ )
31
+ # Average-pool over time with kernel l and stride d (each compressed token summarizes one length-l block of past tokens)
32
+ Kf = K_rope.reshape(B * G, S, Dk).transpose(1, 2).unsqueeze(3) # [B*G, Dk, S, 1]
33
+ Vf = V_rope.reshape(B * G, S, -1).transpose(1, 2).unsqueeze(3)
34
+ Kp = F.avg_pool2d(Kf[:, :, :S, :], kernel_size=(l, 1), stride=(d, 1)) # [B*G, Dk, S_cmp, 1]
35
+ Vp = F.avg_pool2d(Vf[:, :, :S, :], kernel_size=(l, 1), stride=(d, 1))
36
+ S_cmp = Kp.shape[2]
37
+ K_cmp = Kp.squeeze(3).transpose(1, 2).reshape(B, G, S_cmp, Dk)
38
+ V_cmp = Vp.squeeze(3).transpose(1, 2).reshape(B, G, S_cmp, V_rope.shape[-1])
39
+ return K_cmp, V_cmp
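+ # With kernel l and stride d, avg_pool2d emits S_cmp = (S - l)//d + 1 compressed
+ # tokens once S >= l; e.g. S=128, l=32, d=16 gives S_cmp = 7, matching the cmp_starts
+ # schedule in block_index.build_block_starts.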
nsa/core/debug.py ADDED
@@ -0,0 +1,44 @@
1
+ from __future__ import annotations
2
+ import os
3
+ from typing import Any, Dict
4
+
5
+
6
+ def _flag(name: str) -> bool:
7
+ val = os.getenv(name, "0").lower()
8
+ return val in ("1", "true", "yes")
9
+
10
+
11
+ def debug_enabled() -> bool:
12
+ return _flag("NSA_DEBUG_LOG")
13
+
14
+
15
+ _COUNTS: Dict[str, int] = {}
16
+
17
+
18
+ def log(tag: str, **fields: Any) -> None:
19
+ if not debug_enabled():
20
+ return
21
+ limit_env = os.getenv("NSA_LOG_LIMIT")
22
+ if limit_env is not None:
23
+ try:
24
+ limit = int(limit_env)
25
+ except Exception:
26
+ limit = 0
27
+ if limit > 0:
28
+ cnt = _COUNTS.get(tag, 0)
29
+ if cnt >= limit:
30
+ return
31
+ _COUNTS[tag] = cnt + 1
32
+ parts = [f"{k}={_safe(v)}" for k, v in fields.items()]
33
+ print(f"NSA-LOG {tag} " + " ".join(parts))
34
+
35
+
36
+ def _safe(v: Any) -> str:
37
+ try:
38
+ if isinstance(v, int | float | str):
39
+ return str(v)
40
+ if hasattr(v, "shape"):
41
+ return str(tuple(int(x) for x in v.shape))
42
+ return str(v)
43
+ except Exception:
44
+ return "<unrepr>"
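+ # Usage sketch: with NSA_DEBUG_LOG=1 set, log("fa2.win.bucket", L=64, N=8) prints
+ # "NSA-LOG fa2.win.bucket L=64 N=8"; NSA_LOG_LIMIT caps how many lines each tag may emit.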
nsa/core/flags.py ADDED
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+ import os
3
+ from typing import Optional
4
+
5
+ import torch
6
+
7
+
8
+ def env_true(name: str, default: bool = False) -> bool:
9
+ v = os.getenv(name)
10
+ if v is None:
11
+ return default
12
+ v = v.strip().lower()
13
+ return v in ("1", "true", "yes", "on")
14
+
15
+
16
+ def env_int(name: str, default: int) -> int:
17
+ try:
18
+ return int(os.getenv(name, str(default)))
19
+ except Exception:
20
+ return default
21
+
22
+
23
+ def is_sm89(device: Optional[torch.device] = None) -> bool:
24
+ dev = device or (torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu"))
25
+ if dev.type != "cuda":
26
+ return False
27
+ try:
28
+ cap = torch.cuda.get_device_capability(dev)
29
+ return cap == (8, 9)
30
+ except Exception:
31
+ return False
32
+
33
+
34
+ def torch_triton_version_pairing_ok() -> bool:
35
+ try:
36
+ import triton # noqa: F401
37
+
38
+ tv = triton.__version__
39
+ except ImportError:
40
+ tv = "<none>"
41
+ except Exception:
42
+ tv = "<unknown>"
43
+ try:
44
+ tt = torch.__version__
45
+ except Exception:
46
+ tt = "<unknown>"
47
+ # Basic heuristic: 2.2.x ↔ triton 2.2.x; 2.3.x ↔ 2.3.x; 2.4+ ↔ 3.x
48
+ try:
49
+ major_minor = ".".join((tt or "").split("+")[0].split(".")[:2])
50
+ parts = major_minor.split(".")
51
+ t_major = int(parts[0])
52
+ t_minor = int(parts[1])
53
+ if t_major != 2:
54
+ return True # do not gate non-2.x
55
+ if t_minor in (2, 3):
56
+ return tv.startswith(f"{t_minor}.")
57
+ if t_minor >= 4:
58
+ return tv.startswith("3.")
59
+ return True
60
+ except (ValueError, IndexError):
61
+ return True
62
+
63
+
64
+ def execution_routing_summary() -> dict:
65
+ """Return a snapshot of routing-related flags and runtime probes."""
66
+ info = {
67
+ "cuda": torch.cuda.is_available(),
68
+ "sm89": is_sm89(),
69
+ "torch": torch.__version__,
70
+ }
71
+ try:
72
+ import triton
73
+
74
+ info["triton"] = triton.__version__
75
+ except Exception:
76
+ info["triton"] = "<none>"
77
+ info["NSA_USE_TRITON_SEL"] = env_true("NSA_USE_TRITON_SEL", False)
78
+ info["NSA_TRITON_SEL_FORCE"] = env_true("NSA_TRITON_SEL_FORCE", False)
79
+ info["NSA_USE_FA2"] = env_true("NSA_USE_FA2", False)
80
+ return info
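+ # Illustrative output on a CPU-only machine (values indicative only):
+ # {"cuda": False, "sm89": False, "torch": "2.x", "triton": "<none>",
+ #  "NSA_USE_TRITON_SEL": False, "NSA_TRITON_SEL_FORCE": False, "NSA_USE_FA2": False}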
nsa/core/nsa_attention.py ADDED
@@ -0,0 +1,1850 @@
1
+ from __future__ import annotations
2
+ import os
3
+ from typing import List, Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from nsa.cache.kv_cache import NSA_KV
10
+ from nsa.core.attention_kernels import (
11
+ compressed_attention_fa2,
12
+ compressed_attention_fa2_decode,
13
+ grouped_selection_attention,
14
+ grouped_selection_attention_masked,
15
+ grouped_selection_attention_packed,
16
+ sliding_window_attention_fa2,
17
+ sliding_window_attention_fa2_decode,
18
+ )
19
+ from nsa.core.block_index import build_block_meta
20
+ from nsa.core.compress_pool import avg_pool_phi_rope_kv
21
+ from nsa.core.debug import log
22
+ from nsa.core.rope import apply_rope
23
+ from nsa.core.selection_scorer import (
24
+ compute_pcmp_all,
25
+ map_pcmp_to_pslc_batched,
26
+ select_topn_ranges,
27
+ select_topn_ranges_batched,
28
+ verify_mapping_equivalence,
29
+ )
30
+ from nsa.kernels.flash_wrappers import attention_bgh
31
+
32
+
33
+ class GateMLP(nn.Module):
34
+ def __init__(self, d_k: int, hidden: Optional[int] = None):
35
+ super().__init__()
36
+ hidden = hidden or max(1, d_k // 2)
37
+ self.fc1 = nn.Linear(d_k, hidden)
38
+ self.fc2 = nn.Linear(hidden, 3)
39
+ # Initialize fc2 with small random values to break symmetry and enable learning
40
+ # Use Xavier uniform with reduced scale to start near uniform but allow differentiation
41
+ nn.init.xavier_uniform_(self.fc2.weight, gain=0.1)
42
+ nn.init.zeros_(self.fc2.bias) # Keep bias at zero for initial balance
43
+ # Cache environment variables at init to avoid hot path parsing
44
+ self._force_uniform_gate = os.getenv("NSA_FORCE_UNIFORM_GATE", "0").lower() in (
45
+ "1",
46
+ "true",
47
+ "yes",
48
+ )
49
+ self._force_branch = os.getenv("NSA_FORCE_BRANCH")
50
+
51
+ def forward(self, q_group_pooled: torch.Tensor, tau: float = 1.0) -> torch.Tensor:
52
+ # Uniform gate override for debugging DDP hangs
53
+ if self._force_uniform_gate:
54
+ one_third = 1.0 / 3.0
55
+ shape = (*q_group_pooled.shape[:-1], 3)
56
+ return torch.full(
57
+ shape, one_third, device=q_group_pooled.device, dtype=q_group_pooled.dtype
58
+ )
59
+ fb = self._force_branch
60
+ if fb:
61
+ fb = fb.strip().lower()
62
+ if fb in ("cmp", "sel", "win"):
63
+ idx = 0 if fb == "cmp" else (1 if fb == "sel" else 2)
64
+ one = torch.zeros(
65
+ (*q_group_pooled.shape[:-1], 3),
66
+ device=q_group_pooled.device,
67
+ dtype=q_group_pooled.dtype,
68
+ )
69
+ one[..., idx] = 1.0
70
+ return one
71
+ x = F.silu(self.fc1(q_group_pooled))
72
+ g = self.fc2(x) / max(tau, 1e-6)
73
+ p = F.softmax(g, dim=-1)
74
+ # Hard one-hot if extremely peaked to avoid numerical drift in ablations/tests
75
+ with torch.no_grad():
76
+ top2 = torch.topk(g, k=2, dim=-1).values
77
+ peaked = (top2[..., 0] - top2[..., 1]) > 50.0
78
+ if peaked.any():
79
+ one_hot = torch.zeros_like(p)
80
+ idx = torch.argmax(g, dim=-1, keepdim=True)
81
+ one_hot.scatter_(-1, idx, 1.0)
82
+ p = torch.where(peaked.unsqueeze(-1), one_hot, p)
83
+ return p
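+ # The returned tensor has shape [..., 3]: softmax weights over (cmp, sel, win) per
+ # query group, with logits divided by tau; the hard one-hot path only triggers when
+ # the top (temperature-scaled) logit exceeds the runner-up by more than 50.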
84
+
85
+
86
+ def _fused_gate_combine_bsg(
87
+ q_gp: torch.Tensor, # [B,S,G,Dk]
88
+ O_cmp: torch.Tensor, # [B,S,G,h,Dv]
89
+ O_sel: torch.Tensor, # [B,S,G,h,Dv]
90
+ O_win: torch.Tensor, # [B,S,G,h,Dv]
91
+ fc1_w: torch.Tensor,
92
+ fc1_b: torch.Tensor | None,
93
+ fc2_w: torch.Tensor,
94
+ fc2_b: torch.Tensor | None,
95
+ tau: float,
96
+ ) -> torch.Tensor:
97
+ import torch.nn.functional as _F
98
+ x = _F.silu(_F.linear(q_gp, fc1_w, fc1_b))
99
+ g = _F.linear(x, fc2_w, fc2_b) / max(tau, 1e-6)
100
+ p = _F.softmax(g, dim=-1)
101
+ w_cmp = p[..., 0:1].unsqueeze(-1)
102
+ w_sel = p[..., 1:2].unsqueeze(-1)
103
+ w_win = p[..., 2:3].unsqueeze(-1)
104
+ return w_cmp * O_cmp + w_sel * O_sel + w_win * O_win
105
+
106
+
107
+ def _fused_gate_combine_bg(
108
+ q_gp: torch.Tensor, # [B,G,Dk]
109
+ O_cmp: torch.Tensor, # [B,G,h,Dv]
110
+ O_sel: torch.Tensor, # [B,G,h,Dv]
111
+ O_win: torch.Tensor, # [B,G,h,Dv]
112
+ fc1_w: torch.Tensor,
113
+ fc1_b: torch.Tensor | None,
114
+ fc2_w: torch.Tensor,
115
+ fc2_b: torch.Tensor | None,
116
+ tau: float,
117
+ ) -> torch.Tensor:
118
+ import torch.nn.functional as _F
119
+ x = _F.silu(_F.linear(q_gp, fc1_w, fc1_b))
120
+ g = _F.linear(x, fc2_w, fc2_b) / max(tau, 1e-6)
121
+ p = _F.softmax(g, dim=-1)
122
+ w_cmp = p[..., 0:1].unsqueeze(-1)
123
+ w_sel = p[..., 1:2].unsqueeze(-1)
124
+ w_win = p[..., 2:3].unsqueeze(-1)
125
+ return w_cmp * O_cmp + w_sel * O_sel + w_win * O_win
126
+
127
+
128
+ def _compute_gate_stats(gates: torch.Tensor) -> dict:
129
+ """Compute gate health statistics for monitoring.
130
+
131
+ Args:
132
+ gates: Gate probabilities [B, S, G, 3] or [B, G, 3]
133
+
134
+ Returns:
135
+ Dict with gate statistics: entropy, max_gate, branch_shares
136
+ """
137
+ with torch.no_grad():
138
+ # Flatten to [*, 3] for consistent computation
139
+ gates_flat = gates.view(-1, 3)
140
+
141
+ # Gate entropy (should be > 0.5 for healthy mixing)
142
+ entropy = -(gates_flat * (gates_flat + 1e-8).log()).sum(dim=-1)
143
+ mean_entropy = entropy.mean().item()
144
+ min_entropy = entropy.min().item()
145
+
146
+ # Max gate value (should be < 0.9 to avoid collapse)
147
+ max_gate = gates_flat.max(dim=-1)[0]
148
+ mean_max_gate = max_gate.mean().item()
149
+ max_max_gate = max_gate.max().item()
150
+
151
+ # Branch usage shares (should be balanced)
152
+ branch_shares = gates_flat.mean(dim=0).tolist() # [cmp, sel, win]
153
+
154
+ # Gate collapse detection (entropy < 0.1 and max_gate > 0.95)
155
+ collapsed = (entropy < 0.1) & (max_gate > 0.95)
156
+ collapse_fraction = collapsed.float().mean().item()
157
+
158
+ return {
159
+ "entropy_mean": mean_entropy,
160
+ "entropy_min": min_entropy,
161
+ "max_gate_mean": mean_max_gate,
162
+ "max_gate_max": max_max_gate,
163
+ "branch_shares": branch_shares, # [cmp, sel, win]
164
+ "collapse_fraction": collapse_fraction,
165
+ "total_gates": len(gates_flat),
166
+ }
167
+
168
+
169
+ class NSAAttention(nn.Module):
170
+ """
171
+ Native Sparse Attention (NSA) module (M0 steel-thread).
172
+
173
+ Shapes:
174
+ - Input x (prefill): [B,S,dim]; x (decode): [B,1,dim]
175
+ - Heads: n_heads, grouped into n_kv_groups with h_per_group = n_heads // n_kv_groups
176
+ - Projections produce:
177
+ - Q: [B,S,G,h,Dk]
178
+ - K/V per-branch: [B,G,S,D*]
179
+
180
+ Returns:
181
+ - out: [B,S,dim] (prefill) or [B,1,dim] (decode)
182
+ - kv: updated NSA_KV caches
183
+
184
+ Notes:
185
+ - M0 constraints: SDPA-only, fixed sequence length in tests, deterministic.
186
+ - Masked/packed fast paths are env-gated with `NSA_FORCE_PARITY` fallback.
187
+ """
188
+
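+ # Usage sketch (hedged): construction follows the signature below; a forward pass
+ # additionally needs an NSA_KV cache object, which is built elsewhere in this
+ # package and is not shown here.
+ #
+ #   attn = NSAAttention(dim=768, n_heads=12, n_kv_groups=2, d_k=64, d_v=64)
+ #   x = torch.randn(1, 128, 768)            # [B, S, dim]
+ #   out, kv = attn(x, kv, prefill=True)     # out: [1, 128, 768]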
189
+ def __init__(
190
+ self,
191
+ dim: int,
192
+ n_heads: int,
193
+ n_kv_groups: int,
194
+ d_k: int,
195
+ d_v: int,
196
+ l: int = 32,
197
+ d: int = 16,
198
+ l_sel: int = 64,
199
+ n_sel: int = 16,
200
+ w: int = 512,
201
+ phi: str = "avg",
202
+ gate_hidden: Optional[int] = None,
203
+ gate_temp: float = 1.0,
204
+ rope_impl: str = "llama",
205
+ use_flash: bool = True,
206
+ use_triton_sel: bool = False,
207
+ ) -> None:
208
+ super().__init__()
209
+ assert n_heads % n_kv_groups == 0, "heads must be divisible by kv groups"
210
+ # M0 config validation (PRD enforces divisibility)
211
+ if l % d != 0 or l_sel % d != 0:
212
+ raise ValueError("M0 requires d|l and d|l_sel; set valid block sizes/stride.")
213
+ self.dim = dim
214
+ self.n_heads = n_heads
215
+ self.n_kv_groups = n_kv_groups
216
+ self.h_per_group = n_heads // n_kv_groups
217
+ self.d_k = d_k
218
+ self.d_v = d_v
219
+ self.l = l
220
+ self.d = d
221
+ self.l_sel = l_sel
222
+ self.n_sel = n_sel
223
+ self.w = w
224
+ self.gate_temp = gate_temp
225
+ self.phi_type = (phi or "avg").lower()
226
+
227
+ # Gate health tracking for M8 monitoring
228
+ self._last_gate_stats = None
229
+ # M8: Selection length stats for monitoring (updated each forward)
230
+ self._last_sel_stats: Optional[dict] = None
231
+
232
+ # M8: Fallback counters for routing monitoring
233
+ self._fallback_counters = {
234
+ "selection_triton_fails": 0,
235
+ "selection_cuda_fails": 0,
236
+ "selection_pack_fails": 0,
237
+ "selection_mask_fails": 0,
238
+ "compressed_fa2_fails": 0,
239
+ "sliding_fa2_fails": 0,
240
+ "total_fallbacks": 0,
241
+ }
242
+
243
+ # RoPE scaling and prefill tiling for long-context demos (env-overridable)
244
+ try:
245
+ rs = float(os.getenv("NSA_ROPE_SCALE", "1.0"))
246
+ if not (rs > 0.0) or rs != rs: # require positive finite
247
+ rs = 1.0
248
+ self.rope_scale = rs
249
+ except ValueError:
250
+ self.rope_scale = 1.0
251
+ try:
252
+ pt = int(os.getenv("NSA_PREFILL_TILE", "0"))
253
+ if pt < 0:
254
+ pt = 0
255
+ self.prefill_tile = pt
256
+ except ValueError:
257
+ self.prefill_tile = 0
258
+ # Projections
259
+ self.W_Q = nn.Linear(dim, n_heads * d_k, bias=False)
260
+ self.W_K_sel = nn.Linear(dim, n_kv_groups * d_k, bias=False)
261
+ self.W_V_sel = nn.Linear(dim, n_kv_groups * d_v, bias=False)
262
+ self.W_K_win = nn.Linear(dim, n_kv_groups * d_k, bias=False)
263
+ self.W_V_win = nn.Linear(dim, n_kv_groups * d_v, bias=False)
264
+ self.W_K_cmp = nn.Linear(dim, n_kv_groups * d_k, bias=False)
265
+ self.W_V_cmp = nn.Linear(dim, n_kv_groups * d_v, bias=False)
266
+ self.out = nn.Linear(n_heads * d_v, dim, bias=False)
267
+ self.gate = GateMLP(d_k, gate_hidden)
268
+ # Default FA-2 usage (can be overridden by env flags)
269
+ self.use_flash_default = use_flash
270
+ # One-time SDPA backend audit flag
271
+ self._sdpa_audited = False
272
+ # Selection Triton toggle (M4)
273
+ self.use_triton_sel = use_triton_sel
274
+ # Cache environment variables to avoid repeated parsing in hot path
275
+ self._cache_env_vars()
276
+ # Optional learnable ϕ via depthwise Conv1d over time with kernel l and stride d
277
+ # Initialize to average pooling for parity with M0
278
+ self.phi_k_conv: Optional[nn.Conv1d]
279
+ self.phi_v_conv: Optional[nn.Conv1d]
280
+ if self.phi_type == "mlp":
281
+ self.phi_k_conv = nn.Conv1d(
282
+ self.d_k, self.d_k, kernel_size=self.l, stride=self.d, groups=self.d_k, bias=False
283
+ )
284
+ self.phi_v_conv = nn.Conv1d(
285
+ self.d_v, self.d_v, kernel_size=self.l, stride=self.d, groups=self.d_v, bias=False
286
+ )
287
+ with torch.no_grad():
288
+ self.phi_k_conv.weight.fill_(1.0 / float(self.l))
289
+ self.phi_v_conv.weight.fill_(1.0 / float(self.l))
290
+ else:
291
+ self.phi_k_conv = None
292
+ self.phi_v_conv = None
293
+
294
+ def _cache_env_vars(self) -> None:
295
+ """Cache environment variables to avoid repeated parsing in hot path."""
296
+
297
+ def parse_bool(name: str, default: str = "0") -> bool:
298
+ # True when the named env var is "1"/"true"/"yes" (case-insensitive).
+ return os.getenv(name, default).lower() in ("1", "true", "yes")
299
+
300
+ # Cache frequently accessed environment variables
301
+ # Raw parsed flags
302
+ self._env_cache = {
303
+ "static": parse_bool("NSA_ENV_STATIC", "0"),
304
+ "force_uniform_gate": parse_bool("NSA_FORCE_UNIFORM_GATE", "0"),
305
+ "force_branch": os.getenv("NSA_FORCE_BRANCH"),
306
+ "prefill_batched": parse_bool("NSA_PREFILL_BATCHED", "0"),
307
+ "strict_asserts": parse_bool("NSA_STRICT_ASSERTS", "0"),
308
+ "force_parity": parse_bool("NSA_FORCE_PARITY", "0"),
309
+ "use_sel_pack": parse_bool("NSA_USE_SEL_PACK", "1"),
310
+ "use_triton_sel": parse_bool("NSA_USE_TRITON_SEL", "0") or self.use_triton_sel,
311
+ "use_cuda_sel": parse_bool("NSA_SEL_CUDA", "0"),
312
+ "use_sel_varlen": parse_bool("NSA_USE_SEL_VARLEN", "0"),
313
+ # Hard override to force masked selection path (debug/triage)
314
+ "force_sel_mask": parse_bool("NSA_FORCE_SEL_MASK", "0"),
315
+ "fa2_all": parse_bool("NSA_USE_FA2", "0"),
316
+ "fa2_win": parse_bool("NSA_USE_FA2_WIN", "0"),
317
+ "fa2_cmp": parse_bool("NSA_USE_FA2_CMP", "0"),
318
+ "use_sel_mask": parse_bool("NSA_USE_SEL_MASK", "0"),
319
+ "use_cmp_mask": parse_bool("NSA_USE_CMP_MASK", "1"),
320
+ "use_win_mask": parse_bool("NSA_USE_WIN_MASK", "1"),
321
+ "verify_eq9": parse_bool("NSA_VERIFY_EQ9_MAPPING", "0"),
322
+ "stopgrad_gates": parse_bool("NSA_STOPGRAD_GATES", "0"),
323
+ "nvtx": parse_bool("NSA_NVTX", "0"),
324
+ "debug_compare": parse_bool("NSA_DEBUG_COMPARE", "0"),
325
+ "gate_compile": parse_bool("NSA_GATE_COMPILE", "0"),
326
+ }
327
+
328
+ # Detect whether env overrides were explicitly provided so we can honor hard-disable
329
+ fa2_all_set = "NSA_USE_FA2" in os.environ
330
+ fa2_win_set = "NSA_USE_FA2_WIN" in os.environ
331
+ fa2_cmp_set = "NSA_USE_FA2_CMP" in os.environ
332
+ self._env_cache.update(
333
+ {
334
+ "fa2_all_set": fa2_all_set,
335
+ "fa2_win_set": fa2_win_set,
336
+ "fa2_cmp_set": fa2_cmp_set,
337
+ }
338
+ )
339
+
340
+ # Compute effective FA-2 gating with sensible defaults and hard-disable semantics
341
+ fa2_all_env = self._env_cache["fa2_all"]
342
+ fa2_win_env = self._env_cache["fa2_win"]
343
+ fa2_cmp_env = self._env_cache["fa2_cmp"]
344
+
345
+ # Defaults when no explicit env flags are provided:
346
+ # - Enable compressed FA‑2 by default (robustly capability-gated at call sites)
347
+ # - Keep sliding FA‑2 off by default due to API semantics
348
+ # - Do not use the global "all" default to avoid inadvertently enabling sliding
349
+ if not (fa2_all_set or fa2_win_set or fa2_cmp_set):
350
+ fa2_all_eff = False
351
+ fa2_win_eff = False
352
+ fa2_cmp_eff = True
353
+ else:
354
+ # If NSA_USE_FA2 not set, fall back to model default; else honor explicit value
355
+ fa2_all_eff = self.use_flash_default if not fa2_all_set else fa2_all_env
356
+
357
+ # If global is explicitly set to 0, that hard-disables branch flags too
358
+ if fa2_all_set and not fa2_all_env:
359
+ fa2_win_eff = False
360
+ fa2_cmp_eff = False
361
+ else:
362
+ # Branch-specific flags only take effect if explicitly set; otherwise default off
363
+ fa2_win_eff = fa2_win_env if fa2_win_set else False
364
+ fa2_cmp_eff = fa2_cmp_env if fa2_cmp_set else False
365
+
366
+ self._env_cache.update(
367
+ {
368
+ "fa2_all_eff": fa2_all_eff,
369
+ "fa2_win_eff": fa2_win_eff,
370
+ "fa2_cmp_eff": fa2_cmp_eff,
371
+ }
372
+ )
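+ # Resulting routing for a few example settings (illustration):
+ #   no FA-2 env vars set    -> fa2_cmp_eff=True,  fa2_win_eff=False, fa2_all_eff=False
+ #   NSA_USE_FA2=0           -> all three False (explicit global hard-disable)
+ #   NSA_USE_FA2_WIN=1 only  -> fa2_win_eff=True,  fa2_cmp_eff=False, fa2_all_eff=use_flash default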
373
+ # Parse numeric values
374
+ try:
375
+ self._rope_scale = float(os.getenv("NSA_ROPE_SCALE", "1.0"))
376
+ if not (self._rope_scale > 0.0) or self._rope_scale != self._rope_scale:
377
+ self._rope_scale = 1.0
378
+ except (ValueError, TypeError):
379
+ self._rope_scale = 1.0
380
+
381
+ try:
382
+ self._prefill_tile = int(os.getenv("NSA_PREFILL_TILE", "0"))
383
+ if self._prefill_tile < 0:
384
+ self._prefill_tile = 0
385
+ except (ValueError, TypeError):
386
+ self._prefill_tile = 0
387
+ # Fused gate combine (lazy-compiled)
388
+ self._gate_fused_bsg = None
389
+ self._gate_fused_bg = None
390
+
391
+ def _shape_q(self, Q: torch.Tensor, B: int, S: int) -> torch.Tensor:
392
+ Q = Q.view(B, S, self.n_heads, self.d_k)
393
+ # group-major: [B,S,G,h,Dk]
394
+ G = self.n_kv_groups
395
+ h = self.h_per_group
396
+ return Q.view(B, S, G, h, self.d_k)
397
+
398
+ def _shape_kv(self, X: torch.Tensor, B: int, S: int) -> torch.Tensor:
399
+ G = self.n_kv_groups
400
+ return X.view(B, S, G, -1).permute(0, 2, 1, 3).contiguous() # [B,G,S,D*]
401
+
402
+ def get_gate_stats(self) -> Optional[dict]:
403
+ """Get the most recent gate statistics for monitoring.
404
+
405
+ Returns:
406
+ Dict with gate health metrics or None if no recent computation
407
+ """
408
+ return self._last_gate_stats
409
+
410
+ def get_fallback_counters(self) -> dict:
411
+ """Get fallback counters for routing monitoring.
412
+
413
+ Returns:
414
+ Dict with fallback counts per implementation type
415
+ """
416
+ return self._fallback_counters.copy()
417
+
418
+ def get_selection_stats(self) -> Optional[dict]:
419
+ """Return last computed selection length statistics, if available.
420
+
421
+ Keys:
422
+ - k_mean: mean selected K per row (float)
423
+ - k_max: max selected K in batch (int)
424
+ - rows: number of (B,S,G) rows aggregated (int)
425
+ - pct_at_max: fraction of rows equal to k_max (float)
426
+ - l_sel: configured selection block size (int)
427
+ - n_sel: configured top-n selection blocks (int)
428
+ """
429
+ return self._last_sel_stats
430
+
431
+ def reset_fallback_counters(self) -> dict:
432
+ """Reset fallback counters and return the previous values.
433
+
434
+ Returns:
435
+ Dict with fallback counts before reset
436
+ """
437
+ prev_counters = self._fallback_counters.copy()
438
+ for key in self._fallback_counters:
439
+ self._fallback_counters[key] = 0
440
+ return prev_counters
441
+
442
+ def _update_gate_stats(self, gates: torch.Tensor) -> None:
443
+ """Update stored gate statistics for monitoring."""
444
+ try:
445
+ self._last_gate_stats = _compute_gate_stats(gates)
446
+ except Exception as e:
447
+ log("warn.gate_stats_fail", error=str(e))
448
+ self._last_gate_stats = None
449
+
450
+ def _update_sel_stats_from_ranges(self, ranges: torch.Tensor) -> None:
451
+ """Compute and store selection statistics from [B,*,G,n,2] ranges tensor."""
452
+ try:
453
+ if ranges is None or ranges.numel() == 0:
454
+ self._last_sel_stats = {
455
+ "k_mean": 0.0,
456
+ "k_max": 0,
457
+ "rows": 0,
458
+ "pct_at_max": 0.0,
459
+ "l_sel": int(self.l_sel),
460
+ "n_sel": int(self.n_sel),
461
+ }
462
+ return
463
+ # ranges: [B, T, G, n, 2] or [B, G, n, 2]
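+ # e.g. one row with ranges [[0, 64], [128, 192]] contributes L = 64 + 64 = 128
+ # selected tokens; k_mean/k_max/pct_at_max summarize these per-row totals.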
464
+ if ranges.dim() == 5:
465
+ B, T, G, n, _ = ranges.shape
466
+ rs = ranges
467
+ rows = B * T * G
468
+ # [B,T,G,n]
469
+ lengths = (rs[..., 1] - rs[..., 0]).clamp_min(0)
470
+ # Sum across n ranges → [B,T,G]
471
+ L = lengths.sum(dim=-1).to(torch.int64)
472
+ elif ranges.dim() == 4:
473
+ B, G, n, _ = ranges.shape
474
+ rs = ranges
475
+ rows = B * G
476
+ lengths = (rs[..., 1] - rs[..., 0]).clamp_min(0)
477
+ L = lengths.sum(dim=-1).to(torch.int64) # [B,G]
478
+ else:
479
+ # Unknown shape; skip
480
+ return
481
+ if L.numel() == 0:
482
+ k_mean = 0.0
483
+ k_max = 0
484
+ pct_at_max = 0.0
485
+ else:
486
+ k_max = int(L.max().item())
487
+ k_mean = float(L.to(torch.float32).mean().item())
488
+ if k_max > 0:
489
+ pct_at_max = float((L == k_max).to(torch.float32).mean().item())
490
+ else:
491
+ pct_at_max = 0.0
492
+ self._last_sel_stats = {
493
+ "k_mean": k_mean,
494
+ "k_max": k_max,
495
+ "rows": int(rows),
496
+ "pct_at_max": pct_at_max,
497
+ "l_sel": int(self.l_sel),
498
+ "n_sel": int(self.n_sel),
499
+ }
500
+ except Exception as e:
501
+ log("warn.sel_stats_fail", error=str(e))
502
+ self._last_sel_stats = None
503
+
504
+ def forward(self, x: torch.Tensor, kv: NSA_KV, *, prefill: bool) -> tuple[torch.Tensor, NSA_KV]:
505
+ """
506
+ Forward pass.
507
+
508
+ Args:
509
+ x: [B,S,dim] if prefill else [B,1,dim]
510
+ kv: NSA_KV caches (updated in-place per branch)
511
+ prefill: True for batched prefill, False for single-token decode
512
+
513
+ Returns:
514
+ (out, kv): out is [B,S,dim] (prefill) or [B,1,dim] (decode)
515
+ """
516
+ # x: [B,S,dim] (prefill) or [B,1,dim] (decode)
517
+ B, S, _ = x.shape
518
+ assert x.dim() == 3, "x must be [B,S,dim]"
519
+ assert self.n_heads % self.n_kv_groups == 0, "n_heads must be divisible by n_kv_groups"
520
+ # Strict assertions may introduce GPU syncs; gate via env for tests/smokes
521
+ strict_asserts = self._env_cache.get("strict_asserts", False)
522
+
523
+ # M8: Assert causal masking - enforce mode constraints
524
+ if prefill:
525
+ assert S > 0, f"Prefill mode requires S > 0, got S={S}"
526
+ else:
527
+ assert S == 1, (
528
+ f"Decode mode requires S=1 (single token), got S={S}. "
529
+ f"This ensures proper causal ordering in decode steps."
530
+ )
531
+ if prefill:
532
+ # Optional: route prefill via single-token decode steps to support very long contexts safely.
533
+ if getattr(self, "prefill_tile", 0) and self.prefill_tile > 0:
534
+ return self._forward_prefill_via_decode(x, kv)
535
+ use_batched = self._env_cache.get("prefill_batched", False)
536
+ if use_batched:
537
+ return self._forward_prefill_batched(x, kv)
538
+ else:
539
+ return self._forward_prefill_sequential(x, kv)
540
+ else:
541
+ # Projections
542
+ # Compute absolute position offset from existing cache length for RoPE on Q
543
+ t_prev = kv.K_sel.shape[2] if hasattr(kv, "K_sel") else 0
544
+ Q_lin = self._shape_q(self.W_Q(x), B, S) # [B,S,G,h,Dk]
545
+ # Apply RoPE to Q with absolute positions (decode)
546
+ pos = torch.arange(t_prev, t_prev + S, device=x.device)
547
+ Q = apply_rope(
548
+ Q_lin.view(B, S, self.n_heads, self.d_k).reshape(B, S, self.n_heads * self.d_k),
549
+ pos,
550
+ scale=getattr(self, "rope_scale", 1.0),
551
+ )
552
+ Q = Q.view(B, S, self.n_heads, self.d_k)
553
+ G = self.n_kv_groups
554
+ h = self.h_per_group
555
+ Q = Q.view(B, S, G, h, self.d_k)
556
+ K_sel = self._shape_kv(self.W_K_sel(x), B, S)
557
+ V_sel = self._shape_kv(self.W_V_sel(x), B, S)
558
+ K_win = self._shape_kv(self.W_K_win(x), B, S)
559
+ V_win = self._shape_kv(self.W_V_win(x), B, S)
560
+ K_cmp_raw = self._shape_kv(self.W_K_cmp(x), B, S)
561
+ V_cmp_raw = self._shape_kv(self.W_V_cmp(x), B, S)
562
+
563
+ # Apply RoPE to K for selection/sliding branches using absolute position of the new token(s)
564
+ # Determine current token index before appending to caches
565
+ t_prev = kv.K_sel.shape[2] if hasattr(kv, "K_sel") else 0
566
+ pos_k = torch.arange(t_prev, t_prev + S, device=x.device)
567
+ K_sel = apply_rope(K_sel, pos_k, scale=getattr(self, "rope_scale", 1.0))
568
+ K_win = apply_rope(K_win, pos_k, scale=getattr(self, "rope_scale", 1.0))
569
+
570
+ # decode step: append raw tokens and window, emit compressed every d after warmup l
571
+ kv.update_selection_raw(K_sel, V_sel)
572
+ kv.update_window(K_win, V_win, self.w)
573
+ if not hasattr(kv, "K_cmp_raw_seq"):
574
+ kv.K_cmp_raw_seq = K_cmp_raw[:, :, :0]
575
+ kv.V_cmp_raw_seq = V_cmp_raw[:, :, :0]
576
+ kv.reads_pred = torch.zeros((0,), dtype=torch.int64, device=x.device)
577
+ kv.reads_act_total = torch.zeros((0,), dtype=torch.int64, device=x.device)
578
+ kv.reads_act_sel = torch.zeros((0,), dtype=torch.int64, device=x.device)
579
+ kv.reads_act_cmp = torch.zeros((0,), dtype=torch.int64, device=x.device)
580
+ kv.reads_act_win = torch.zeros((0,), dtype=torch.int64, device=x.device)
581
+ kv.append_cmp_raw(K_cmp_raw, V_cmp_raw)
582
+ S_raw = kv.K_cmp_raw_seq.shape[2]
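+ # Emission schedule example: with l=32, d=16 a new compressed token is emitted
+ # when S_raw reaches 32, 48, 64, ... i.e. every d raw tokens after the first l.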
583
+ if S_raw >= self.l and (S_raw - self.l) % self.d == 0:
584
+ # Emit compressed token from the last l raw tokens
585
+ K_last = kv.K_cmp_raw_seq[:, :, S_raw - self.l : S_raw, :]
586
+ V_last = kv.V_cmp_raw_seq[:, :, S_raw - self.l : S_raw, :]
587
+ pos_last = torch.arange(S_raw - self.l, S_raw, device=x.device)
588
+ if self.phi_type == "mlp":
589
+ K_cmp_new, V_cmp_new = self._phi_apply_last(K_last, V_last, pos_last)
590
+ else:
591
+ K_cmp_new, V_cmp_new = avg_pool_phi_rope_kv(
592
+ K_last, V_last, self.l, self.d, pos=pos_last
593
+ )
594
+ kv.update_compressed(
595
+ torch.cat([kv.K_cmp, K_cmp_new], dim=2) if kv.K_cmp.numel() else K_cmp_new,
596
+ torch.cat([kv.V_cmp, V_cmp_new], dim=2) if kv.V_cmp.numel() else V_cmp_new,
597
+ self.l,
598
+ self.d,
599
+ )
600
+
601
+ # Ensure block metadata exists and covers current token index for selection (expand if needed)
602
+ t_token = kv.K_sel.shape[2] - 1
603
+ if not hasattr(kv, "meta") or kv.meta.sel_starts.numel() == 0:
604
+ kv.meta = build_block_meta(
605
+ seq_len=max(t_token + 1, self.l_sel),
606
+ l=self.l,
607
+ d=self.d,
608
+ l_sel=self.l_sel,
609
+ n_sel=self.n_sel,
610
+ w=self.w,
611
+ )
612
+ else:
613
+ # If current t exceeds covered selection range, rebuild meta with expanded seq_len
614
+ sel_max_end = (
615
+ int(kv.meta.sel_starts[-1].item()) + kv.meta.l_sel
616
+ if kv.meta.sel_starts.numel() > 0
617
+ else 0
618
+ )
619
+ if (t_token + 1) > sel_max_end:
620
+ kv.meta = build_block_meta(
621
+ seq_len=t_token + 1,
622
+ l=self.l,
623
+ d=self.d,
624
+ l_sel=self.l_sel,
625
+ n_sel=self.n_sel,
626
+ w=self.w,
627
+ )
628
+ # Append predicted reads per formula for this step
629
+ num_cmp = 0 if S_raw < self.l else (S_raw - self.l) // self.d + 1
630
+ reads = num_cmp + self.n_sel * self.l_sel + min(self.w, S_raw)
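+ # Worked example with the defaults (l=32, d=16, l_sel=64, n_sel=16, w=512) at
+ # S_raw=1024: num_cmp = (1024-32)//16 + 1 = 63, sel = 16*64 = 1024,
+ # win = min(512, 1024) = 512, so reads = 63 + 1024 + 512 = 1599 per decode step.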
631
+ kv.append_reads_pred(reads)
632
+ # Append actual reads equal to formula in M0
633
+ kv.append_reads_actual(reads, self.n_sel * self.l_sel, num_cmp, min(self.w, S_raw))
634
+ log(
635
+ "decode.reads",
636
+ S_raw=int(S_raw),
637
+ num_cmp=int(num_cmp),
638
+ sel=int(self.n_sel * self.l_sel),
639
+ win=int(min(self.w, S_raw)),
640
+ total=int(reads),
641
+ )
642
+
643
+ scale = 1.0 / (self.d_k**0.5)
644
+ # Compute p_cmp only for this step (S is 1 in decode)
645
+ K_cmp_full = kv.K_cmp
646
+ p_cmp_all = compute_pcmp_all(Q, K_cmp_full, scale)
647
+ # Per-token outputs (S should be 1 in decode)
648
+ outs = []
649
+ # Use cached environment variables
650
+ env = self._env_cache
651
+
652
+ for t in range(S):
653
+ p_slc_all = map_pcmp_to_pslc_batched(p_cmp_all[:, t : t + 1], kv.meta)
654
+
655
+ # M8: Optional Eq.9 verification in decode
656
+ if self._env_cache.get("verify_eq9", False):
657
+ is_equiv, details = verify_mapping_equivalence(p_cmp_all[:, t : t + 1], kv.meta)
658
+ if not is_equiv:
659
+ log(
660
+ "error.eq9_verification_failed_decode",
661
+ msg="Eq.9 mapping verification failed in decode",
662
+ step=t,
663
+ **details,
664
+ )
665
+ p_grp = p_slc_all.sum(dim=3).squeeze(1) # [B,G,S_sel]
666
+ current_pos = kv.K_sel.shape[2] - 1 # Current token position (0-indexed)
667
+ sel_ranges = select_topn_ranges(p_grp, kv.meta, self.n_sel, current_pos, True, 2)
668
+
669
+ # M8: Assert causal masking - selection ranges cannot include future tokens
670
+ if strict_asserts and sel_ranges.numel() > 0:
671
+ # Only sync for strict asserts (debug mode)
672
+ max_end = sel_ranges[..., 1].max().item() # GPU sync only in debug
673
+ assert max_end <= current_pos + 1, (
674
+ f"Selection range violates causality: max_end={max_end} > current_pos+1={current_pos + 1}. "
675
+ f"Selection must not access future tokens."
676
+ )
677
+ # Update selection stats and observability: distance summary per step
678
+ try:
679
+ # Update per-step selection stats (decode has S==1)
680
+ self._update_sel_stats_from_ranges(sel_ranges)
681
+ starts = sel_ranges[..., 0].to(torch.int64)
682
+ ends = sel_ranges[..., 1].to(torch.int64)
683
+ lengths = (ends - starts).clamp_min(0)
684
+ dist = (kv.K_sel.shape[2] - 1) - starts
685
+ log(
686
+ "decode.select",
687
+ n_ranges=int(sel_ranges.shape[2]),
688
+ mean_len=float(lengths.float().mean().item()) if lengths.numel() else 0.0,
689
+ max_len=int(lengths.max().item()) if lengths.numel() else 0,
690
+ mean_dist=float(dist.float().mean().item()) if dist.numel() else 0.0,
691
+ max_dist=int(dist.max().item()) if dist.numel() else 0,
692
+ )
693
+ except Exception as e:
694
+ log("warn.decode.select_log_fail", error=str(e))
695
+ Q_t = Q[:, t]
696
+ K_sel_t = kv.K_sel
697
+ V_sel_t = kv.V_sel
698
+ # Selection attention routing: forced masked > Triton > CUDA > packed > masked > gather; Triton/CUDA fall back to packed SDPA on failure, the others to gather SDPA
699
+ force_parity = env["force_parity"]
700
+ use_sel_pack = env["use_sel_pack"] and not force_parity
701
+ use_triton_sel = env["use_triton_sel"] and not force_parity
702
+ use_cuda_sel = env["use_cuda_sel"] and not force_parity
703
+ force_sel_mask = env.get("force_sel_mask", False) and not force_parity
704
+ if force_sel_mask:
705
+ try:
706
+ O_sel_bt = grouped_selection_attention_masked(
707
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
708
+ )
709
+ O_sel = O_sel_bt[:, 0]
710
+ log("decode.sel.path", path="masked_forced")
711
+ except Exception as e:
712
+ self._fallback_counters["selection_mask_fails"] += 1
713
+ self._fallback_counters["total_fallbacks"] += 1
714
+ log("warn.masked_selection_forced_fallback",
715
+ error=str(e),
716
+ step=t,
717
+ Q_shape=list(Q_t.shape),
718
+ K_shape=list(K_sel_t.shape),
719
+ V_shape=list(V_sel_t.shape),
720
+ ranges_shape=list(sel_ranges.shape) if sel_ranges is not None else None,
721
+ total_fails=self._fallback_counters["selection_mask_fails"])
722
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
723
+ elif use_triton_sel:
724
+ try:
725
+ from nsa.kernels.triton_sel_kernel import selection_attention_triton
726
+
727
+ O_sel_bt = selection_attention_triton(
728
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
729
+ )
730
+ O_sel = O_sel_bt[:, 0]
731
+ log("decode.sel.path", path="triton")
732
+ except Exception as e:
733
+ # M8: Fallback counter - Triton selection failed
734
+ self._fallback_counters["selection_triton_fails"] += 1
735
+ self._fallback_counters["total_fallbacks"] += 1
736
+ log(
737
+ "warn.triton_selection_fallback",
738
+ error=str(e),
739
+ step=t,
740
+ Q_shape=list(Q_t.shape),
741
+ K_shape=list(K_sel_t.shape),
742
+ V_shape=list(V_sel_t.shape),
743
+ ranges_shape=list(sel_ranges.shape) if sel_ranges is not None else None,
744
+ total_fails=self._fallback_counters["selection_triton_fails"],
745
+ )
746
+ # Fallback to packed SDPA
747
+ O_sel_bt = grouped_selection_attention_packed(
748
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
749
+ )
750
+ O_sel = O_sel_bt[:, 0]
751
+ elif use_cuda_sel:
752
+ try:
753
+ from nsa.kernels.cuda_sel_kernel import selection_attention_cuda
754
+
755
+ O_sel_bt = selection_attention_cuda(
756
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
757
+ )
758
+ O_sel = O_sel_bt[:, 0]
759
+ except Exception as e:
760
+ # M8: Fallback counter - CUDA selection failed
761
+ self._fallback_counters["selection_cuda_fails"] += 1
762
+ self._fallback_counters["total_fallbacks"] += 1
763
+ log(
764
+ "warn.cuda_selection_fallback",
765
+ error=str(e),
766
+ step=t,
767
+ Q_shape=list(Q_t.shape),
768
+ K_shape=list(K_sel_t.shape),
769
+ V_shape=list(V_sel_t.shape),
770
+ ranges_shape=list(sel_ranges.shape) if sel_ranges is not None else None,
771
+ total_fails=self._fallback_counters["selection_cuda_fails"],
772
+ )
773
+ # Fallback to packed SDPA
774
+ O_sel_bt = grouped_selection_attention_packed(
775
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
776
+ )
777
+ O_sel = O_sel_bt[:, 0]
778
+ elif use_sel_pack:
779
+ try:
780
+ O_sel_bt = grouped_selection_attention_packed(
781
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
782
+ )
783
+ O_sel = O_sel_bt[:, 0]
784
+ log("decode.sel.path", path="packed")
785
+ except Exception as e:
786
+ # M8: Fallback counter - Packed selection failed
787
+ self._fallback_counters["selection_pack_fails"] += 1
788
+ self._fallback_counters["total_fallbacks"] += 1
789
+ log(
790
+ "warn.packed_selection_fallback",
791
+ error=str(e),
792
+ step=t,
793
+ Q_shape=list(Q_t.shape),
794
+ K_shape=list(K_sel_t.shape),
795
+ V_shape=list(V_sel_t.shape),
796
+ ranges_shape=list(sel_ranges.shape) if sel_ranges is not None else None,
797
+ total_fails=self._fallback_counters["selection_pack_fails"],
798
+ )
799
+ # Fallback to gather SDPA
800
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
801
+ elif self._env_cache.get("use_sel_mask", False) and not force_parity:
802
+ try:
803
+ O_sel_bt = grouped_selection_attention_masked(
804
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
805
+ )
806
+ O_sel = O_sel_bt[:, 0]
807
+ log("decode.sel.path", path="masked")
808
+ except Exception as e:
809
+ # M8: Fallback counter - Masked selection failed
810
+ self._fallback_counters["selection_mask_fails"] += 1
811
+ self._fallback_counters["total_fallbacks"] += 1
812
+ log(
813
+ "warn.masked_selection_fallback",
814
+ error=str(e),
815
+ step=t,
816
+ Q_shape=list(Q_t.shape),
817
+ K_shape=list(K_sel_t.shape),
818
+ V_shape=list(V_sel_t.shape),
819
+ ranges_shape=list(sel_ranges.shape) if sel_ranges is not None else None,
820
+ total_fails=self._fallback_counters["selection_mask_fails"],
821
+ )
822
+ # Fallback to gather SDPA
823
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
824
+ else:
825
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
826
+ win_len = min(self.w, kv.K_win.shape[2])
827
+
828
+ # M8: Assert causal masking - sliding window bounds in decode
829
+ total_tokens = kv.K_win.shape[2]
830
+ start_idx = total_tokens - win_len
831
+ end_idx = total_tokens
832
+ assert start_idx >= 0, (
833
+ f"Sliding window start index negative: start_idx={start_idx}, "
834
+ f"total_tokens={total_tokens}, win_len={win_len}"
835
+ )
836
+ assert end_idx <= total_tokens, (
837
+ f"Sliding window end exceeds cache: end_idx={end_idx} > total_tokens={total_tokens}"
838
+ )
839
+ assert win_len <= self.w, (
840
+ f"Window length exceeds max: win_len={win_len} > self.w={self.w}"
841
+ )
842
+
843
+ K_w = kv.K_win[:, :, start_idx:end_idx, :]
844
+ V_w = kv.V_win[:, :, start_idx:end_idx, :]
845
+ use_flash = (
846
+ env["fa2_all_eff"] or env["fa2_win_eff"] or env["fa2_cmp_eff"]
847
+ ) and not force_parity
848
+ if use_flash and (env["fa2_all_eff"] or env["fa2_win_eff"]):
849
+ try:
850
+ O_win = sliding_window_attention_fa2_decode(Q_t, kv.K_win, kv.V_win, self.w)
851
+ except Exception as e:
852
+ # M8: Fallback counter - Sliding FA2 failed
853
+ self._fallback_counters["sliding_fa2_fails"] += 1
854
+ self._fallback_counters["total_fallbacks"] += 1
855
+ log(
856
+ "warn.sliding_fa2_fallback",
857
+ error=str(e),
858
+ total_fails=self._fallback_counters["sliding_fa2_fails"],
859
+ )
860
+ # Fallback to standard attention
861
+ O_win = attention_bgh(
862
+ Q_t.contiguous(), K_w.contiguous(), V_w.contiguous(), causal=True
863
+ )
864
+ else:
865
+ O_win = attention_bgh(
866
+ Q_t.contiguous(), K_w.contiguous(), V_w.contiguous(), causal=True
867
+ )
868
+ S_cmp_t = kv.K_cmp.shape[2]
869
+
870
+ # M8: Assert causal masking - compressed bounds in decode
871
+ assert S_cmp_t >= 0, f"Compressed cache size negative: S_cmp_t={S_cmp_t}"
872
+ assert S_cmp_t <= kv.K_cmp.shape[2], (
873
+ f"Compressed range exceeds cache: S_cmp_t={S_cmp_t} > cache_size={kv.K_cmp.shape[2]}"
874
+ )
875
+
876
+ if use_flash and (env["fa2_all_eff"] or env["fa2_cmp_eff"]):
877
+ try:
878
+ O_cmp = compressed_attention_fa2_decode(Q_t, kv.K_cmp, kv.V_cmp, S_cmp_t)
879
+ except Exception as e:
880
+ # M8: Fallback counter - Compressed FA2 failed
881
+ self._fallback_counters["compressed_fa2_fails"] += 1
882
+ self._fallback_counters["total_fallbacks"] += 1
883
+ log(
884
+ "warn.compressed_fa2_fallback",
885
+ error=str(e),
886
+ total_fails=self._fallback_counters["compressed_fa2_fails"],
887
+ )
888
+ # Fallback to standard attention
889
+ O_cmp = attention_bgh(
890
+ Q_t.contiguous(),
891
+ kv.K_cmp[:, :, :S_cmp_t, :].contiguous(),
892
+ kv.V_cmp[:, :, :S_cmp_t, :].contiguous(),
893
+ causal=True,
894
+ )
895
+ else:
896
+ O_cmp = attention_bgh(
897
+ Q_t.contiguous(),
898
+ kv.K_cmp[:, :, :S_cmp_t, :].contiguous(),
899
+ kv.V_cmp[:, :, :S_cmp_t, :].contiguous(),
900
+ causal=True,
901
+ )
902
+ # Preserve dtype for gate input
903
+ q_gp = Q_t.mean(dim=2, dtype=Q_t.dtype)
904
+ if self._env_cache.get("gate_compile", False):
905
+ try:
906
+ fused = self._gate_fused_bg
907
+ if fused is None:
908
+ fused = _fused_gate_combine_bg
909
+ try:
910
+ fused = torch.compile(fused, mode="reduce-overhead") # type: ignore[attr-defined]
911
+ except Exception:
912
+ pass
913
+ self._gate_fused_bg = fused
914
+ O = fused(
915
+ q_gp,
916
+ O_cmp,
917
+ O_sel,
918
+ O_win,
919
+ self.gate.fc1.weight,
920
+ self.gate.fc1.bias,
921
+ self.gate.fc2.weight,
922
+ self.gate.fc2.bias,
923
+ float(self.gate_temp),
924
+ )
925
+ except Exception:
926
+ gates = self.gate(q_gp, tau=self.gate_temp)
927
+ if self._env_cache.get("stopgrad_gates", False):
928
+ gates = gates.detach()
929
+ self._update_gate_stats(gates)
930
+ try:
931
+ log(
932
+ "decode.gates",
933
+ mean=gates.mean(dim=(-1, -2)).tolist()
934
+ if gates.dim() >= 2
935
+ else gates.mean().item(),
936
+ std=gates.std(dim=(-1, -2)).tolist()
937
+ if gates.dim() >= 2
938
+ else gates.std().item(),
939
+ )
940
+ except Exception as e:
941
+ log("warn.decode.gate_log_fail", error=str(e))
942
+ w_cmp = gates[..., 0:1].unsqueeze(-1)
943
+ w_sel = gates[..., 1:2].unsqueeze(-1)
944
+ w_win = gates[..., 2:3].unsqueeze(-1)
945
+ O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win
946
+ else:
947
+ gates = self.gate(q_gp, tau=self.gate_temp)
948
+ if self._env_cache.get("stopgrad_gates", False):
949
+ gates = gates.detach()
950
+ self._update_gate_stats(gates)
951
+ try:
952
+ log(
953
+ "decode.gates",
954
+ mean=gates.mean(dim=(-1, -2)).tolist()
955
+ if gates.dim() >= 2
956
+ else gates.mean().item(),
957
+ std=gates.std(dim=(-1, -2)).tolist()
958
+ if gates.dim() >= 2
959
+ else gates.std().item(),
960
+ )
961
+ except Exception as e:
962
+ log("warn.decode.gate_log_fail", error=str(e))
963
+ w_cmp = gates[..., 0:1].unsqueeze(-1)
964
+ w_sel = gates[..., 1:2].unsqueeze(-1)
965
+ w_win = gates[..., 2:3].unsqueeze(-1)
966
+ O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win
967
+ O_heads = O.reshape(B, self.n_heads, self.d_v)
968
+ out_t = self.out(O_heads.reshape(B, 1, -1))
969
+ outs.append(out_t)
970
+ out = torch.cat(outs, dim=1)
971
+ return out, kv
972
+
973
+ def _forward_prefill_batched(self, x: torch.Tensor, kv: NSA_KV) -> tuple[torch.Tensor, NSA_KV]:
974
+ """
975
+ Vectorized prefill path.
976
+
977
+ Steps:
978
+ - Projections with RoPE(Q); RoPE applied to K before ϕ for compressed branch
979
+ - Cache updates for selection/window/compressed
980
+ - Batched p_cmp → p_slc → p_grp; top‑n ranges for all t
981
+ - Branch attentions (masked/packed per env flags), gating, projection
982
+ """
983
+ B, S, _ = x.shape
984
+ # Projections
985
+ _nvtx = self._env_cache.get("nvtx", False)
986
+ if _nvtx:
987
+ try:
988
+ import torch as _t
989
+
990
+ _t.cuda.nvtx.range_push("projections+rope")
991
+ except Exception:
992
+ _nvtx = False
993
+ Q_lin = self._shape_q(self.W_Q(x), B, S) # [B,S,G,h,Dk]
994
+ assert Q_lin.shape[:2] == (B, S)
995
+ # Apply RoPE to Q
996
+ pos = torch.arange(S, device=x.device)
997
+ Q = apply_rope(
998
+ Q_lin.view(B, S, self.n_heads, self.d_k).reshape(B, S, self.n_heads * self.d_k),
999
+ pos,
1000
+ scale=getattr(self, "rope_scale", 1.0),
1001
+ )
1002
+ Q = Q.view(B, S, self.n_heads, self.d_k).view(
1003
+ B, S, self.n_kv_groups, self.h_per_group, self.d_k
1004
+ )
1005
+ # K/V projections per branch
1006
+ K_sel = self._shape_kv(self.W_K_sel(x), B, S)
1007
+ V_sel = self._shape_kv(self.W_V_sel(x), B, S)
1008
+ K_win = self._shape_kv(self.W_K_win(x), B, S)
1009
+ V_win = self._shape_kv(self.W_V_win(x), B, S)
1010
+ K_cmp_raw = self._shape_kv(self.W_K_cmp(x), B, S)
1011
+ V_cmp_raw = self._shape_kv(self.W_V_cmp(x), B, S)
1012
+ G = self.n_kv_groups
1013
+ assert K_sel.shape[:3] == (B, G, S) and V_sel.shape[:3] == (B, G, S)
1014
+ assert K_win.shape[:3] == (B, G, S) and V_win.shape[:3] == (B, G, S)
1015
+ assert K_cmp_raw.shape[:3] == (B, G, S) and V_cmp_raw.shape[:3] == (B, G, S)
1016
+
1017
+ # Apply RoPE to per-branch K tensors (Q already has RoPE applied)
1018
+ pos_k = torch.arange(S, device=x.device)
1019
+ K_sel = apply_rope(K_sel, pos_k, scale=getattr(self, "rope_scale", 1.0))
1020
+ K_win = apply_rope(K_win, pos_k, scale=getattr(self, "rope_scale", 1.0))
1021
+ if _nvtx:
1022
+ try:
1023
+ _t.cuda.nvtx.range_pop()
1024
+ except Exception:
1025
+ pass
1026
+
1027
+ # Update caches (prefill uses full sequence projections)
1028
+ kv.update_selection_raw(K_sel, V_sel)
1029
+ # Build/refresh meta for selection and compressed mapping
1030
+ kv.meta = build_block_meta(
1031
+ seq_len=S, l=self.l, d=self.d, l_sel=self.l_sel, n_sel=self.n_sel, w=self.w
1032
+ )
1033
+ kv.update_window(K_win, V_win, self.w)
1034
+ if self.phi_type == "mlp":
1035
+ K_cmp, V_cmp = self._phi_apply_seq(
1036
+ K_cmp_raw, V_cmp_raw, pos=torch.arange(S, device=x.device)
1037
+ )
1038
+ else:
1039
+ K_cmp, V_cmp = avg_pool_phi_rope_kv(
1040
+ K_cmp_raw, V_cmp_raw, self.l, self.d, pos=torch.arange(S, device=x.device)
1041
+ )
1042
+ kv.update_compressed(K_cmp, V_cmp, self.l, self.d)
1043
+
1044
+ # One-time SDPA backend audit (opt-in via env)
1045
+ try:
1046
+ if (not self._sdpa_audited) and os.getenv("NSA_SDPA_AUDIT", "0").lower() in (
1047
+ "1",
1048
+ "true",
1049
+ "yes",
1050
+ ):
1051
+ self._audit_sdpa_backends_once(
1052
+ Q[:, :1],
1053
+ K_sel[:, :, : max(1, S // 8), :],
1054
+ V_sel[:, :, : max(1, S // 8), :],
1055
+ K_win[:, :, : max(1, S // 8), :],
1056
+ V_win[:, :, : max(1, S // 8), :],
1057
+ )
1058
+ except Exception:
1059
+ pass
1060
+
1061
+ # Selection scores (batched)
1062
+ scale = 1.0 / (self.d_k**0.5)
1063
+ if _nvtx:
1064
+ try:
1065
+ _t.cuda.nvtx.range_push("pcmp_all")
1066
+ except Exception:
1067
+ pass
1068
+ p_cmp_all = compute_pcmp_all(Q, kv.K_cmp, scale) # [B,S,G,h,S_cmp]
1069
+ if _nvtx:
1070
+ try:
1071
+ _t.cuda.nvtx.range_pop()
1072
+ _t.cuda.nvtx.range_push("map_pcmp_to_pslc")
1073
+ except Exception:
1074
+ pass
1075
+ p_slc_all = map_pcmp_to_pslc_batched(p_cmp_all, kv.meta) # [B,S,G,h,S_sel]
1076
+
1077
+ # M8: Optional Eq.9 verification in batched prefill
1078
+ if self._env_cache.get("verify_eq9", False):
1079
+ is_equiv, details = verify_mapping_equivalence(p_cmp_all, kv.meta)
1080
+ if not is_equiv:
1081
+ log(
1082
+ "error.eq9_verification_failed_prefill",
1083
+ msg="Eq.9 mapping verification failed in batched prefill",
1084
+ **details,
1085
+ )
1086
+ p_grp_all = p_slc_all.sum(dim=3) # [B,S,G,S_sel]
1087
+ log(
1088
+ "prefill.scores",
1089
+ B=B,
1090
+ S=S,
1091
+ S_cmp=int(kv.K_cmp.shape[2]),
1092
+ S_sel=int(kv.meta.sel_starts.numel()),
1093
+ )
1094
+
1095
+ # Batched top‑n → ranges for all positions
1096
+ if _nvtx:
1097
+ try:
1098
+ _t.cuda.nvtx.range_push("topk+ranges")
1099
+ except Exception:
1100
+ pass
1101
+ sel_ranges_all = select_topn_ranges_batched(
1102
+ p_grp_all, kv.meta, self.n_sel, S, True, 2
1103
+ ) # [B,S,G,n,2]
1104
+ if _nvtx:
1105
+ try:
1106
+ _t.cuda.nvtx.range_pop()
1107
+ _t.cuda.nvtx.range_push("branch_attn+gate")
1108
+ except Exception:
1109
+ pass
1110
+ # Update selection statistics for this prefill batch
1111
+ self._update_sel_stats_from_ranges(sel_ranges_all)
1112
+ if _nvtx:
1113
+ try:
1114
+ _t.cuda.nvtx.range_pop()
1115
+ except Exception:
1116
+ pass
1117
+
1118
+ # M8: Assert causal masking for batched selection (GPU-sync gated)
1119
+ strict_asserts = self._env_cache.get("strict_asserts", False)
1120
+ if strict_asserts and sel_ranges_all.numel() > 0:
1121
+ for t in range(S):
1122
+ t_ranges = sel_ranges_all[:, t] # [B,G,n,2]
1123
+ if t_ranges.numel() > 0:
1124
+ max_end = t_ranges[..., 1].max().item()
1125
+ assert max_end <= t + 1, (
1126
+ f"Batched selection violates causality at t={t}: max_end={max_end} > t+1={t + 1}. "
1127
+ f"Selection ranges cannot access future tokens."
1128
+ )
1129
+ log("prefill.select", n_sel=self.n_sel, l_sel=self.l_sel, ranges=sel_ranges_all)
1130
+
1131
+ # Branch attentions in parallel (parity-first for cmp/win, with optional masked SDPA gates)
1132
+ force_parity = self._env_cache.get("force_parity", False)
1133
+ fa2_all = self._env_cache.get("fa2_all_eff", False)
1134
+ fa2_win = self._env_cache.get("fa2_win_eff", False)
1135
+ fa2_cmp = self._env_cache.get("fa2_cmp_eff", False)
1136
+ use_cmp_mask = self._env_cache.get("use_cmp_mask", True) and not force_parity
1137
+ if (fa2_all or fa2_cmp) and not force_parity:
1138
+ try:
1139
+ O_cmp = compressed_attention_fa2(Q, kv.K_cmp, kv.V_cmp, self.l, self.d)
1140
+ except Exception as e:
1141
+ # M8: Fallback counter - Compressed FA2 failed in prefill
1142
+ self._fallback_counters["compressed_fa2_fails"] += 1
1143
+ self._fallback_counters["total_fallbacks"] += 1
1144
+ log(
1145
+ "warn.compressed_fa2_prefill_fallback",
1146
+ error=str(e),
1147
+ total_fails=self._fallback_counters["compressed_fa2_fails"],
1148
+ )
1149
+ # Fallback to masked SDPA
1150
+ from nsa.core.attention_kernels import batched_causal_attention_compressed_masked
1151
+
1152
+ O_cmp = batched_causal_attention_compressed_masked(
1153
+ Q, kv.K_cmp, kv.V_cmp, self.l, self.d
1154
+ )
1155
+ elif use_cmp_mask:
1156
+ from nsa.core.attention_kernels import batched_causal_attention_compressed_masked
1157
+
1158
+ O_cmp = batched_causal_attention_compressed_masked(
1159
+ Q, kv.K_cmp, kv.V_cmp, self.l, self.d
1160
+ )
1161
+ else:
1162
+ # Compressed per-t using the same kernel as sequential
1163
+ O_cmp = torch.zeros(
1164
+ (B, S, self.n_kv_groups, self.h_per_group, self.d_v),
1165
+ device=x.device,
1166
+ dtype=V_cmp.dtype,
1167
+ )
1168
+ S_cmp_full = kv.K_cmp.shape[2]
1169
+ for t in range(S):
1170
+ L = 0 if (t + 1) < self.l else min(((t + 1 - self.l) // self.d) + 1, S_cmp_full)
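+ # e.g. with l=32, d=16 at t=63 (64 tokens visible): L = (64-32)//16 + 1 = 3
+ # compressed tokens are causally visible to the query at position t.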
1171
+
1172
+ # M8: Assert causal masking - compressed tokens must respect position bounds
1173
+ if L > 0:
1174
+ # Check that compressed range doesn't exceed causal bounds
1175
+ assert L <= S_cmp_full, (
1176
+ f"Compressed range exceeds cache: L={L} > S_cmp_full={S_cmp_full} at t={t}"
1177
+ )
1178
+ # Verify causal constraint: at position t, can only see compressed tokens
1179
+ # that represent original positions up to t
1180
+ max_allowed_L = ((t + 1 - self.l) // self.d) + 1 if (t + 1) >= self.l else 0
1181
+ assert L <= max_allowed_L, (
1182
+ f"Compressed range violates causality: L={L} > max_allowed_L={max_allowed_L} "
1183
+ f"at t={t}. Compressed tokens represent future positions."
1184
+ )
1185
+
1186
+ q_t = Q[:, t].contiguous()
1187
+ k_t = kv.K_cmp[:, :, :L, :].contiguous()
1188
+ v_t = kv.V_cmp[:, :, :L, :].contiguous()
1189
+ O_cmp[:, t] = attention_bgh(q_t, k_t, v_t, causal=True)
1190
+ # Strict finite check and fallback
1191
+ if strict_asserts and not torch.isfinite(O_cmp).all():
1192
+ from nsa.core.attention_kernels import batched_causal_attention_compressed_masked
1193
+
1194
+ log("warn.prefill_cmp_nonfinite_fallback")
1195
+ O_cmp = batched_causal_attention_compressed_masked(
1196
+ Q, kv.K_cmp, kv.V_cmp, self.l, self.d
1197
+ )
1198
+ log("prefill.cmp", O_cmp=O_cmp)
1199
+
1200
+ # Selected ranges attention (prefer Triton if enabled; else packed/gather)
1201
+ use_sel_pack = self._env_cache.get("use_sel_pack", True) and not force_parity
1202
+ use_sel_varlen = self._env_cache.get("use_sel_varlen", False) and not force_parity
1203
+ # The cached flag already folds in the constructor's use_triton_sel; parenthesize so
+ # the parity override disables Triton selection here as it does on the decode path.
+ use_triton_sel = self._env_cache.get("use_triton_sel", False) and not force_parity
1206
+ force_sel_mask = self._env_cache.get("force_sel_mask", False) and not force_parity
1207
+ if force_sel_mask:
1208
+ try:
1209
+ O_sel = grouped_selection_attention_masked(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1210
+ log("prefill.sel.path", path="masked_forced")
1211
+ except Exception as e:
1212
+ # Fallback to gather SDPA
1213
+ self._fallback_counters["selection_mask_fails"] += 1
1214
+ self._fallback_counters["total_fallbacks"] += 1
1215
+ log("warn.masked_selection_prefill_forced_fallback",
1216
+ error=str(e),
1217
+ Q_shape=list(Q.shape) if hasattr(Q, 'shape') else list(Q_t.shape),
1218
+ K_shape=list(kv.K_sel.shape) if hasattr(kv, 'K_sel') else list(K_sel_t.shape),
1219
+ V_shape=list(kv.V_sel.shape) if hasattr(kv, 'V_sel') else list(V_sel_t.shape),
1220
+ ranges_shape=list(sel_ranges_all.shape) if 'sel_ranges_all' in locals() else list(sel_ranges.shape) if sel_ranges is not None else None,
1221
+ total_fails=self._fallback_counters["selection_mask_fails"])
1222
+ O_sel = grouped_selection_attention(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1223
+ elif use_triton_sel:
1224
+ try:
1225
+ from nsa.kernels.triton_sel_kernel import selection_attention_triton
1226
+
1227
+ O_sel = selection_attention_triton(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1228
+ log("prefill.sel.path", path="triton")
1229
+ except Exception as e:
1230
+ # M8: Fallback counter - Triton selection failed in prefill
1231
+ self._fallback_counters["selection_triton_fails"] += 1
1232
+ self._fallback_counters["total_fallbacks"] += 1
1233
+ log(
1234
+ "warn.triton_selection_prefill_fallback",
1235
+ error=str(e),
1236
+ Q_shape=list(Q.shape),
1237
+ K_shape=list(kv.K_sel.shape),
1238
+ V_shape=list(kv.V_sel.shape),
1239
+ ranges_shape=list(sel_ranges_all.shape),
1240
+ total_fails=self._fallback_counters["selection_triton_fails"],
1241
+ )
1242
+ # Fallback to packed SDPA
1243
+ O_sel = grouped_selection_attention_packed(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1244
+ elif use_sel_varlen:
1245
+ try:
1246
+ from nsa.core.attention_kernels import selection_attention_varlen_all
1247
+
1248
+ O_sel = selection_attention_varlen_all(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1249
+ log("prefill.sel.path", path="varlen")
1250
+ except Exception as e:
1251
+ # Fallback counter reuse for selection pack failures
1252
+ self._fallback_counters["selection_pack_fails"] += 1
1253
+ self._fallback_counters["total_fallbacks"] += 1
1254
+ log(
1255
+ "warn.selection_varlen_prefill_fallback",
1256
+ error=str(e),
1257
+ total_fails=self._fallback_counters["selection_pack_fails"],
1258
+ )
1259
+ # Fallback to packed SDPA
1260
+ O_sel = grouped_selection_attention_packed(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1261
+ log("prefill.sel.path", path="packed")
1262
+ elif use_sel_pack:
1263
+ try:
1264
+ O_sel = grouped_selection_attention_packed(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1265
+ except Exception as e:
1266
+ # M8: Fallback counter - Packed selection failed in prefill
1267
+ self._fallback_counters["selection_pack_fails"] += 1
1268
+ self._fallback_counters["total_fallbacks"] += 1
1269
+ log(
1270
+ "warn.packed_selection_prefill_fallback",
1271
+ error=str(e),
1272
+ total_fails=self._fallback_counters["selection_pack_fails"],
1273
+ )
1274
+ # Fallback to gather SDPA
1275
+ O_sel = grouped_selection_attention(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1276
+ elif self._env_cache.get("use_sel_mask", False):
1277
+ try:
1278
+ O_sel = grouped_selection_attention_masked(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1279
+ log("prefill.sel.path", path="masked")
1280
+ except Exception as e:
1281
+ # M8: Fallback counter - Masked selection failed in prefill
1282
+ self._fallback_counters["selection_mask_fails"] += 1
1283
+ self._fallback_counters["total_fallbacks"] += 1
1284
+ log(
1285
+ "warn.masked_selection_prefill_fallback",
1286
+ error=str(e),
1287
+ total_fails=self._fallback_counters["selection_mask_fails"],
1288
+ )
1289
+ # Fallback to gather SDPA
1290
+ O_sel = grouped_selection_attention(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1291
+ else:
1292
+ O_sel = grouped_selection_attention(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1293
+ log("prefill.sel.path", path="gather")
1294
+ if strict_asserts and not torch.isfinite(O_sel).all():
1295
+ log("warn.prefill_sel_nonfinite_fallback")
1296
+ O_sel = grouped_selection_attention(Q, kv.K_sel, kv.V_sel, sel_ranges_all)
1297
+ log("prefill.sel", O_sel=O_sel)
1298
+
1299
+ use_win_mask = self._env_cache.get("use_win_mask", True) and not force_parity
1300
+ if (fa2_all or fa2_win) and not force_parity:
1301
+ try:
1302
+ O_win = sliding_window_attention_fa2(Q, K_win, V_win, self.w)
1303
+ except Exception as e:
1304
+ # M8: Fallback counter - Sliding FA2 failed in prefill
1305
+ self._fallback_counters["sliding_fa2_fails"] += 1
1306
+ self._fallback_counters["total_fallbacks"] += 1
1307
+ log(
1308
+ "warn.sliding_fa2_prefill_fallback",
1309
+ error=str(e),
1310
+ total_fails=self._fallback_counters["sliding_fa2_fails"],
1311
+ )
1312
+ # Fallback to masked SDPA
1313
+ from nsa.core.attention_kernels import sliding_window_attention
1314
+
1315
+ O_win = sliding_window_attention(Q, K_win, V_win, self.w)
1316
+ elif use_win_mask:
1317
+ from nsa.core.attention_kernels import sliding_window_attention
1318
+
1319
+ O_win = sliding_window_attention(Q, K_win, V_win, self.w)
1320
+ else:
1321
+ # Sliding per-t using the same kernel as sequential
1322
+ O_win = torch.zeros(
1323
+ (B, S, self.n_kv_groups, self.h_per_group, self.d_v),
1324
+ device=x.device,
1325
+ dtype=V_win.dtype,
1326
+ )
1327
+ for t in range(S):
1328
+ end = t + 1
1329
+ start = max(0, end - self.w)
1330
+
1331
+ # M8: Assert causal masking - sliding window must not exceed current position
1332
+ assert end <= t + 1, (
1333
+ f"Sliding window violates causality: end={end} > t+1={t + 1} at position t={t}. "
1334
+ f"This indicates window is accessing future tokens."
1335
+ )
1336
+ assert start <= end, (
1337
+ f"Sliding window has invalid range: start={start} > end={end} at position t={t}."
1338
+ )
1339
+
1340
+ q_t = Q[:, t].contiguous()
1341
+ k_t = K_win[:, :, start:end, :].contiguous()
1342
+ v_t = V_win[:, :, start:end, :].contiguous()
1343
+ O_win[:, t] = attention_bgh(q_t, k_t, v_t, causal=True)
1344
+ if strict_asserts and not torch.isfinite(O_win).all():
1345
+ from nsa.core.attention_kernels import sliding_window_attention
1346
+
1347
+ log("warn.prefill_win_nonfinite_fallback")
1348
+ O_win = sliding_window_attention(Q, K_win, V_win, self.w)
1349
+ log("prefill.win", O_win=O_win)
1350
+
1351
+ # Gates and combine
1352
+ q_gp = Q.mean(dim=3) # [B,S,G,Dk]
1353
+ if self._env_cache.get("gate_compile", False):
1354
+ try:
1355
+ fused = self._gate_fused_bsg
1356
+ if fused is None:
1357
+ fused = _fused_gate_combine_bsg
1358
+ try:
1359
+ fused = torch.compile(fused, mode="reduce-overhead") # type: ignore[attr-defined]
1360
+ except Exception:
1361
+ pass
1362
+ self._gate_fused_bsg = fused
1363
+ O = fused(
1364
+ q_gp,
1365
+ O_cmp,
1366
+ O_sel,
1367
+ O_win,
1368
+ self.gate.fc1.weight,
1369
+ self.gate.fc1.bias,
1370
+ self.gate.fc2.weight,
1371
+ self.gate.fc2.bias,
1372
+ float(self.gate_temp),
1373
+ )
1374
+ except Exception:
1375
+ gates = self.gate(q_gp.reshape(B * S * self.n_kv_groups, self.d_k), tau=self.gate_temp)
1376
+ if self._env_cache.get("stopgrad_gates", False):
1377
+ gates = gates.detach()
1378
+ gates = gates.view(B, S, self.n_kv_groups, 3) # [B,S,G,3]
1379
+ self._update_gate_stats(gates)
1380
+ w_cmp = gates[..., 0:1].unsqueeze(3)
1381
+ w_sel = gates[..., 1:2].unsqueeze(3)
1382
+ w_win = gates[..., 2:3].unsqueeze(3)
1383
+ O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win # [B,S,G,h,Dv]
1384
+ else:
1385
+ gates = self.gate(q_gp.reshape(B * S * self.n_kv_groups, self.d_k), tau=self.gate_temp)
1386
+ if self._env_cache.get("stopgrad_gates", False):
1387
+ gates = gates.detach()
1388
+ gates = gates.view(B, S, self.n_kv_groups, 3) # [B,S,G,3]
1389
+ self._update_gate_stats(gates)
1390
+ w_cmp = gates[..., 0:1].unsqueeze(3)
1391
+ w_sel = gates[..., 1:2].unsqueeze(3)
1392
+ w_win = gates[..., 2:3].unsqueeze(3)
1393
+ O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win # [B,S,G,h,Dv]
1394
+
1395
+ # Output projection
1396
+ O_heads = O.reshape(B, S, self.n_kv_groups * self.h_per_group, self.d_v)
1397
+ out = self.out(O_heads.reshape(B, S, -1))
1398
+ log("prefill.out", out=out)
1399
+
1400
+ # Optional debug compare: sequential-style per-token recompute to measure MAE
1401
+ if self._env_cache.get("debug_compare", False):
1402
+ with torch.no_grad():
1403
+ # Compressed per-token recompute
1404
+ O_cmp_seq = torch.zeros_like(O_cmp)
1405
+ S_cmp = kv.K_cmp.shape[2]
1406
+ for t in range(S):
1407
+ L = 0 if (t + 1) < self.l else min(((t + 1 - self.l) // self.d) + 1, S_cmp)
1408
+
1409
+ # M8: Assert causal masking in debug recompute
1410
+ if L > 0:
1411
+ assert L <= S_cmp, (
1412
+ f"Debug compressed range exceeds cache: L={L} > S_cmp={S_cmp} at t={t}"
1413
+ )
1414
+
1415
+ q_t = Q[:, t].contiguous()
1416
+ k_t = kv.K_cmp[:, :, :L, :].contiguous()
1417
+ v_t = kv.V_cmp[:, :, :L, :].contiguous()
1418
+ O_cmp_seq[:, t] = attention_bgh(q_t, k_t, v_t, causal=True)
1419
+ cmp_mae = (O_cmp - O_cmp_seq).abs().mean().item()
1420
+ print(f"NSA-DBG cmp_mae={cmp_mae:.6e}")
1421
+
1422
+ # Sliding per-token recompute
1423
+ O_win_seq = torch.zeros_like(O_win)
1424
+ for t in range(S):
1425
+ end = t + 1
1426
+ start = max(0, end - self.w)
1427
+ q_t = Q[:, t].contiguous()
1428
+ k_t = K_win[:, :, start:end, :].contiguous()
1429
+ v_t = V_win[:, :, start:end, :].contiguous()
1430
+ O_win_seq[:, t] = attention_bgh(q_t, k_t, v_t, causal=True)
1431
+ win_mae = (O_win - O_win_seq).abs().mean().item()
1432
+ print(f"NSA-DBG win_mae={win_mae:.6e}")
1433
+
1434
+ # Final output recompute using seq per-branch; recompute gates explicitly so this
+ # path also works when the fused (compiled) gate combine above was taken and
+ # `gates` was never materialized.
+ gates_dbg = self.gate(
+ q_gp.reshape(B * S * self.n_kv_groups, self.d_k), tau=self.gate_temp
+ ).view(B, S, self.n_kv_groups, 3)
+ w_cmp_dbg = gates_dbg[..., 0:1].unsqueeze(-1)
+ w_sel_dbg = gates_dbg[..., 1:2].unsqueeze(-1)
+ w_win_dbg = gates_dbg[..., 2:3].unsqueeze(-1)
1438
+ O_seq = w_cmp_dbg * O_cmp_seq + w_sel_dbg * O_sel + w_win_dbg * O_win_seq
1439
+ O_heads_seq = O_seq.reshape(B, S, self.n_kv_groups * self.h_per_group, self.d_v)
1440
+ out_seq = self.out(O_heads_seq.reshape(B, S, -1))
1441
+ out_mae = (out - out_seq).abs().mean().item()
1442
+ print(f"NSA-DBG out_mae={out_mae:.6e}")
1443
+ return out, kv
1444
+
1445
+ def _audit_sdpa_backends_once(
1446
+ self,
1447
+ Q: torch.Tensor, # [B,1,G,h,Dk]
1448
+ K_sel: torch.Tensor, # [B,G,S,Dk]
1449
+ V_sel: torch.Tensor, # [B,G,S,Dv]
1450
+ K_win: torch.Tensor, # [B,G,S,Dk]
1451
+ V_win: torch.Tensor, # [B,G,S,Dv]
1452
+ ) -> None:
1453
+ if self._sdpa_audited:
1454
+ return
1455
+ try:
1456
+ from torch.nn.attention import sdpa_kernel
1457
+ except Exception:
1458
+ # Older torch, skip audit
1459
+ self._sdpa_audited = True
1460
+ return
1461
+ B = Q.shape[0]
1462
+ G = self.n_kv_groups
1463
+ h = self.h_per_group
1464
+ # Prepare a small representative slice per branch
1465
+ q = Q[:, 0] # [B,G,h,Dk]
1466
+ # Ensure contiguity
1467
+ q = q.contiguous()
1468
+ ks = K_sel.contiguous()
1469
+ vs = V_sel.contiguous()
1470
+ kw = K_win.contiguous()
1471
+ vw = V_win.contiguous()
1472
+
1473
+ def _probe(tag: str, k: torch.Tensor, v: torch.Tensor) -> str:
1474
+ try:
1475
+ with sdpa_kernel(enable_flash=True, enable_mem_efficient=False, enable_math=False):
1476
+ q2 = q.reshape(B * G * h, 1, self.d_k)
1477
+ k2 = (
1478
+ k.unsqueeze(2)
1479
+ .expand(B, G, h, k.shape[2], self.d_k)
1480
+ .reshape(B * G * h, k.shape[2], self.d_k)
1481
+ )
1482
+ v2 = (
1483
+ v.unsqueeze(2)
1484
+ .expand(B, G, h, v.shape[2], self.d_v)
1485
+ .reshape(B * G * h, v.shape[2], self.d_v)
1486
+ )
1487
+ _ = F.scaled_dot_product_attention(
1488
+ q2.contiguous(), k2.contiguous(), v2.contiguous(), is_causal=True
1489
+ )
1490
+ return "flash"
1491
+ except Exception:
1492
+ return "fallback"
1493
+
1494
+ try:
1495
+ b_sel = _probe("cmp/win(sel)", ks, vs)
1496
+ b_win = _probe("win", kw, vw)
1497
+ log("sdpa.audit", sel=b_sel, win=b_win)
1498
+ except Exception:
1499
+ pass
1500
+ self._sdpa_audited = True
1501
+
1502
+ def _forward_prefill_via_decode(
1503
+ self, x: torch.Tensor, kv: NSA_KV
1504
+ ) -> tuple[torch.Tensor, NSA_KV]:
1505
+ """Prefill by stepping decode one token at a time.
1506
+
1507
+ This path avoids recursion back into prefill and guarantees progress.
1508
+ """
1509
+ B, S, _ = x.shape
1510
+ outs = []
1511
+ for t in range(S):
1512
+ out_t, kv = self.forward(x[:, t : t + 1], kv, prefill=False)
1513
+ outs.append(out_t)
1514
+ return torch.cat(outs, dim=1), kv
1515
+
1516
+ def _forward_prefill_sequential(
1517
+ self, x: torch.Tensor, kv: NSA_KV
1518
+ ) -> tuple[torch.Tensor, NSA_KV]:
1519
+ """
1520
+ Reference prefill path (sequential per‑token), used for parity checks.
1521
+ """
1522
+ B, S, _ = x.shape
1523
+ # Projections
1524
+ Q_lin = self._shape_q(self.W_Q(x), B, S) # [B,S,G,h,Dk]
1525
+ pos = torch.arange(S, device=x.device)
1526
+ Q = apply_rope(
1527
+ Q_lin.view(B, S, self.n_heads, self.d_k).reshape(B, S, self.n_heads * self.d_k),
1528
+ pos,
1529
+ scale=getattr(self, "rope_scale", 1.0),
1530
+ )
1531
+ Q = Q.view(B, S, self.n_heads, self.d_k).view(
1532
+ B, S, self.n_kv_groups, self.h_per_group, self.d_k
1533
+ )
1534
+ K_sel = self._shape_kv(self.W_K_sel(x), B, S)
1535
+ V_sel = self._shape_kv(self.W_V_sel(x), B, S)
1536
+ K_win = self._shape_kv(self.W_K_win(x), B, S)
1537
+ V_win = self._shape_kv(self.W_V_win(x), B, S)
1538
+ K_cmp_raw = self._shape_kv(self.W_K_cmp(x), B, S)
1539
+ V_cmp_raw = self._shape_kv(self.W_V_cmp(x), B, S)
1540
+
1541
+ # Apply RoPE to per-branch K tensors to align with batched path
1542
+ pos_k = torch.arange(S, device=x.device)
1543
+ K_sel = apply_rope(K_sel, pos_k, scale=getattr(self, "rope_scale", 1.0))
1544
+ K_win = apply_rope(K_win, pos_k, scale=getattr(self, "rope_scale", 1.0))
1545
+
1546
+ kv.update_selection_raw(K_sel, V_sel)
1547
+ kv.meta = build_block_meta(
1548
+ seq_len=S, l=self.l, d=self.d, l_sel=self.l_sel, n_sel=self.n_sel, w=self.w
1549
+ )
1550
+ kv.update_window(K_win, V_win, self.w)
1551
+ if self.phi_type == "mlp":
1552
+ K_cmp, V_cmp = self._phi_apply_seq(
1553
+ K_cmp_raw, V_cmp_raw, pos=torch.arange(S, device=x.device)
1554
+ )
1555
+ else:
1556
+ K_cmp, V_cmp = avg_pool_phi_rope_kv(
1557
+ K_cmp_raw, V_cmp_raw, self.l, self.d, pos=torch.arange(S, device=x.device)
1558
+ )
1559
+ kv.update_compressed(K_cmp, V_cmp, self.l, self.d)
1560
+
1561
+ # Precompute p_grp_all batched for reuse per t
1562
+ scale = 1.0 / (self.d_k**0.5)
1563
+ p_cmp_all = compute_pcmp_all(Q, kv.K_cmp, scale) # [B,S,G,h,S_cmp]
1564
+ p_slc_all = map_pcmp_to_pslc_batched(p_cmp_all, kv.meta) # [B,S,G,h,S_sel]
1565
+ p_grp_all = p_slc_all.sum(dim=3) # [B,S,G,S_sel]
1566
+
1567
+ outs = []
1568
+ sel_ranges_accum: List[torch.Tensor] = []
1569
+ for t in range(S):
1570
+ p_grp = p_grp_all[:, t] # [B,G,S_sel]
1571
+ sel_ranges = select_topn_ranges(p_grp, kv.meta, self.n_sel, t, True, 2)
1572
+ sel_ranges_accum.append(sel_ranges)
1573
+ Q_t = Q[:, t]
1574
+ K_sel_t = kv.K_sel[:, :, : t + 1, :]
1575
+ V_sel_t = kv.V_sel[:, :, : t + 1, :]
1576
+ # Selection attention routing (mirror decode/batched semantics)
1577
+ force_parity = self._env_cache.get("force_parity", False)
1578
+ use_sel_pack = self._env_cache.get("use_sel_pack", True) and not force_parity
1579
+ use_triton_sel = self._env_cache.get("use_triton_sel", False) and not force_parity
1580
+ use_cuda_sel = self._env_cache.get("use_cuda_sel", False) and not force_parity
1581
+ force_sel_mask = self._env_cache.get("force_sel_mask", False) and not force_parity
1582
+ if force_sel_mask:
1583
+ try:
1584
+ O_sel_bt = grouped_selection_attention_masked(
1585
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1586
+ )
1587
+ O_sel = O_sel_bt[:, 0]
1588
+ log("prefill.sel.path", path="masked_forced")
1589
+ except Exception as e:
1590
+ self._fallback_counters["selection_mask_fails"] += 1
1591
+ self._fallback_counters["total_fallbacks"] += 1
1592
+ log("warn.masked_selection_prefill_forced_fallback",
1593
+ error=str(e),
1594
+ Q_shape=list(Q.shape) if hasattr(Q, 'shape') else list(Q_t.shape),
1595
+ K_shape=list(kv.K_sel.shape) if hasattr(kv, 'K_sel') else list(K_sel_t.shape),
1596
+ V_shape=list(kv.V_sel.shape) if hasattr(kv, 'V_sel') else list(V_sel_t.shape),
1597
+ ranges_shape=list(sel_ranges_all.shape) if 'sel_ranges_all' in locals() else list(sel_ranges.shape) if sel_ranges is not None else None,
1598
+ total_fails=self._fallback_counters["selection_mask_fails"])
1599
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
1600
+ elif use_triton_sel:
1601
+ try:
1602
+ from nsa.kernels.triton_sel_kernel import selection_attention_triton
1603
+
1604
+ O_sel_bt = selection_attention_triton(
1605
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1606
+ )
1607
+ O_sel = O_sel_bt[:, 0]
1608
+ log("prefill.sel.path", path="triton")
1609
+ except Exception as e:
1610
+ # Fallback counter - Triton selection failed (sequential prefill)
1611
+ self._fallback_counters["selection_triton_fails"] += 1
1612
+ self._fallback_counters["total_fallbacks"] += 1
1613
+ log(
1614
+ "warn.triton_selection_prefill_fallback",
1615
+ error=str(e),
1616
+ total_fails=self._fallback_counters["selection_triton_fails"],
1617
+ )
1618
+ # Fallback to packed SDPA
1619
+ O_sel_bt = grouped_selection_attention_packed(
1620
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1621
+ )
1622
+ O_sel = O_sel_bt[:, 0]
1623
+ elif use_cuda_sel:
1624
+ try:
1625
+ from nsa.kernels.cuda_sel_kernel import selection_attention_cuda
1626
+
1627
+ O_sel_bt = selection_attention_cuda(
1628
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1629
+ )
1630
+ O_sel = O_sel_bt[:, 0]
1631
+ except Exception as e:
1632
+ # Fallback counter - CUDA selection failed (sequential prefill)
1633
+ self._fallback_counters["selection_cuda_fails"] += 1
1634
+ self._fallback_counters["total_fallbacks"] += 1
1635
+ log(
1636
+ "warn.cuda_selection_prefill_fallback",
1637
+ error=str(e),
1638
+ total_fails=self._fallback_counters["selection_cuda_fails"],
1639
+ )
1640
+ # Fallback to packed SDPA
1641
+ O_sel_bt = grouped_selection_attention_packed(
1642
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1643
+ )
1644
+ O_sel = O_sel_bt[:, 0]
1645
+ elif use_sel_pack:
1646
+ try:
1647
+ O_sel_bt = grouped_selection_attention_packed(
1648
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1649
+ )
1650
+ O_sel = O_sel_bt[:, 0]
1651
+ log("prefill.sel.path", path="packed")
1652
+ except Exception as e:
1653
+ # Fallback counter - Packed selection failed (sequential prefill)
1654
+ self._fallback_counters["selection_pack_fails"] += 1
1655
+ self._fallback_counters["total_fallbacks"] += 1
1656
+ log(
1657
+ "warn.packed_selection_prefill_fallback",
1658
+ error=str(e),
1659
+ total_fails=self._fallback_counters["selection_pack_fails"],
1660
+ )
1661
+ # Fallback to gather SDPA
1662
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
1663
+ elif self._env_cache.get("use_sel_mask", False) and not force_parity:
1664
+ try:
1665
+ O_sel_bt = grouped_selection_attention_masked(
1666
+ Q_t.unsqueeze(1), K_sel_t, V_sel_t, sel_ranges.unsqueeze(1)
1667
+ )
1668
+ O_sel = O_sel_bt[:, 0]
1669
+ log("prefill.sel.path", path="masked")
1670
+ except Exception as e:
1671
+ # Fallback counter - Masked selection failed (sequential prefill)
1672
+ self._fallback_counters["selection_mask_fails"] += 1
1673
+ self._fallback_counters["total_fallbacks"] += 1
1674
+ log(
1675
+ "warn.masked_selection_prefill_fallback",
1676
+ error=str(e),
1677
+ total_fails=self._fallback_counters["selection_mask_fails"],
1678
+ )
1679
+ # Fallback to gather SDPA
1680
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
1681
+ else:
1682
+ O_sel = self._sdpa_over_ranges(Q_t, K_sel_t, V_sel_t, sel_ranges)
1683
+ win_len = min(self.w, t + 1)
1684
+ K_w = kv.K_win[:, :, t + 1 - win_len : t + 1, :]
1685
+ V_w = kv.V_win[:, :, t + 1 - win_len : t + 1, :]
1686
+ O_win = attention_bgh(Q_t.contiguous(), K_w.contiguous(), V_w.contiguous(), causal=True)
1687
+ S_cmp_t = 0 if (t + 1) < self.l else (t + 1 - self.l) // self.d + 1
1688
+ O_cmp = attention_bgh(
1689
+ Q_t.contiguous(),
1690
+ kv.K_cmp[:, :, :S_cmp_t, :].contiguous(),
1691
+ kv.V_cmp[:, :, :S_cmp_t, :].contiguous(),
1692
+ causal=True,
1693
+ )
1694
+ q_gp = Q_t.mean(dim=2, dtype=Q_t.dtype)
1695
+ gates = self.gate(q_gp, tau=self.gate_temp)
1696
+ if self._env_cache.get("stopgrad_gates", False):
1697
+ gates = gates.detach()
1698
+
1699
+ # Update gate statistics for M8 monitoring (accumulate across steps)
1700
+ self._update_gate_stats(gates)
1701
+
1702
+ w_cmp = gates[..., 0:1].unsqueeze(-1)
1703
+ w_sel = gates[..., 1:2].unsqueeze(-1)
1704
+ w_win = gates[..., 2:3].unsqueeze(-1)
1705
+ O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win
1706
+ O_heads = O.reshape(B, self.n_heads, self.d_v)
1707
+ out_t = self.out(O_heads.reshape(B, 1, -1))
1708
+ outs.append(out_t)
1709
+ out = torch.cat(outs, dim=1)
1710
+ # Aggregate selection stats across all t in this prefill (sequential path)
1711
+ try:
1712
+ if sel_ranges_accum:
1713
+ # Stack to [T,B,G,n,2] then permute to [B,T,G,n,2]
1714
+ rs = torch.stack(sel_ranges_accum, dim=0).permute(1, 0, 2, 3, 4)
1715
+ self._update_sel_stats_from_ranges(rs)
1716
+ except Exception:
1717
+ pass
1718
+ return out, kv
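For reference, a toy standalone sketch of the three-branch gate mixing performed above (made-up shapes; `gates` stands in for the small gate MLP's softmax output, which is uniform at initialization because its last layer is zero-initialized):

```python
import torch

B, G, h, Dv = 1, 2, 4, 8
gates = torch.softmax(torch.zeros(B, G, 3), dim=-1)           # zero logits -> uniform 1/3 per branch
O_cmp, O_sel, O_win = (torch.randn(B, G, h, Dv) for _ in range(3))
w_cmp, w_sel, w_win = (gates[..., i:i + 1].unsqueeze(-1) for i in range(3))
O = w_cmp * O_cmp + w_sel * O_sel + w_win * O_win             # [B, G, h, Dv]
assert O.shape == (B, G, h, Dv)
```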
1719
+
1720
+ def _sdpa_full(self, Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor) -> torch.Tensor:
1721
+ # Q: [B,G,h,Dk]; K/V: [B,G,S,D*] -> out [B,G,h,Dv]
1722
+ B, G, h, Dk = Q.shape
1723
+ S = K.shape[2]
1724
+ q = Q.reshape(B * G * h, 1, Dk).contiguous()
1725
+ k = K.unsqueeze(2).expand(B, G, h, S, Dk).reshape(B * G * h, S, Dk).contiguous()
1726
+ v = (
1727
+ V.unsqueeze(2)
1728
+ .expand(B, G, h, S, V.shape[-1])
1729
+ .reshape(B * G * h, S, V.shape[-1])
1730
+ .contiguous()
1731
+ )
1732
+ attn = F.scaled_dot_product_attention(q, k, v, is_causal=True)
1733
+ o = attn.squeeze(1).reshape(B, G, h, -1)
1734
+ return o
1735
+
1736
+ def _phi_apply_seq(
1737
+ self, K_raw: torch.Tensor, V_raw: torch.Tensor, pos: torch.Tensor
1738
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1739
+ """Apply learnable ϕ over the full sequence using depthwise Conv1d initialized to avg.
1740
+ Expects K_raw,V_raw: [B,G,S,D*]; returns [B,G,S_cmp,D*].
1741
+ """
1742
+ assert self.phi_k_conv is not None and self.phi_v_conv is not None
1743
+ B, G, S, Dk = K_raw.shape
1744
+ Dv = V_raw.shape[-1]
1745
+ K_rope = apply_rope(K_raw, pos, scale=getattr(self, "rope_scale", 1.0))
1746
+ Kx = K_rope.permute(0, 1, 3, 2).reshape(B * G, Dk, S)
1747
+ Vx = V_raw.permute(0, 1, 3, 2).reshape(B * G, Dv, S)
1748
+ Kc = self.phi_k_conv(Kx)
1749
+ Vc = self.phi_v_conv(Vx)
1750
+ S_cmp = Kc.shape[-1]
1751
+ K_cmp = Kc.reshape(B, G, Dk, S_cmp).permute(0, 1, 3, 2).contiguous()
1752
+ V_cmp = Vc.reshape(B, G, Dv, S_cmp).permute(0, 1, 3, 2).contiguous()
1753
+ return K_cmp, V_cmp
1754
+
1755
+ def _phi_apply_last(
1756
+ self, K_last: torch.Tensor, V_last: torch.Tensor, pos_last: torch.Tensor
1757
+ ) -> tuple[torch.Tensor, torch.Tensor]:
1758
+ """Emit a single compressed token from the last l raw tokens using Conv1d with kernel=l,stride=d.
1759
+ Inputs: [B,G,l,D*] -> Outputs: [B,G,1,D*].
1760
+ """
1761
+ assert self.phi_k_conv is not None and self.phi_v_conv is not None
1762
+ B, G, lwin, Dk = K_last.shape
1763
+ Dv = V_last.shape[-1]
1764
+ assert lwin == self.l, "decode emission expects exactly l tokens"
1765
+ K_rope = apply_rope(K_last, pos_last, scale=getattr(self, "rope_scale", 1.0))
1766
+ Kx = K_rope.permute(0, 1, 3, 2).reshape(B * G, Dk, lwin)
1767
+ Vx = V_last.permute(0, 1, 3, 2).reshape(B * G, Dv, lwin)
1768
+ Kc = self.phi_k_conv(Kx)
1769
+ Vc = self.phi_v_conv(Vx)
1770
+ K_cmp_new = Kc.reshape(B, G, Dk, 1).permute(0, 1, 3, 2).contiguous()
1771
+ V_cmp_new = Vc.reshape(B, G, Dv, 1).permute(0, 1, 3, 2).contiguous()
1772
+ return K_cmp_new, V_cmp_new
1773
+
1774
+ def _sdpa_over_ranges(
1775
+ self,
1776
+ Q: torch.Tensor,
1777
+ K: torch.Tensor,
1778
+ V: torch.Tensor,
1779
+ ranges: torch.Tensor,
1780
+ ) -> torch.Tensor:
1781
+ """
1782
+ SDPA over concatenated gathered tokens per (B,G) according to `ranges`.
1783
+
1784
+ Args:
1785
+ Q: [B,G,h,Dk]
1786
+ K: [B,G,S_kv,Dk]
1787
+ V: [B,G,S_kv,Dv]
1788
+ ranges: [B,G,n,2] start/end pairs
1789
+ Returns:
1790
+ [B,G,h,Dv]
1791
+ """
1792
+ # Concatenate gathered tokens per (B,G)
1793
+ B, G, h, Dk = Q.shape
1794
+ Dv = V.shape[-1]
1795
+ outs = []
1796
+ S_kv = K.shape[2]
1797
+ strict_asserts = (
1798
+ self._env_cache.get("strict_asserts", False) if hasattr(self, "_env_cache") else False
1799
+ )
1800
+ for b in range(B):
1801
+ row = []
1802
+ for g in range(G):
1803
+ # Clamp and validate ranges to avoid invalid or oversized indices
1804
+ r = ranges[b, g].to(dtype=torch.int64, device=K.device) # [n,2]
1805
+ if r.numel() == 0:
1806
+ valid_pairs = torch.empty((0, 2), dtype=torch.int64, device=K.device)
1807
+ else:
1808
+ s = r[:, 0].clamp_(0, S_kv)
1809
+ e = r[:, 1].clamp_(0, S_kv)
1810
+ valid = e > s
1811
+ valid_pairs = torch.stack([s[valid], e[valid]], dim=-1)
1812
+
1813
+ # M8: Assert bounds for gathered ranges (GPU-sync gated)
1814
+ if strict_asserts and valid_pairs.numel() > 0:
1815
+ max_end = valid_pairs[:, 1].max().item()
1816
+ assert max_end <= S_kv, (
1817
+ f"Selection range exceeds sequence length: max_end={max_end} > S_kv={S_kv} "
1818
+ f"at batch={b}, group={g}."
1819
+ )
1820
+ # Build a boolean mask over S_kv to gather selected tokens (limits worst-case size)
1821
+ if valid_pairs.numel() > 0:
1822
+ m = torch.zeros((S_kv,), dtype=torch.bool, device=K.device)
1823
+ for s_e in valid_pairs:
1824
+ s_i = int(s_e[0].item())
1825
+ e_i = int(s_e[1].item())
1826
+ if e_i > s_i:
1827
+ m[s_i:e_i] = True
1828
+ idx = m.nonzero(as_tuple=False).squeeze(-1)
1829
+ else:
1830
+ idx = torch.empty((0,), dtype=torch.int64, device=K.device)
1831
+ k = (
1832
+ K[b, g, idx]
1833
+ if idx.numel() > 0
1834
+ else torch.zeros((1, Dk), device=K.device, dtype=K.dtype)
1835
+ )
1836
+ v = (
1837
+ V[b, g, idx]
1838
+ if idx.numel() > 0
1839
+ else torch.zeros((1, Dv), device=K.device, dtype=V.dtype)
1840
+ )
1841
+ q = Q[b, g] # [h,Dk]
1842
+ attn = F.scaled_dot_product_attention(
1843
+ q.unsqueeze(0).contiguous(),
1844
+ k.unsqueeze(0).contiguous(),
1845
+ v.unsqueeze(0).contiguous(),
1846
+ is_causal=True,
1847
+ )
1848
+ row.append(attn.squeeze(0)) # [h,Dv]
1849
+ outs.append(torch.stack(row, dim=0)) # [G,h,Dv]
1850
+ return torch.stack(outs, dim=0) # [B,G,h,Dv]
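As a standalone illustration of the gather-then-SDPA idea behind `_sdpa_over_ranges` (hypothetical shapes; the repo method additionally clamps ranges, handles the B/G dimensions, and applies its causal flag):

```python
import torch
import torch.nn.functional as F

S_kv, Dk, Dv, h = 32, 64, 64, 2
K = torch.randn(S_kv, Dk)
V = torch.randn(S_kv, Dv)
q = torch.randn(h, Dk)                                    # one query vector per head
ranges = [(0, 8), (16, 24)]                               # selected [start, end) token spans
idx = torch.cat([torch.arange(s, e) for s, e in ranges])  # gather the selected tokens
o = F.scaled_dot_product_attention(
    q.unsqueeze(0), K[idx].unsqueeze(0), V[idx].unsqueeze(0)
)
assert o.shape == (1, h, Dv)
```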
nsa/core/packing.py ADDED
@@ -0,0 +1,114 @@
1
+ from __future__ import annotations
2
+ from typing import List
3
+
4
+ import torch
5
+
6
+
7
+ def compute_sliding_lengths(S: int, w: int, device: torch.device) -> torch.Tensor:
8
+ """
9
+ Return per-row window lengths for sliding attention: L_t = min(w, t+1)
10
+ Shape: [S]
11
+ """
12
+ tpos = torch.arange(S, device=device)
13
+ return (tpos + 1).clamp_max(w)
14
+
15
+
16
+ def compute_compressed_lengths(
17
+ S: int, l: int, d: int, S_cmp: int, device: torch.device
18
+ ) -> torch.Tensor:
19
+ """
20
+ Return per-row valid compressed lengths: num_cmp(t)
21
+ Shape: [S]
22
+ """
23
+ tpos = torch.arange(S, device=device)
24
+ return torch.where(tpos + 1 < l, 0, ((tpos + 1 - l) // d) + 1).clamp(min=0, max=S_cmp)
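A quick sanity check of the two length helpers above (values worked out by hand for small settings):

```python
import torch
from nsa.core.packing import compute_sliding_lengths, compute_compressed_lengths

dev = torch.device("cpu")
print(compute_sliding_lengths(10, 4, dev).tolist())
# [1, 2, 3, 4, 4, 4, 4, 4, 4, 4]
print(compute_compressed_lengths(10, l=4, d=2, S_cmp=4, device=dev).tolist())
# [0, 0, 0, 1, 1, 2, 2, 3, 3, 4]
```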
25
+
26
+
27
+ def build_length_buckets(lengths: torch.Tensor) -> List[torch.Tensor]:
28
+ """
29
+ Group row indices by identical length.
30
+ Args:
31
+ lengths: [S] int tensor
32
+ Returns:
33
+ List of index tensors, one per unique length (descending by length)
34
+ """
35
+ if lengths.numel() == 0:
36
+ return []
37
+ unique = torch.unique(lengths, sorted=True)
38
+ # sort descending so larger buckets processed first
39
+ unique = torch.flip(unique, dims=[0])
40
+ buckets: List[torch.Tensor] = []
41
+ for L in unique.tolist():
42
+ idx = torch.nonzero(lengths == int(L), as_tuple=False).flatten()
43
+ buckets.append(idx)
44
+ return buckets
45
+
46
+
47
+ def build_cu_seqlens_for_buckets(bucket_lengths: torch.Tensor) -> torch.Tensor:
48
+ """
49
+ Build cumulative sequence lengths (cu_seqlens) for varlen APIs from a vector of lengths.
50
+ Args:
51
+ bucket_lengths: [N] lengths per row in a bucket
52
+ Returns:
53
+ cu_seqlens: [N+1] with cu_seqlens[0]=0 and cu_seqlens[i+1]=sum_{j<=i} len[j]
54
+ """
55
+ if bucket_lengths.numel() == 0:
56
+ return torch.zeros((1,), dtype=torch.int32, device=bucket_lengths.device)
57
+ cs = torch.zeros((bucket_lengths.numel() + 1,), dtype=torch.int32, device=bucket_lengths.device)
58
+ cs[1:] = torch.cumsum(bucket_lengths.to(dtype=torch.int32), dim=0)
59
+ return cs
60
+
61
+
62
+ def pack_batch_by_lengths(
63
+ x: torch.Tensor, lengths: torch.Tensor
64
+ ) -> tuple[torch.Tensor, torch.Tensor]:
65
+ """
66
+ Pack a batch of padded rows into a contiguous buffer with cu_seqlens.
67
+
68
+ Args:
69
+ x: [B,S_max,D]
70
+ lengths: [B] valid lengths per row
71
+ Returns:
72
+ (packed: [sum(lengths), D], cu_seqlens: [B+1])
73
+ """
74
+ device = x.device
75
+ B, S_max, D = x.shape
76
+ assert lengths.shape[0] == B
77
+ cu = build_cu_seqlens_for_buckets(lengths.to(torch.int32))
78
+ N = int(cu[-1].item())
79
+ packed = torch.empty((N, D), dtype=x.dtype, device=device)
80
+ write = 0
81
+ for b in range(B):
82
+ L = int(lengths[b].item())
83
+ if L > 0:
84
+ packed[write : write + L] = x[b, :L]
85
+ write += L
86
+ return packed, cu
87
+
88
+
89
+ def unpack_packed_to_padded(
90
+ packed: torch.Tensor, cu_seqlens: torch.Tensor, S_max: int
91
+ ) -> tuple[torch.Tensor, torch.Tensor]:
92
+ """
93
+ Unpack a packed buffer back to padded batch and mask.
94
+
95
+ Args:
96
+ packed: [N,D]
97
+ cu_seqlens: [B+1]
98
+ S_max: target padded length
99
+ Returns:
100
+ (padded [B,S_max,D], mask [B,S_max])
101
+ """
102
+ device = packed.device
103
+ B = cu_seqlens.shape[0] - 1
104
+ D = packed.shape[-1]
105
+ padded = torch.zeros((B, S_max, D), dtype=packed.dtype, device=device)
106
+ mask = torch.zeros((B, S_max), dtype=torch.bool, device=device)
107
+ for b in range(B):
108
+ start = int(cu_seqlens[b].item())
109
+ end = int(cu_seqlens[b + 1].item())
110
+ L = end - start
111
+ if L > 0:
112
+ padded[b, :L] = packed[start:end]
113
+ mask[b, :L] = True
114
+ return padded, mask
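A minimal round-trip sketch for the packing helpers above (uses the `nsa.core.packing` path shown in this diff):

```python
import torch
from nsa.core.packing import pack_batch_by_lengths, unpack_packed_to_padded

x = torch.randn(3, 8, 4)                     # [B, S_max, D]
lengths = torch.tensor([8, 5, 0])            # valid tokens per row
packed, cu = pack_batch_by_lengths(x, lengths)
print(packed.shape, cu.tolist())             # torch.Size([13, 4]) [0, 8, 13, 13]
padded, mask = unpack_packed_to_padded(packed, cu, S_max=8)
assert torch.equal(padded[1, :5], x[1, :5]) and int(mask.sum()) == 13
```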
nsa/core/rope.py ADDED
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+
5
+
6
+ def build_inv_freq(
7
+ dim: int, base: float = 10000.0, device: torch.device | None = None
8
+ ) -> torch.Tensor:
9
+ assert dim % 2 == 0, "RoPE requires even dimension"
10
+ half = dim // 2
11
+ idx = torch.arange(half, device=device, dtype=torch.float32)
12
+ inv_freq = base ** (-2 * idx / dim)
13
+ return inv_freq # [half]
14
+
15
+
16
+ def apply_rope(
17
+ x: torch.Tensor,
18
+ pos: torch.Tensor,
19
+ base: float = 10000.0,
20
+ *,
21
+ scale: float = 1.0,
22
+ ) -> torch.Tensor:
23
+ """
24
+ Apply rotary position embeddings along the last dimension.
25
+
26
+ x: [..., S, D] tensor with even D
27
+ pos: [S] or [..., S] integer positions
28
+ returns: same shape as x
29
+ """
30
+ D = x.shape[-1]
31
+ assert D % 2 == 0, "RoPE requires even dimension"
32
+ device = x.device
33
+ inv_freq = build_inv_freq(D, base=base, device=device) # [D/2]
34
+ # pos shape broadcasting to [..., S, D/2]
35
+ while pos.dim() < x.dim() - 1:
36
+ pos = pos.unsqueeze(0)
37
+ # Simple NTK/YARN-style extension via position scaling: effective_pos = pos / scale
38
+ if scale <= 0:
39
+ scale = 1.0
40
+ # Compute angles in float32 for accuracy, then cast sin/cos to input dtype to preserve dtype end-to-end
41
+ angles = (pos.to(torch.float32) / float(scale)).unsqueeze(
42
+ -1
43
+ ) * inv_freq # [..., S, D/2] (float32)
44
+ sin = torch.sin(angles).to(dtype=x.dtype)
45
+ cos = torch.cos(angles).to(dtype=x.dtype)
46
+ x_2 = x.view(*x.shape[:-1], D // 2, 2)
47
+ x0, x1 = x_2[..., 0], x_2[..., 1]
48
+ y0 = x0 * cos - x1 * sin
49
+ y1 = x0 * sin + x1 * cos
50
+ y = torch.stack((y0, y1), dim=-1).view_as(x)
51
+ return y
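A small usage sketch for `apply_rope`; norm preservation is a basic property of the per-pair rotation:

```python
import torch
from nsa.core.rope import apply_rope

x = torch.randn(2, 16, 64)        # [..., S, D] with even D
pos = torch.arange(16)
y = apply_rope(x, pos)
assert y.shape == x.shape
assert torch.allclose(y.norm(dim=-1), x.norm(dim=-1), atol=1e-4)
```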
nsa/core/selection_scorer.py ADDED
@@ -0,0 +1,759 @@
1
+ from __future__ import annotations
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from .block_index import BlockMeta
9
+
10
+
11
+ def compute_pcmp(Q: torch.Tensor, K_cmp: torch.Tensor, scale: float) -> torch.Tensor:
12
+ # Q: [G,h,Dk] (implicit B=1) or [B,G,h,Dk]; K_cmp: [B,G,S_cmp,Dk]
13
+ if Q.dim() == 3:
14
+ # Q: [G,h,Dk]; K_cmp: [1,G,S_cmp,Dk] (implicit B=1)
15
+ G, h, Dk = Q.shape
16
+ S_cmp = K_cmp.shape[2]
17
+ q = Q.reshape(G * h, 1, Dk)
18
+ # Expand K over heads without materializing copies
19
+ k = (
20
+ K_cmp[0]
21
+ .unsqueeze(1) # [G,1,S_cmp,Dk]
22
+ .expand(G, h, S_cmp, Dk)
23
+ .reshape(G * h, S_cmp, Dk)
24
+ )
25
+ logits = torch.bmm(q, k.transpose(1, 2)).squeeze(1) * scale
26
+ return F.softmax(logits, dim=-1).reshape(1, G, h, S_cmp)
27
+ else:
28
+ # Q: [B,G,h,Dk]; K_cmp: [B,G,S_cmp,Dk]
29
+ B, G, h, Dk = Q.shape
30
+ S_cmp = K_cmp.shape[2]
31
+ q = Q.reshape(B * G * h, 1, Dk)
32
+ # Expand K over heads without materializing copies
33
+ k = (
34
+ K_cmp.unsqueeze(2) # [B,G,1,S_cmp,Dk]
35
+ .expand(B, G, h, S_cmp, Dk)
36
+ .reshape(B * G * h, S_cmp, Dk)
37
+ )
38
+ logits = torch.bmm(q, k.transpose(1, 2)).squeeze(1) * scale
39
+ p = F.softmax(logits, dim=-1)
40
+ return p.reshape(B, G, h, S_cmp)
41
+
42
+
43
+ def compute_pcmp_all(Q_all: torch.Tensor, K_cmp: torch.Tensor, scale: float) -> torch.Tensor:
44
+ """
45
+ Q_all: [B,S,G,h,Dk], K_cmp: [B,G,S_cmp,Dk] -> p_cmp_all: [B,S,G,h,S_cmp]
46
+ """
47
+ use_mixed = os.getenv("NSA_P_CMP_MIXED", "0").lower() in ("1", "true", "yes", "on")
48
+ if use_mixed and Q_all.device.type == "cuda":
49
+ # Optional mixed-precision path (disabled by default). Computes logits and softmax
50
+ # under autocast to reduce memory bandwidth on large shapes. Output is upcast
51
+ # back to the original dtype to preserve downstream numerics.
52
+ orig_dtype = Q_all.dtype
53
+ with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
54
+ Kt = K_cmp.permute(0, 1, 3, 2) # [B,G,Dk,S_cmp]
55
+ logits = torch.einsum("bsghd,bgdc->bsghc", Q_all, Kt) * scale
56
+ p = F.softmax(logits, dim=-1)
57
+ return p.to(orig_dtype)
58
+ else:
59
+ # Baseline precise path
60
+ Kt = K_cmp.permute(0, 1, 3, 2) # [B,G,Dk,S_cmp]
61
+ logits = torch.einsum("bsghd,bgdc->bsghc", Q_all, Kt) * scale
62
+ return F.softmax(logits, dim=-1)
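A shape/normalization sanity check for `compute_pcmp_all` (random tensors; the default precise path is used when `NSA_P_CMP_MIXED` is unset):

```python
import torch
from nsa.core.selection_scorer import compute_pcmp_all

B, S, G, h, Dk, S_cmp = 1, 4, 2, 2, 64, 3
Q_all = torch.randn(B, S, G, h, Dk)
K_cmp = torch.randn(B, G, S_cmp, Dk)
p = compute_pcmp_all(Q_all, K_cmp, scale=Dk ** -0.5)
assert p.shape == (B, S, G, h, S_cmp)
assert torch.allclose(p.sum(dim=-1), torch.ones(B, S, G, h), atol=1e-5)
```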
63
+
64
+
65
+ def map_pcmp_to_pslc(p_cmp: torch.Tensor, meta: BlockMeta) -> torch.Tensor:
66
+ # p_cmp: [B,G,h,S_cmp]
67
+ B, G, h, S_cmp = p_cmp.shape
68
+ indptr = meta.M_csl_indptr
69
+ indices = meta.M_csl_indices
70
+ values = meta.M_csl_values
71
+ S_sel = meta.sel_starts.numel()
72
+ device = p_cmp.device
73
+ # Out-of-place accumulation to avoid in-place versioning issues under GC/DDP
74
+ acc = torch.zeros((B, G, h, S_sel), device=device, dtype=p_cmp.dtype)
75
+
76
+ # CSR row-wise multiply-add
77
+ for r in range(S_cmp):
78
+ start, end = int(indptr[r].item()), int(indptr[r + 1].item())
79
+ if start == end:
80
+ continue
81
+ cols = indices[start:end].to(device)
82
+ w = values[start:end].to(device=device, dtype=p_cmp.dtype) # [nnz_r]
83
+ contrib = p_cmp[..., r].unsqueeze(-1) * w # [B,G,h,nnz_r]
84
+ # Ensure Long dtype for scatter_add indices
85
+ idx = cols.view(1, 1, 1, -1).expand(B, G, h, -1).long()
86
+ acc = acc.scatter_add(-1, idx, contrib)
87
+ return acc
88
+
89
+
90
+ def map_pcmp_to_pslc_batched(p_cmp_all: torch.Tensor, meta: BlockMeta) -> torch.Tensor:
91
+ """
92
+ p_cmp_all: [B,S,G,h,S_cmp] -> p_slc_all: [B,S,G,h,S_sel]
93
+ Vectorized over B,S,G,h while looping CSR rows over S_cmp.
94
+ """
95
+ B, S, G, h, S_cmp = p_cmp_all.shape
96
+ device = p_cmp_all.device
97
+ S_sel = meta.sel_starts.numel()
98
+ if S_cmp == 0:
99
+ return torch.zeros((B, S, G, h, S_sel), device=device, dtype=p_cmp_all.dtype)
100
+ # COO sparse matmul: for each nnz (r,c,w), add p_cmp[..., r]*w to p_slc[..., c]
101
+ rows, cols = meta.M_csl_coo_indices.to(device)
102
+ w = meta.M_csl_coo_values.to(device=device, dtype=p_cmp_all.dtype)
103
+ # Filter mapping rows to those < current S_cmp to avoid out-of-bounds in early decode
104
+ valid_mask = rows < S_cmp
105
+ if valid_mask.dim() == 0:
106
+ valid_mask = valid_mask.unsqueeze(0)
107
+ rows = rows[valid_mask]
108
+ cols = cols[valid_mask]
109
+ w = w[valid_mask]
110
+ if rows.numel() == 0:
111
+ return torch.zeros((B, S, G, h, S_sel), device=device, dtype=p_cmp_all.dtype)
112
+ p_src = p_cmp_all[..., rows] * w # [B,S,G,h,nnz]
113
+ p_slc = torch.zeros((B, S, G, h, S_sel), device=device, dtype=p_cmp_all.dtype)
114
+ # Ensure Long dtype for scatter_add indices
115
+ idx = cols.view(1, 1, 1, 1, -1).expand(B, S, G, h, -1).long()
116
+ p_slc = p_slc.scatter_add(-1, idx, p_src)
117
+ return p_slc
118
+
119
+
120
+ def group_reduce_pslc(p_slc: torch.Tensor) -> torch.Tensor:
121
+ # Sum across heads in group (Eq. 10)
122
+ return p_slc.sum(dim=2)
123
+
124
+
125
+ def select_topn_ranges(
126
+ p_grp: torch.Tensor,
127
+ meta: BlockMeta,
128
+ n_top: int,
129
+ t_token: int,
130
+ force_init: bool = True,
131
+ force_local: int = 2,
132
+ _skip_validation: bool = False,
133
+ ) -> torch.Tensor:
134
+ """Select top-n block ranges with deterministic tie-breaking.
135
+
136
+ M8: Enhanced with robust deterministic tie-breaking for training reproducibility.
137
+ Uses scaled epsilon bias to prefer lower indices on score ties, ensuring
138
+ identical selection across runs with the same inputs.
139
+
140
+ Args:
141
+ p_grp: Group probabilities [B,G,S_sel]
142
+ meta: Block metadata with selection ranges
143
+ n_top: Number of top blocks to select
144
+ t_token: Current token position (0-indexed)
145
+ force_init: Whether to force include block 0
146
+ force_local: Number of local blocks to force include
147
+
148
+ Returns:
149
+ Selected ranges [B,G,n_top,2] as [start,end) pairs
150
+ """
151
+ # p_grp: [B,G,S_sel]
152
+ B, G, S_sel = p_grp.shape
153
+ device = p_grp.device
154
+ # Determine candidate blocks ≤ t
155
+ sel_starts = meta.sel_starts.to(device)
156
+ # mask future blocks
157
+ valid = sel_starts + meta.l_sel - 1 <= t_token
158
+ masked = p_grp.masked_fill(~valid.view(1, 1, -1), float("-inf"))
159
+ # force-includes set
160
+ forced_list = []
161
+ if force_init:
162
+ forced_list.append(torch.zeros((B, G), dtype=torch.int64, device=device))
163
+ if force_local > 0:
164
+ last_block = torch.clamp((torch.tensor(t_token, device=device) // meta.l_sel), min=0)
165
+ for i in range(force_local):
166
+ forced_list.append(torch.clamp(last_block - i, min=0).expand(B, G))
167
+ forced_idx = (
168
+ torch.stack(forced_list, dim=-1)
169
+ if forced_list
170
+ else torch.empty((B, G, 0), device=device, dtype=torch.int64)
171
+ )
172
+ # Exclude forced from top-k candidates by setting their scores to -inf
173
+ if forced_idx.numel() > 0:
174
+ forced_mask = torch.zeros_like(masked, dtype=torch.bool)
175
+ forced_mask.scatter_(-1, forced_idx, True)
176
+ masked = masked.masked_fill(forced_mask, float("-inf"))
177
+ # pick remaining to fill up to n_top
178
+ k_rest = torch.clamp(torch.tensor(n_top - forced_idx.shape[-1], device=device), min=0).item()
179
+ if k_rest > 0:
180
+ # M8: Deterministic tie-breaker - prefer lower indices for reproducible selection
181
+ # Use a tiny, fixed bias in float32 space to avoid overwhelming scores in low-precision
182
+ # dtypes (e.g., bf16/FP16). We perform ranking in float32 regardless of input dtype.
183
+ tie_break_scale = torch.tensor(1e-8, device=device, dtype=torch.float32)
184
+ base_idx = torch.arange(S_sel, device=device, dtype=torch.float32).view(1, 1, S_sel)
185
+ composite = masked.to(torch.float32) - (base_idx * tie_break_scale)
186
+ # Ensure deterministic topk with sorted=True for consistent ordering
187
+ k_actual = min(k_rest, S_sel)
188
+ _, top_idx = torch.topk(composite, k=k_actual, dim=-1, largest=True, sorted=True)
189
+
190
+ # M8: Assert tie-breaking worked - check for potential numerical issues
191
+ if torch.is_grad_enabled():
192
+ # Only check during training when gradients are enabled
193
+ with torch.no_grad():
194
+ orig_scores = torch.gather(masked, -1, top_idx).to(torch.float32)
195
+ if orig_scores.numel() > 1:
196
+ # Check if adjacent scores are suspiciously close (potential tie-break failure)
197
+ score_diffs = torch.diff(orig_scores, dim=-1)
198
+ very_close = torch.abs(score_diffs) < (float(tie_break_scale.item()) * 0.1)
199
+ if very_close.any():
200
+ from nsa.core.debug import log
201
+
202
+ log(
203
+ "warn.selection_tiebreak",
204
+ msg="Close scores detected in selection - potential tie-break instability",
205
+ min_diff=float(torch.abs(score_diffs).min().item()),
206
+ tie_break_scale=float(tie_break_scale),
207
+ )
208
+ sel_idx = torch.cat([forced_idx, top_idx], dim=-1)
209
+ else:
210
+ sel_idx = forced_idx
211
+ # sort selected indices ascending for consistent range merging
212
+ sel_idx = torch.sort(sel_idx, dim=-1).values
213
+
214
+ # M8: Optional determinism validation (skip if called from validation itself)
215
+ if not _skip_validation and os.getenv("NSA_VALIDATE_SELECTION_DETERMINISM", "0").lower() in (
216
+ "1",
217
+ "true",
218
+ "yes",
219
+ ):
220
+ validate_selection_determinism(p_grp, meta, n_top, t_token)
221
+ # merge adjacent into contiguous ranges
222
+ ranges = []
223
+ for b in range(B):
224
+ bg = []
225
+ for g in range(G):
226
+ blocks = sel_starts[sel_idx[b, g]] # [k], sorted non-decreasing
227
+ # Deduplicate without extra sort (faster on GPU for small k)
228
+ blocks = torch.unique_consecutive(blocks)
229
+ if blocks.numel() == 0:
230
+ bg.append(torch.zeros((n_top, 2), dtype=torch.int32, device=device))
231
+ continue
232
+ cur_s = int(blocks[0].item())
233
+ cur_e = cur_s + meta.l_sel
234
+ merged: List[Tuple[int, int]] = []
235
+ for x in blocks[1:].tolist():
236
+ if x == cur_e: # adjacent
237
+ cur_e += meta.l_sel
238
+ else:
239
+ merged.append((cur_s, cur_e))
240
+ cur_s, cur_e = x, x + meta.l_sel
241
+ merged.append((cur_s, cur_e))
242
+ # pad/truncate to n_top
243
+ out = torch.zeros((n_top, 2), dtype=torch.int32, device=device)
244
+ for i, (s, e) in enumerate(merged[:n_top]):
245
+ e = min(e, t_token + 1)
246
+ out[i, 0] = s
247
+ out[i, 1] = e
248
+ bg.append(out)
249
+ ranges.append(torch.stack(bg, dim=0))
250
+ return torch.stack(ranges, dim=0) # [B,G,n_top,2]
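A standalone illustration of the index-bias tie-break idea used above (no `BlockMeta` needed; double precision is used here so the tiny bias stays representable):

```python
import torch

scores = torch.tensor([0.3, 0.5, 0.5, 0.1], dtype=torch.float64)   # indices 1 and 2 tie
bias = torch.arange(scores.numel(), dtype=torch.float64) * 1e-8
_, top = torch.topk(scores - bias, k=2, largest=True, sorted=True)
assert top.tolist() == [1, 2]                                       # lower index wins the tie
```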
251
+
252
+
253
+ # ===== Batched selection (prefill fast path) =====
254
+
255
+
256
+ def select_topn_ranges_batched(
257
+ p_grp_all: torch.Tensor, # [B,S,G,S_sel]
258
+ meta: BlockMeta,
259
+ n_top: int,
260
+ S: int,
261
+ force_init: bool = True,
262
+ force_local: int = 2,
263
+ ) -> torch.Tensor: # [B,S,G,n_ranges,2]
264
+ """
265
+ M8: Deterministic batched selection with enhanced tie-breaking:
266
+ - Mask future blocks per position t via block end ≤ t+1
267
+ - Force include block 0 and last k local blocks (dedup)
268
+ - Exclude forced from scored top‑k
269
+ - Robust deterministic tie‑break to lower index on equal scores
270
+ - Convert to merged contiguous [start,end) ranges clamped to ≤ t+1
271
+ - Validation hooks for training reproducibility
272
+ """
273
+ B, S_q, G, S_sel = p_grp_all.shape
274
+ device = p_grp_all.device
275
+
276
+ sel_starts = meta.sel_starts.to(device)
277
+ sel_ends = sel_starts + meta.l_sel
278
+ tpos = torch.arange(S, device=device).view(S, 1)
279
+ valid = sel_ends.view(1, -1) <= (tpos + 1) # [S,S_sel]
280
+ disallowed = ~valid
281
+ masked = p_grp_all.masked_fill(disallowed.view(1, S, 1, S_sel), float("-inf"))
282
+
283
+ # Forced blocks (dedup across 0 and locals)
284
+ forced_list = []
285
+ if force_init:
286
+ forced_list.append(torch.zeros((B, S, G, 1), dtype=torch.long, device=device))
287
+ if force_local > 0:
288
+ tpos1 = torch.arange(S, device=device)
289
+ last_block = (tpos1 // meta.l_sel).clamp_min(0)
290
+ for k in range(force_local):
291
+ idx = (last_block - k).clamp_min(0).view(1, S, 1, 1).expand(B, S, G, 1)
292
+ forced_list.append(idx)
293
+ forced = (
294
+ torch.cat(forced_list, dim=-1)
295
+ if forced_list
296
+ else torch.empty((B, S, G, 0), dtype=torch.long, device=device)
297
+ )
298
+ if forced.numel() > 0:
299
+ # Ensure ascending per trailing dim then drop duplicates consecutively
300
+ forced = torch.sort(forced, dim=-1).values
301
+ forced = torch.unique_consecutive(forced, dim=-1)
302
+
303
+ if forced.numel() > 0:
304
+ forced_mask = torch.zeros_like(masked, dtype=torch.bool)
305
+ forced_mask.scatter_(-1, forced, True)
306
+ masked = masked.masked_fill(forced_mask, float("-inf"))
307
+
308
+ # Deterministic top‑k using composite key with tiny index bias
309
+ k_rest = max(0, n_top - forced.shape[-1])
310
+ if k_rest > 0:
311
+ # M8: Deterministic tie-breaker - prefer lower indices; rank in float32 to avoid
312
+ # overwhelming biases under low-precision dtypes.
313
+ tie_break_scale = torch.tensor(1e-8, device=device, dtype=torch.float32)
314
+ base_idx = (
315
+ torch.arange(S_sel, device=device, dtype=torch.float32)
316
+ .view(1, 1, 1, S_sel)
317
+ .expand(B, S, G, S_sel)
318
+ )
319
+ composite = masked.to(torch.float32) - (base_idx * tie_break_scale)
320
+ # Ensure deterministic topk with explicit sorted=True for batched path
321
+ k_actual = min(k_rest, S_sel)
322
+ _, top_idx = torch.topk(composite, k=k_actual, dim=-1, largest=True, sorted=True)
323
+
324
+ # M8: Optional validation for tie-breaking effectiveness in training
325
+ if torch.is_grad_enabled() and k_actual > 1:
326
+ with torch.no_grad():
327
+ orig_scores = torch.gather(masked, -1, top_idx).to(torch.float32)
328
+ # Check last dimension for potential tie-break issues
329
+ score_diffs = torch.diff(orig_scores, dim=-1)
330
+ very_close = torch.abs(score_diffs) < (float(tie_break_scale.item()) * 0.1)
331
+ if very_close.any():
332
+ from nsa.core.debug import log
333
+
334
+ log(
335
+ "warn.batched_selection_tiebreak",
336
+ msg="Close scores in batched selection - potential instability",
337
+ batch_close_count=int(very_close.sum().item()),
338
+ tie_break_scale=float(tie_break_scale),
339
+ )
340
+ selected = torch.cat([forced, top_idx], dim=-1)
341
+ else:
342
+ selected = forced[..., :n_top]
343
+
344
+ # Keep only valid (≤ t) indices; drop disallowed fill-ins
345
+ valid_full = valid.view(1, S, 1, S_sel).expand(B, S, G, S_sel)
346
+ is_valid_pick = torch.gather(valid_full, -1, selected)
347
+ # Replace invalid with -1 sentinel
348
+ selected = torch.where(is_valid_pick, selected, torch.full_like(selected, -1))
349
+ # Special-case: if requested n_top ≥ number of valid blocks at t, select exactly all valid blocks [0..t]
350
+ num_valid = valid.sum(dim=1) # [S]
351
+ # Build ascending [0..S_sel-1] to pick prefix per t
352
+ all_idx = torch.arange(S_sel, device=device).view(1, 1, 1, S_sel).expand(B, S, G, S_sel)
353
+ pick_mask = all_idx < num_valid.view(1, S, 1, 1)
354
+ if n_top >= S_sel:
355
+ selected = torch.where(pick_mask, all_idx, torch.full_like(all_idx, -1))
356
+ selected = torch.sort(selected, dim=-1).values
357
+ # Env-gated GPU range conversion (v2) to remove Python loops on hot path
358
+ use_v2 = os.getenv("NSA_SEL_RANGES_V2", "1").lower() in ("1", "true", "yes")
359
+ if use_v2:
360
+ ranges = convert_indices_to_ranges_batched_v2(selected, meta, S)
361
+ else:
362
+ ranges = convert_indices_to_ranges_batched(selected, meta, S)
363
+ return ranges
364
+
365
+
366
+ def convert_indices_to_ranges_batched_dispatch(
367
+ indices: torch.Tensor,
368
+ meta: BlockMeta,
369
+ S: int,
370
+ ) -> torch.Tensor:
371
+ """
372
+ Dispatch helper mirroring production behavior: chooses v2 by default unless disabled.
373
+ Exposed for tests and tooling.
374
+ """
375
+ use_v2 = os.getenv("NSA_SEL_RANGES_V2", "1").lower() in ("1", "true", "yes")
376
+ if use_v2:
377
+ return convert_indices_to_ranges_batched_v2(indices, meta, S)
378
+ return convert_indices_to_ranges_batched(indices, meta, S)
379
+
380
+
381
+ def convert_indices_to_ranges_batched(
382
+ indices: torch.Tensor, # [B,S,G,k]
383
+ meta: BlockMeta,
384
+ S: int,
385
+ ) -> torch.Tensor: # [B,S,G,n_max,2]
386
+ B, S_q, G, k = indices.shape
387
+ device = indices.device
388
+ sel_starts = meta.sel_starts.to(device)
389
+
390
+ all_ranges = []
391
+ for b in range(B):
392
+ for t in range(S_q):
393
+ clamp_end = int(t) + 1
394
+ for g in range(G):
395
+ block_ids = [int(x) for x in indices[b, t, g].tolist() if int(x) >= 0]
396
+ spans = []
397
+ last_s, last_e = None, None
398
+ prev = None
399
+ for bid in block_ids:
400
+ # Skip invalid/out-of-range indices defensively
401
+ if bid < 0 or bid >= sel_starts.numel():
402
+ continue
403
+ if prev is not None and bid == prev:
404
+ continue
405
+ prev = bid
406
+ s0 = int(sel_starts[bid].item())
407
+ e0 = min(s0 + meta.l_sel, clamp_end)
408
+ if e0 <= s0:
409
+ continue
410
+ if last_s is None:
411
+ last_s, last_e = s0, e0
412
+ elif s0 == last_e:
413
+ last_e = e0
414
+ else:
415
+ spans.append((last_s, last_e))
416
+ last_s, last_e = s0, e0
417
+ if last_s is not None:
418
+ spans.append((last_s, last_e))
419
+ all_ranges.append(spans)
420
+
421
+ max_ranges = max((len(r) for r in all_ranges), default=0)
422
+ out = torch.zeros((B, S_q, G, max_ranges, 2), dtype=torch.int32, device=device)
423
+ idx = 0
424
+ for b in range(B):
425
+ for t in range(S_q):
426
+ for g in range(G):
427
+ spans = all_ranges[idx]
428
+ for i, (s0, e0) in enumerate(spans):
429
+ out[b, t, g, i, 0] = s0
430
+ out[b, t, g, i, 1] = e0
431
+ idx += 1
432
+ return out
433
+
434
+
435
+ def convert_indices_to_ranges_batched_v2(
436
+ indices: torch.Tensor, # [B,S,G,k], sorted asc, -1 padded
437
+ meta: BlockMeta,
438
+ S: int,
439
+ ) -> torch.Tensor: # [B,S,G,k,2] (padded with zero-length ranges)
440
+ """
441
+ Vectorized GPU range conversion with no Python loops.
442
+ - Treat equal and +1 successive block ids as a single merged run.
443
+ - Map runs to token [start, end) using sel_starts and l_sel.
444
+ - Clamp end to t+1 per row to preserve causality.
445
+ - Output is padded to k runs per row; zero-length ranges are encoded as [0,0].
446
+ """
447
+ # NVTX annotation support
448
+ _nvtx = os.getenv("NSA_NVTX", "0").lower() in ("1", "true", "yes")
449
+ if _nvtx:
450
+ try:
451
+ torch.cuda.nvtx.range_push("nsa.sel.ranges_v2")
452
+ except Exception:
453
+ _nvtx = False
454
+
455
+ device = indices.device
456
+ B, S_q, G, K = indices.shape
457
+ if K == 0:
458
+ return torch.zeros((B, S_q, G, 0, 2), dtype=torch.int32, device=device)
459
+
460
+ # Valid mask and prepared index tensor
461
+ if _nvtx:
462
+ try:
463
+ torch.cuda.nvtx.range_push("v2_run_detection")
464
+ except Exception:
465
+ pass
466
+
467
+ valid = indices.ge(0)
468
+ x = torch.where(valid, indices, torch.full_like(indices, -2)) # sentinel -2
469
+
470
+ # Identify run starts: first valid element or break in adjacency (including dedup collapse)
471
+ x_shift = torch.cat([torch.full_like(x[..., :1], -2), x[..., :-1]], dim=-1)
472
+ prev_valid = x_shift.ge(0)
473
+ diff = x - x_shift
474
+ adjacent_or_dup = (diff.eq(1) | diff.eq(0)) & prev_valid
475
+ run_start = valid & (~adjacent_or_dup | (~prev_valid))
476
+
477
+ if _nvtx:
478
+ try:
479
+ torch.cuda.nvtx.range_pop()
480
+ except Exception:
481
+ pass
482
+
483
+ # Row-local run ids [0..runs_per_row-1], -1 for invalid
484
+ run_id = run_start.to(torch.int32).cumsum(dim=-1) - 1
485
+ run_id = torch.where(valid, run_id, torch.full_like(run_id, -1))
486
+
487
+ # Number of runs per row and flattened row indexing
488
+ runs_per_row = run_start.sum(dim=-1, dtype=torch.int32) # [B,S,G]
489
+ N = B * S_q * G
490
+ runs_per_row_flat = runs_per_row.reshape(N)
491
+
492
+ # Build flattened per-run metadata
493
+ # Flatten last dim for selection
494
+ run_start_flat = run_start.reshape(-1, K)
495
+ x_flat = x.reshape(-1, K)
496
+ run_id_flat = run_id.reshape(-1, K)
497
+
498
+ # Indices (within last dim) where runs start per row
499
+ pos = torch.arange(K, device=device, dtype=torch.int32)
500
+ pos_flat = pos.view(1, K).expand(run_start_flat.shape[0], K)
501
+ start_pos_flat = pos_flat[run_start_flat]
502
+ # Corresponding block ids where runs start
503
+ start_blk_flat = x_flat[run_start_flat].to(torch.int32)
504
+
505
+ # Build unique global run ids by offsetting row-local run ids with row offsets
506
+ run_offsets = torch.cumsum(torch.nn.functional.pad(runs_per_row_flat, (1, 0)), dim=0)[
507
+ :-1
508
+ ] # [N]
509
+ # Row index per element (0..N-1)
510
+ row_ids = torch.arange(N, device=device, dtype=torch.int32)
511
+ row_ids_per_elem = row_ids.view(N, 1).expand(N, K)
512
+ # Global run id per element; -1 for invalid
513
+ global_rid = torch.where(
514
+ run_id_flat.ge(0),
515
+ run_id_flat + run_offsets.view(N, 1),
516
+ torch.full_like(run_id_flat, -1),
517
+ )
518
+ global_rid_valid = global_rid[run_id_flat.ge(0)] # [total_valid_elems]
519
+
520
+ # For each global run, compute max block id in that run (end block)
521
+ if _nvtx:
522
+ try:
523
+ torch.cuda.nvtx.range_push("v2_scatter_reduce")
524
+ except Exception:
525
+ pass
526
+
527
+ total_runs = int(runs_per_row_flat.sum().item())
528
+ if total_runs == 0:
529
+ if _nvtx:
530
+ try:
531
+ torch.cuda.nvtx.range_pop()
532
+ except Exception:
533
+ pass
534
+ return torch.zeros((B, S_q, G, K, 2), dtype=torch.int32, device=device)
535
+ max_blk = torch.full((total_runs,), -2, dtype=torch.int32, device=device)
536
+ # Values to reduce are block ids for valid elements
537
+ blk_vals = x_flat[run_id_flat.ge(0)].to(torch.int32)
538
+ max_blk.scatter_reduce_(
539
+ 0, global_rid_valid.to(torch.int64), blk_vals, reduce="amax", include_self=False
540
+ )
541
+
542
+ if _nvtx:
543
+ try:
544
+ torch.cuda.nvtx.range_pop()
545
+ except Exception:
546
+ pass
547
+
548
+ # Start block ids per run, collected in row order
549
+ start_blk_per_run = start_blk_flat # length == total_runs
550
+
551
+ # Map block ids to token starts/ends (guard invalid/out-of-range)
552
+ sel_starts = meta.sel_starts.to(device=device, dtype=torch.int32)
553
+ S_sel = int(sel_starts.numel())
554
+ l_sel = int(meta.l_sel)
555
+ valid_runs = (
556
+ (start_blk_per_run >= 0)
557
+ & (start_blk_per_run < S_sel)
558
+ & (max_blk >= 0)
559
+ & (max_blk < S_sel)
560
+ )
561
+ # Default zeros; fill only valid runs
562
+ start_tok_flat = torch.zeros_like(start_blk_per_run, dtype=torch.int32, device=device)
563
+ end_tok_flat = torch.zeros_like(max_blk, dtype=torch.int32, device=device)
564
+ if valid_runs.any():
565
+ start_tok_flat[valid_runs] = sel_starts[start_blk_per_run[valid_runs]]
566
+ end_tok_flat[valid_runs] = sel_starts[max_blk[valid_runs]] + l_sel
567
+
568
+ # Clamp end to t+1 per row (only meaningful for valid runs)
569
+ # Row t positions: [S] repeated over B,G
570
+ tpos = torch.arange(S, device=device, dtype=torch.int32)
571
+ t_rows = tpos.view(1, S, 1).expand(B, S, G).reshape(N) # [N]
572
+ # t per run: repeat per row by runs_per_row
573
+ t_per_run = torch.repeat_interleave(t_rows, runs_per_row_flat)
574
+ end_tok_flat = torch.minimum(end_tok_flat, (t_per_run + 1))
575
+
576
+ # Prepare output [B,S,G,K,2], fill zeros then scatter first runs_per_row entries per row
577
+ out = torch.zeros((B, S_q, G, K, 2), dtype=torch.int32, device=device)
578
+ # Positions within row to write (0..K-1): take row-local run_id at run starts
579
+ run_id_at_starts = (run_id.reshape(-1, K))[run_start_flat]
580
+ # Compute base index in flattened out for each run write
581
+ # Build linear indices for advanced indexing
582
+ # Map flat run order back to (row, pos)
583
+ row_of_run = torch.repeat_interleave(row_ids, runs_per_row_flat)
584
+ pos_in_row = run_id_at_starts # 0..runs_per_row[row]-1
585
+ b = (row_of_run // (S_q * G)).to(torch.int64)
586
+ rem = row_of_run % (S_q * G)
587
+ t = (rem // G).to(torch.int64)
588
+ g = (rem % G).to(torch.int64)
589
+ p = pos_in_row.to(torch.int64)
590
+ # Scatter only valid runs
591
+ if valid_runs.any():
592
+ vr = valid_runs.to(torch.bool)
593
+ b_v = b[vr]
594
+ t_v = t[vr]
595
+ g_v = g[vr]
596
+ p_v = p[vr]
597
+ out[b_v, t_v, g_v, p_v, 0] = start_tok_flat[vr].to(torch.int32)
598
+ out[b_v, t_v, g_v, p_v, 1] = end_tok_flat[vr].to(torch.int32)
599
+
600
+ if _nvtx:
601
+ try:
602
+ torch.cuda.nvtx.range_pop()
603
+ except Exception:
604
+ pass
605
+
606
+ return out
607
+
608
+
609
+ def map_pcmp_to_pslc_slow_path(p_cmp_all: torch.Tensor, meta: BlockMeta) -> torch.Tensor:
610
+ """
611
+ M8: Eq.9 slow path verifier - explicit mathematical computation.
612
+
613
+ This function implements the exact mathematical definition by using the
614
+ CSR mapping directly instead of recomputing overlaps. This ensures it
615
+ matches the fast path exactly.
616
+
617
+ Args:
618
+ p_cmp_all: [B,S,G,h,S_cmp] compressed probabilities
619
+ meta: Block metadata with overlap mapping
620
+
621
+ Returns:
622
+ p_slc_all: [B,S,G,h,S_sel] selection probabilities
623
+ """
624
+ B, S, G, h, S_cmp = p_cmp_all.shape
625
+ device = p_cmp_all.device
626
+ S_sel = meta.sel_starts.numel()
627
+
628
+ if S_cmp == 0:
629
+ return torch.zeros((B, S, G, h, S_sel), device=device, dtype=p_cmp_all.dtype)
630
+
631
+ # Use CSR mapping directly (same as fast path but with explicit loops)
632
+ p_slc_all = torch.zeros((B, S, G, h, S_sel), device=device, dtype=p_cmp_all.dtype)
633
+
634
+ indptr = meta.M_csl_indptr.to(device)
635
+ indices = meta.M_csl_indices.to(device)
636
+ values = meta.M_csl_values.to(device, dtype=p_cmp_all.dtype)
637
+
638
+ # For each compressed block (CSR row)
639
+ for cmp_i in range(min(S_cmp, len(indptr) - 1)):
640
+ start = int(indptr[cmp_i].item())
641
+ end = int(indptr[cmp_i + 1].item())
642
+
643
+ if start == end:
644
+ continue
645
+
646
+ # Get the selection blocks this compressed block contributes to
647
+ sel_cols = indices[start:end]
648
+ weights = values[start:end]
649
+
650
+ # Add weighted contribution to each selection block
651
+ for j, (sel_idx, weight) in enumerate(zip(sel_cols, weights)):
652
+ sel_idx = int(sel_idx.item())
653
+ if sel_idx < S_sel:
654
+ p_slc_all[..., sel_idx] += p_cmp_all[..., cmp_i] * float(weight.item())
655
+
656
+ return p_slc_all
657
+
658
+
659
+ def verify_mapping_equivalence(
660
+ p_cmp_all: torch.Tensor, meta: BlockMeta, rtol: float = 1e-5, atol: float = 1e-8
661
+ ) -> tuple[bool, dict]:
662
+ """
663
+ M8: Verify fast COO path matches slow mathematical path (Eq.9 verification).
664
+
665
+ Args:
666
+ p_cmp_all: Compressed probabilities to test
667
+ meta: Block metadata
668
+ rtol: Relative tolerance for comparison
669
+ atol: Absolute tolerance for comparison
670
+
671
+ Returns:
672
+ (is_equivalent, details): True if paths match, plus diagnostic info
673
+ """
674
+ # Only run verification if explicitly requested via env flag
675
+ if os.getenv("NSA_VERIFY_EQ9_MAPPING", "0").lower() not in ("1", "true", "yes"):
676
+ return True, {"status": "skipped", "reason": "NSA_VERIFY_EQ9_MAPPING not set"}
677
+
678
+ with torch.no_grad():
679
+ # Compute both paths
680
+ fast_result = map_pcmp_to_pslc_batched(p_cmp_all, meta)
681
+ slow_result = map_pcmp_to_pslc_slow_path(p_cmp_all, meta)
682
+
683
+ # Compare results
684
+ is_close = torch.allclose(fast_result, slow_result, rtol=rtol, atol=atol)
685
+
686
+ # Compute diagnostic metrics
687
+ abs_diff = (fast_result - slow_result).abs()
688
+ max_abs_diff = abs_diff.max().item()
689
+ mean_abs_diff = abs_diff.mean().item()
690
+ rel_diff = abs_diff / (slow_result.abs() + atol)
691
+ max_rel_diff = rel_diff.max().item()
692
+
693
+ details = {
694
+ "status": "verified" if is_close else "mismatch",
695
+ "max_abs_diff": max_abs_diff,
696
+ "mean_abs_diff": mean_abs_diff,
697
+ "max_rel_diff": max_rel_diff,
698
+ "shape": list(p_cmp_all.shape),
699
+ "rtol": rtol,
700
+ "atol": atol,
701
+ }
702
+
703
+ if not is_close:
704
+ from nsa.core.debug import log
705
+
706
+ log(
707
+ "error.eq9_mapping_mismatch",
708
+ msg="Fast COO path does not match slow mathematical path",
709
+ **details,
710
+ )
711
+
712
+ return is_close, details
713
+
714
+
715
+ def validate_selection_determinism(
716
+ p_grp: torch.Tensor, meta: BlockMeta, n_top: int, t_token: int, num_trials: int = 5
717
+ ) -> bool:
718
+ """Validate that selection is deterministic by running multiple times.
719
+
720
+ Args:
721
+ p_grp: Group probabilities [B,G,S_sel]
722
+ meta: Block metadata
723
+ n_top: Number of top blocks to select
724
+ t_token: Current token position
725
+ num_trials: Number of trials to test determinism
726
+
727
+ Returns:
728
+ True if all trials produce identical results
729
+ """
730
+ # Only run validation if explicitly requested via env flag
731
+ if os.getenv("NSA_VALIDATE_SELECTION_DETERMINISM", "0").lower() not in ("1", "true", "yes"):
732
+ return True
733
+
734
+ if p_grp.requires_grad:
735
+ # Don't validate during training to avoid affecting gradients
736
+ return True
737
+
738
+ with torch.no_grad():
739
+ results = []
740
+ for trial in range(num_trials):
741
+ ranges = select_topn_ranges(
742
+ p_grp.clone(), meta, n_top, t_token, True, 2, _skip_validation=True
743
+ )
744
+ results.append(ranges.clone())
745
+
746
+ # Check if all results are identical
747
+ for i in range(1, num_trials):
748
+ if not torch.equal(results[0], results[i]):
749
+ from nsa.core.debug import log
750
+
751
+ log(
752
+ "error.selection_nondeterministic",
753
+ msg=f"Selection non-deterministic: trial 0 != trial {i}",
754
+ trial_0_shape=list(results[0].shape),
755
+ trial_i_shape=list(results[i].shape),
756
+ )
757
+ return False
758
+
759
+ return True
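The verification hooks in this module are opt-in via environment variables; a minimal way to enable them (variable names taken from the code above):

```python
import os

os.environ["NSA_VERIFY_EQ9_MAPPING"] = "1"              # cross-check fast vs. slow Eq.9 mapping
os.environ["NSA_VALIDATE_SELECTION_DETERMINISM"] = "1"  # re-run selection and compare results
```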
nsa/data_pipeline.py ADDED
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ """Data pipeline utilities for streaming and local datasets.
3
+
4
+ Provides a FineWeb-Edu IterableDataset and simple local JSONL/TXT loaders.
5
+ This module is optional; scripts/train_showcase.py currently uses a simpler
6
+ loader in scripts/datasets. Migrate incrementally as needed.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+
12
+ import json
13
+ import os
14
+ from dataclasses import dataclass
15
+ from typing import Callable, Iterable, Iterator, List, Optional
16
+
17
+ Tokenizer = Callable[[str], List[int]]
18
+
19
+
20
+ @dataclass
21
+ class Shard:
22
+ mod: int = 1
23
+ rem: int = 0
24
+
25
+
26
+ def fineweb_stream_batches(
27
+ encode: Tokenizer,
28
+ seq_len: int,
29
+ batch_size: int,
30
+ shard: Shard = Shard(),
31
+ report_docs: int = 1000,
32
+ ) -> Iterator[List[List[int]]]:
33
+ try:
34
+ from datasets import Features, Value, load_dataset # type: ignore
35
+ except Exception as e:
36
+ raise RuntimeError("datasets package required. Install with: pip install datasets") from e
37
+
38
+ features = Features(
39
+ {
40
+ "text": Value("string"),
41
+ "id": Value("string"),
42
+ "dump": Value("string"),
43
+ "url": Value("string"),
44
+ "file_path": Value("string"),
45
+ "language": Value("string"),
46
+ "language_score": Value("float64"),
47
+ "token_count": Value("int64"),
48
+ "score": Value("float64"),
49
+ "int_score": Value("int64"),
50
+ }
51
+ )
52
+ ds = load_dataset("HuggingFaceFW/fineweb-edu", split="train", streaming=True, features=features)
53
+ buf: List[int] = []
54
+ batch: List[List[int]] = []
55
+ seen = 0
56
+ import time as _t
57
+
58
+ t0 = _t.time()
59
+ last = t0
60
+ for ex in ds:
61
+ if seen % shard.mod != shard.rem:
62
+ seen += 1
63
+ continue
64
+ seen += 1
65
+ if report_docs and seen % report_docs == 0:
66
+ dt = _t.time() - last
67
+ print(f"[fwe] seen_docs={seen} dt={dt:.1f}s buf={len(buf)}", flush=True)
68
+ last = _t.time()
69
+ text = ex.get("text") or ""
70
+ if not text:
71
+ continue
72
+ toks = encode(text)
73
+ if not toks:
74
+ continue
75
+ buf.extend(toks)
76
+ while len(buf) >= seq_len:
77
+ seq = buf[:seq_len]
78
+ buf = buf[seq_len:]
79
+ batch.append(seq)
80
+ if len(batch) >= batch_size:
81
+ yield batch[:batch_size]
82
+ batch = batch[batch_size:]
83
+
84
+
85
+ def fineweb_stream_batches_batched(
86
+ encode_batch: Callable[[List[str]], List[List[int]]],
87
+ seq_len: int,
88
+ batch_size: int,
89
+ shard: Shard = Shard(),
90
+ report_docs: int = 1000,
91
+ doc_batch: int = 64,
92
+ ) -> Iterator[List[List[int]]]:
93
+ """Streaming FineWeb‑Edu with batched tokenization and fixed-length packing.
94
+
95
+ - encode_batch: function mapping a list of texts -> list of token id lists
96
+ - Packs contiguous tokens from a rolling buffer into fixed seq_len examples
97
+ - Yields Python lists of shape [batch_size][seq_len]
98
+ """
99
+ try:
100
+ from datasets import load_dataset, Features, Value # type: ignore
101
+ except Exception as e:
102
+ raise RuntimeError("datasets package required. Install with: pip install datasets") from e
103
+
104
+ features = Features(
105
+ {
106
+ "text": Value("string"),
107
+ "id": Value("string"),
108
+ "dump": Value("string"),
109
+ "url": Value("string"),
110
+ "file_path": Value("string"),
111
+ "language": Value("string"),
112
+ "language_score": Value("float64"),
113
+ "token_count": Value("int64"),
114
+ "score": Value("float64"),
115
+ "int_score": Value("int64"),
116
+ }
117
+ )
118
+ ds = load_dataset("HuggingFaceFW/fineweb-edu", split="train", streaming=True, features=features)
119
+
120
+ buf: List[int] = []
121
+ batch: List[List[int]] = []
122
+ seen = 0
123
+ acc_texts: List[str] = []
124
+ import time as _t
125
+ last = _t.time()
126
+ for ex in ds:
127
+ if seen % shard.mod != shard.rem:
128
+ seen += 1
129
+ continue
130
+ seen += 1
131
+ if report_docs and seen % report_docs == 0:
132
+ dt = _t.time() - last
133
+ print(f"[fwe] (batched) seen_docs={seen} dt={dt:.1f}s buf={len(buf)} acc_texts={len(acc_texts)}", flush=True)
134
+ last = _t.time()
135
+ text = ex.get("text") or ""
136
+ if not text:
137
+ continue
138
+ acc_texts.append(text)
139
+ if len(acc_texts) < max(1, int(doc_batch)):
140
+ continue
141
+ # Batched tokenize
142
+ try:
143
+ toks_list = encode_batch(acc_texts)
144
+ except Exception:
145
+ # Fallback to per-doc encode if batch path fails
146
+ toks_list = []
147
+ for t in acc_texts:
148
+ try:
149
+ toks_list.append(encode_batch([t])[0])
150
+ except Exception:
151
+ toks_list.append([])
152
+ acc_texts.clear()
153
+ # Fill rolling buffer and output fixed-length sequences
154
+ for toks in toks_list:
155
+ if not toks:
156
+ continue
157
+ buf.extend(toks)
158
+ while len(buf) >= seq_len:
159
+ seq = buf[:seq_len]
160
+ buf = buf[seq_len:]
161
+ batch.append(seq)
162
+ if len(batch) >= batch_size:
163
+ yield batch[:batch_size]
164
+ batch = batch[batch_size:]
165
+
166
+
167
+ def local_jsonl_or_txt_batches(
168
+ path: str,
169
+ encode: Tokenizer,
170
+ seq_len: int,
171
+ batch_size: int,
172
+ ) -> Iterator[List[List[int]]]:
173
+ is_jsonl = path.endswith(".jsonl")
174
+ buf: List[int] = []
175
+ batch: List[List[int]] = []
176
+ with open(path, encoding="utf-8", errors="ignore") as fh:
177
+ for line in fh:
178
+ line = line.strip()
179
+ if not line:
180
+ continue
181
+ text = line
182
+ if is_jsonl:
183
+ try:
184
+ obj = json.loads(line)
185
+ if isinstance(obj, dict) and isinstance(obj.get("text"), str):
186
+ text = obj["text"]
187
+ except Exception:
188
+ pass
189
+ toks = encode(text)
190
+ if not toks:
191
+ continue
192
+ buf.extend(toks)
193
+ while len(buf) >= seq_len:
194
+ seq = buf[:seq_len]
195
+ buf = buf[seq_len:]
196
+ batch.append(seq)
197
+ if len(batch) >= batch_size:
198
+ yield batch[:batch_size]
199
+ batch = batch[batch_size:]
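A hedged usage sketch for the local loader, assuming a plain-text file and a byte-level encoder consistent with this repo's tokenizer (the `byte_encode` helper is illustrative, not part of the commit):

```python
from nsa.data_pipeline import local_jsonl_or_txt_batches

def byte_encode(text: str) -> list:
    return list(text.encode("utf-8"))        # raw UTF-8 byte ids, vocab=256

with open("tiny.txt", "w", encoding="utf-8") as fh:
    fh.write("hello world\n" * 200)

for batch in local_jsonl_or_txt_batches("tiny.txt", byte_encode, seq_len=128, batch_size=4):
    print(len(batch), len(batch[0]))         # 4 sequences of 128 byte ids
    break
```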
nsa/kernels/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from __future__ import annotations
nsa/kernels/flash_wrappers.py ADDED
@@ -0,0 +1,228 @@
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from nsa.core.debug import log
7
+
8
+
9
+ def _env_bool(name: str, default: bool = False) -> bool:
10
+ v = __import__("os").getenv(name, "1" if default else "0").lower()
11
+ return v in ("1", "true", "yes", "on")
12
+
13
+
14
+ def flash_attn_version() -> str | None:
15
+ """Return flash-attn version string if importable, else None."""
16
+ try:
17
+ import flash_attn as _fa # type: ignore
18
+
19
+ return getattr(_fa, "__version__", None)
20
+ except Exception:
21
+ return None
22
+
23
+
24
+ def is_flash_available() -> bool:
25
+ """Return True if flash-attn dense API is importable."""
26
+ try:
27
+ from flash_attn import flash_attn_func # type: ignore
28
+
29
+ _ = flash_attn_func # silence linter
30
+ return True
31
+ except Exception:
32
+ return False
33
+
34
+
35
+ def is_flash_varlen_available() -> bool:
36
+ """Return True if a varlen API is importable (either QKV or KV-packed)."""
37
+ try:
38
+ from flash_attn import flash_attn_varlen_func # type: ignore
39
+
40
+ _ = flash_attn_varlen_func
41
+ return True
42
+ except Exception:
43
+ try:
44
+ from flash_attn import flash_attn_varlen_kvpacked_func # type: ignore
45
+
46
+ _ = flash_attn_varlen_kvpacked_func
47
+ return True
48
+ except Exception:
49
+ return False
50
+
51
+
52
+ def fa2_supported_verbose(
53
+ device: torch.device, dtype: torch.dtype, head_dim: int
54
+ ) -> tuple[bool, str]:
55
+ """
56
+ Conservative capability probe with a reason string for logging.
57
+ We do not hard-fail on dtype, relying on try/except at call sites.
58
+ """
59
+ if device.type != "cuda":
60
+ return False, "device_not_cuda"
61
+ if head_dim % 8 != 0:
62
+ return False, "head_dim_not_multiple_of_8"
63
+ if not (is_flash_varlen_available() or is_flash_available()):
64
+ return False, "flash_attn_not_importable"
65
+ # Optional version floor (best-effort)
66
+ ver = flash_attn_version()
67
+ if ver is None:
68
+ # Unknown version; still allow
69
+ return True, "ok"
70
+ # Allow all known versions; attach for logs
71
+ return True, f"ok_v{ver}"
72
+
73
+
74
+ def fa2_supported(device: torch.device, dtype: torch.dtype, head_dim: int) -> bool:
75
+ ok, _ = fa2_supported_verbose(device, dtype, head_dim)
76
+ return ok
77
+
78
+
79
+ def attention_bgh(
80
+ Q: torch.Tensor, K: torch.Tensor, V: torch.Tensor, causal: bool = True
81
+ ) -> torch.Tensor:
82
+ """
83
+ Q: [B,G,h,Dk], K/V: [B,G,S,D*] -> out [B,G,h,Dv]
84
+ Prefer flash-attn if available; fallback to SDPA.
85
+ """
86
+ B, G, h, Dk = Q.shape
87
+ S = K.shape[2]
88
+ # Try FA-2 dense path first
89
+ if is_flash_available():
90
+ try:
91
+ from flash_attn import flash_attn_func # type: ignore
92
+
93
+ # Reshape without materializing copies
94
+ q = Q.transpose(1, 2).reshape(B, G * h, 1, Dk) # [B,G*h,1,Dk]
95
+ k = K.unsqueeze(2).expand(B, G, h, S, Dk).reshape(B, G * h, S, Dk) # [B,G*h,S,Dk]
96
+ v = (
97
+ V.unsqueeze(2).expand(B, G, h, S, V.shape[-1]).reshape(B, G * h, S, V.shape[-1])
98
+ ) # [B,G*h,S,Dv]
99
+ if _env_bool("NSA_DEBUG_TIMING"):
100
+ log("fa2.bgh.path", path="fa2.dense", B=B, G=G, h=h, S=S, Dk=Dk)
101
+ o = flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=causal)
102
+ o = o.reshape(B, G, h, -1)
103
+ if not torch.isfinite(o).all():
104
+ log("warn.flash_bgh_nonfinite", path="fa2.dense")
105
+ return torch.nan_to_num(o, nan=0.0)
106
+ except Exception:
107
+ pass
108
+ # SDPA fallback
109
+ if _env_bool("NSA_DEBUG_TIMING"):
110
+ log("fa2.bgh.path", path="sdpa", B=B, G=G, h=h, S=S, Dk=Dk)
111
+ # Expand heads via view/expand to avoid materializing copies
112
+ q2 = Q.reshape(B * G * h, 1, Dk).contiguous()
113
+ k2 = K.unsqueeze(2).expand(B, G, h, S, Dk).reshape(B * G * h, S, Dk).contiguous()
114
+ v2 = (
115
+ V.unsqueeze(2)
116
+ .expand(B, G, h, S, V.shape[-1])
117
+ .reshape(B * G * h, S, V.shape[-1])
118
+ .contiguous()
119
+ )
120
+ attn = F.scaled_dot_product_attention(q2, k2, v2, is_causal=causal)
121
+ o = attn.squeeze(1).reshape(B, G, h, -1)
122
+ return torch.nan_to_num(o, nan=0.0)
123
+
124
+
125
+ def attention_fa2_dense_batch(
126
+ q: torch.Tensor,
127
+ k: torch.Tensor,
128
+ v: torch.Tensor,
129
+ *,
130
+ causal: bool,
131
+ ) -> torch.Tensor:
132
+ """
133
+ Best-effort dense FA-2 call for a batch of independent rows.
134
+ Shapes:
135
+ - q: [N, Tq, h, D]
136
+ - k: [N, Tk, h, D]
137
+ - v: [N, Tk, h, Dv]
138
+ Returns: o [N, Tq, h, Dv]
139
+ Falls back to SDPA if flash-attn unavailable.
140
+ """
141
+ # Ensure contiguous tensors for FA-2
142
+ q = q.contiguous()
143
+ k = k.contiguous()
144
+ v = v.contiguous()
145
+ try:
146
+ from flash_attn import flash_attn_func # type: ignore
147
+
148
+ if _env_bool("NSA_DEBUG_TIMING"):
149
+ log(
150
+ "fa2.batch.path",
151
+ path="fa2.dense",
152
+ N=int(q.shape[0]),
153
+ Tq=int(q.shape[1]),
154
+ Tk=int(k.shape[1]),
155
+ )
156
+ return flash_attn_func(q, k, v, dropout_p=0.0, softmax_scale=None, causal=causal)
157
+ except Exception:
158
+ # SDPA fallback per row
159
+ N, Tq, h, D = q.shape
160
+ Tk = k.shape[1]
161
+ Dv = v.shape[-1]
162
+ if _env_bool("NSA_DEBUG_TIMING"):
163
+ log("fa2.batch.path", path="sdpa", N=int(N), Tq=int(Tq), Tk=int(Tk))
164
+ q2 = q.reshape(N * h, Tq, D)
165
+ k2 = k.reshape(N * h, Tk, D)
166
+ v2 = v.reshape(N * h, Tk, Dv)
167
+ out = F.scaled_dot_product_attention(q2, k2, v2, is_causal=causal)
168
+ return out.reshape(N, h, Tq, Dv).permute(0, 2, 1, 3).contiguous()
169
+
170
+
171
+ def attention_fa2_varlen(
172
+ q: torch.Tensor,
173
+ k: torch.Tensor,
174
+ v: torch.Tensor,
175
+ cu_seqlens_q: torch.Tensor,
176
+ cu_seqlens_k: torch.Tensor,
177
+ max_seqlen_q: int,
178
+ max_seqlen_k: int,
179
+ *,
180
+ causal: bool,
181
+ ) -> torch.Tensor:
182
+ """
183
+ Best-effort varlen FA-2 call with separate Q/K/V packing.
184
+ Shapes:
185
+ - q: [total_q, h, D], k: [total_k, h, D], v: [total_k, h, Dv]
186
+ - cu_seqlens_*: int32 [N+1]
187
+ Returns: [total_q, h, Dv] packed output.
188
+ Falls back to dense batching by padding per bucket if varlen API unavailable.
189
+ """
190
+ # Ensure contiguous tensors for FA-2
191
+ q = q.contiguous()
192
+ k = k.contiguous()
193
+ v = v.contiguous()
194
+ try:
195
+ from flash_attn import flash_attn_varlen_func # type: ignore
196
+
197
+ return flash_attn_varlen_func(
198
+ q,
199
+ k,
200
+ v,
201
+ cu_seqlens_q,
202
+ cu_seqlens_k,
203
+ max_seqlen_q,
204
+ max_seqlen_k,
205
+ dropout_p=0.0,
206
+ softmax_scale=None,
207
+ causal=causal,
208
+ )
209
+ except Exception:
210
+ # Try KV-packed API variant
211
+ try:
212
+ from flash_attn import flash_attn_varlen_kvpacked_func # type: ignore
213
+
214
+ # Build KV packed as [total_k, 2, h, D]
215
+ kv_packed = torch.stack([k, v], dim=1).contiguous()
216
+ return flash_attn_varlen_kvpacked_func(
217
+ q,
218
+ kv_packed,
219
+ cu_seqlens_q,
220
+ cu_seqlens_k,
221
+ max_seqlen_q,
222
+ max_seqlen_k,
223
+ dropout_p=0.0,
224
+ softmax_scale=None,
225
+ causal=causal,
226
+ )
227
+ except Exception:
228
+ raise NotImplementedError("FA-2 varlen API not available; caller should fallback")
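A minimal smoke test for the wrappers above (a sketch, not part of the repo). It assumes the functions are importable from this module — the commented import path is a placeholder — and that CUDA/flash-attn are absent, so `fa2_supported` reports False and `attention_bgh` exercises the SDPA fallback:

```python
import torch

# Placeholder import path; adjust to wherever this module lives in the package.
# from nsa.kernels.flash_wrappers import attention_bgh, fa2_supported

B, G, h, S, Dk, Dv = 2, 2, 6, 128, 64, 64    # mirrors the 117M layout: 12 heads in 2 KV groups
Q = torch.randn(B, G, h, Dk)                 # one query step per (group, head)
K = torch.randn(B, G, S, Dk)                 # keys shared within each GQA group
V = torch.randn(B, G, S, Dv)

print(fa2_supported(Q.device, Q.dtype, Dk))  # False on CPU -> SDPA fallback is taken
out = attention_bgh(Q, K, V, causal=True)
print(out.shape)                             # torch.Size([2, 2, 6, 64])
```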
nsa/model/__init__.py ADDED
@@ -0,0 +1 @@
+ from __future__ import annotations
nsa/model/llama_block_nsa.py ADDED
@@ -0,0 +1,129 @@
+ from __future__ import annotations
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from nsa.cache.kv_cache import NSA_KV
+ from nsa.core.block_index import build_block_meta
+ from nsa.core.nsa_attention import NSAAttention
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6) -> None:
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(dim))
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # x: [B,S,dim]
+         rms = x.pow(2).mean(dim=-1, keepdim=True).add(self.eps).rsqrt()
+         return (x * rms) * self.weight
+
+
+ class MLP(nn.Module):
+     def __init__(self, dim: int, hidden_mult: int = 4) -> None:
+         super().__init__()
+         h = hidden_mult * dim
+         self.fc1 = nn.Linear(dim, h, bias=False)
+         self.fc2 = nn.Linear(h, dim, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         return self.fc2(F.silu(self.fc1(x)))
+
+
+ class LlamaBlockNSA(nn.Module):
+     def __init__(
+         self,
+         dim: int,
+         n_heads: int,
+         n_kv_groups: int,
+         d_k: int,
+         d_v: int,
+         l: int = 32,
+         d: int = 16,
+         l_sel: int = 64,
+         n_sel: int = 16,
+         w: int = 512,
+     ) -> None:
+         super().__init__()
+         self.norm1 = RMSNorm(dim)
+         self.attn = NSAAttention(
+             dim=dim,
+             n_heads=n_heads,
+             n_kv_groups=n_kv_groups,
+             d_k=d_k,
+             d_v=d_v,
+             l=l,
+             d=d,
+             l_sel=l_sel,
+             n_sel=n_sel,
+             w=w,
+         )
+         self.norm2 = RMSNorm(dim)
+         self.mlp = MLP(dim)
+
+     def _build_empty_kv(self, x: torch.Tensor) -> NSA_KV:
+         B, S, dim = x.shape
+         device = x.device
+         G = self.attn.n_kv_groups
+         Dk = self.attn.d_k
+         Dv = self.attn.d_v
+         zeros_k = torch.zeros((B, G, 0, Dk), device=device, dtype=x.dtype)
+         zeros_v = torch.zeros((B, G, 0, Dv), device=device, dtype=x.dtype)
+         meta = build_block_meta(
+             seq_len=0,
+             l=self.attn.l,
+             d=self.attn.d,
+             l_sel=self.attn.l_sel,
+             n_sel=self.attn.n_sel,
+             w=self.attn.w,
+         )
+         return NSA_KV(
+             K_sel=zeros_k.clone(),
+             V_sel=zeros_v.clone(),
+             K_win=zeros_k.clone(),
+             V_win=zeros_v.clone(),
+             K_cmp_raw_seq=zeros_k.clone(),
+             V_cmp_raw_seq=zeros_v.clone(),
+             K_cmp=zeros_k.clone(),
+             V_cmp=zeros_v.clone(),
+             win_ptr=torch.zeros((B, G), dtype=torch.int64, device=device),
+             cmp_emit_next=torch.zeros((B, G), dtype=torch.int64, device=device),
+             meta=meta,
+             reads_pred=torch.zeros((0,), dtype=torch.int64, device=device),
+             reads_act_total=torch.zeros((0,), dtype=torch.int64, device=device),
+             reads_act_sel=torch.zeros((0,), dtype=torch.int64, device=device),
+             reads_act_cmp=torch.zeros((0,), dtype=torch.int64, device=device),
+             reads_act_win=torch.zeros((0,), dtype=torch.int64, device=device),
+         )
+
+     def forward_attn(self, x: torch.Tensor) -> torch.Tensor:
+         """Attention sub-layer with residual.
+
+         Exposed to allow gradient-checkpoint splits that exclude attention from
+         checkpointing when dynamic routing could cause recompute mismatches.
+         """
+         B, S, dim = x.shape
+         res = x
+         xn = self.norm1(x)
+         kv = self._build_empty_kv(x)
+         out, _kv = self.attn(xn, kv=kv, prefill=True)
+         return res + out
+
+     def forward_mlp(self, x: torch.Tensor) -> torch.Tensor:
+         """MLP sub-layer with residual.
+
+         Can be safely checkpointed independently from attention.
+         """
+         res = x
+         return res + self.mlp(self.norm2(x))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         # Default monolithic forward preserves prior behavior
+         x = self.forward_attn(x)
+         x = self.forward_mlp(x)
+         return x
+
+
+ class _EmptyKVLike:
+     pass
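A quick shape check for the block above, using the 117M defaults (dim=768, 12 heads, 2 KV groups, d_k=d_v=64). This is only a sketch: it assumes the `nsa` package (NSAAttention, NSA_KV, build_block_meta) is installed, and note that the block rebuilds an empty NSA_KV on every forward, i.e. it always runs in prefill mode with no cache carried across calls:

```python
import torch
from nsa.model.llama_block_nsa import LlamaBlockNSA  # requires the nsa package

block = LlamaBlockNSA(dim=768, n_heads=12, n_kv_groups=2, d_k=64, d_v=64)  # l/d/l_sel/n_sel/w keep defaults
x = torch.randn(1, 128, 768)   # [B, S, dim]
y = block(x)                   # attention + residual, then MLP + residual
assert y.shape == x.shape

# Sub-layers can also be called separately, e.g. to checkpoint only the MLP:
y2 = block.forward_mlp(block.forward_attn(x))
```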
special_tokens_map.json ADDED
@@ -0,0 +1 @@
+ {}
tokenization_nsa.py ADDED
@@ -0,0 +1,73 @@
+ # Remote code: byte-level tokenizer for NSA
+ from typing import List, Optional, Dict
+ import json
+ from transformers import PreTrainedTokenizer
+
+
+ class NSAByteTokenizer(PreTrainedTokenizer):
+     """A simple byte-level tokenizer with fixed vocab size 256.
+
+     - Encodes UTF-8 bytes of the input string as token ids 0..255.
+     - No special tokens by default; EOS/PAD can be configured via special tokens map.
+     - Decoding uses UTF-8 with replacement for invalid sequences.
+     """
+
+     def __init__(self, **kwargs):
+         # Build a stable 256-entry vocab mapping before base init (base may query the vocab)
+         self._vocab: Dict[str, int] = {f"<{i}>": i for i in range(256)}
+         self._ids_to_tokens: Dict[int, str] = {i: f"<{i}>" for i in range(256)}
+         super().__init__(**kwargs)
+         # Only return input_ids and attention_mask to avoid unused token_type_ids in generation
+         self.model_input_names = ["input_ids", "attention_mask"]
+
+     @property
+     def vocab_size(self) -> int:  # type: ignore[override]
+         return 256
+
+     def get_vocab(self) -> Dict[str, int]:  # type: ignore[override]
+         return dict(self._vocab)
+
+     def _tokenize(self, text: str) -> List[str]:  # type: ignore[override]
+         data = text.encode("utf-8", errors="replace")
+         return [f"<{b}>" for b in data]
+
+     def _convert_token_to_id(self, token: str) -> int:  # type: ignore[override]
+         if token in self._vocab:
+             return self._vocab[token]
+         # Fallback: try to parse the numeric value inside <..>
+         if token.startswith("<") and token.endswith(">"):
+             try:
+                 v = int(token[1:-1])
+                 if 0 <= v < 256:
+                     return v
+             except Exception:
+                 pass
+         return 0
+
+     def _convert_id_to_token(self, index: int) -> str:  # type: ignore[override]
+         return self._ids_to_tokens.get(int(index) % 256, "<0>")
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:  # type: ignore[override]
+         bs = []
+         for t in tokens:
+             if t in self._vocab:
+                 bs.append(self._vocab[t])
+             else:
+                 try:
+                     if t.startswith("<") and t.endswith(">"):
+                         v = int(t[1:-1])
+                         if 0 <= v < 256:
+                             bs.append(v)
+                             continue
+                 except Exception:
+                     pass
+         return bytes(bs).decode("utf-8", errors="replace")
+
+     def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None) -> List[int]:  # type: ignore[override]
+         if token_ids_1 is None:
+             return token_ids_0
+         return token_ids_0 + token_ids_1
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None):  # type: ignore[override]
+         # Nothing to save besides the special tokens map handled by the base class.
+         return ()
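A round-trip sketch for the byte tokenizer, instantiated directly rather than through `AutoTokenizer` (either works once the repo files are present); exact behavior of the base class can vary slightly across transformers versions:

```python
# from tokenization_nsa import NSAByteTokenizer  # or load via AutoTokenizer with trust_remote_code=True

tok = NSAByteTokenizer()
ids = tok("héllo")["input_ids"]
print(ids)              # UTF-8 bytes: [104, 195, 169, 108, 108, 111]
print(tok.decode(ids))  # "héllo"
print(tok.vocab_size)   # 256
```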
tokenizer_config.json ADDED
@@ -0,0 +1,11 @@
+ {
+   "tokenizer_class": "NSAByteTokenizer",
+   "model_max_length": 2048,
+   "chat_template": "{% for m in messages %}{% if m['role']=='user' %}<|user|>{{ m['content'] }}\n{% elif m['role']=='assistant' %}<|assistant|>{{ m['content'] }}\n{% endif %}{% endfor %}<|assistant|>",
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_nsa.NSAByteTokenizer",
+       null
+     ]
+   }
+ }
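For reference, this is roughly how the `chat_template` above renders (a sketch; `tok` is any tokenizer instance loaded from this repo). Note that `<|user|>` / `<|assistant|>` are not special tokens in the 256-byte vocab, so they are encoded as plain UTF-8 bytes:

```python
messages = [
    {"role": "user", "content": "Hi"},
    {"role": "assistant", "content": "Hello!"},
    {"role": "user", "content": "Tell me about NSA"},
]
text = tok.apply_chat_template(messages, tokenize=False)
# -> "<|user|>Hi\n<|assistant|>Hello!\n<|user|>Tell me about NSA\n<|assistant|>"
```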