XAFT
/

SM-Selective-ViT-Small-224

@@ -63,7 +63,7 @@ class SoftMaskedMultiheadAttention(nn.Module):
             nn.init.constant_(self.out_proj.bias, 0.)
-    def naive_forward(self, query, key, value, key_padding_mask=None,
             attn_mask=None, average_attn_weights=True):
         batch_size, tgt_len, embed_dim = query.size()
         batch_size, src_len, _ = key.size()
@@ -120,7 +120,6 @@ class SoftMaskedMultiheadAttention(nn.Module):
         cu_seq_q, cu_seq_k,
         max_q, max_k,
         attn_mask=None,
-        is_causal=False,
     ):
         """
         FlashAttention-compatible soft-masked attention using varlen_attn
@@ -182,7 +181,6 @@ class SoftMaskedMultiheadAttention(nn.Module):
             cu_seq_k=cu_seq_k,
             max_q=max_q,
             max_k=max_k,
-            is_causal=is_causal,
             scale=scale_attn,
         )
@@ -193,9 +191,9 @@ class SoftMaskedMultiheadAttention(nn.Module):
         return out
-    def forward(self, query, key, value, method="naive", **kwargs):
-        if method == 'naive':
-            out = self.naive_forward(query, key, value, **kwargs)
         elif method == "fa":
             out = self.flash_forward(query, key, value, **kwargs)
         else:
@@ -262,7 +260,7 @@ class EncoderBlock(nn.Module):
         x = self.embed(x)
         x = self.norm1(x)
         # Apply attention mechanism
-        attn_output = self.self_attn(x, x, x, attn_mask=mask if not skip_masks else None, method="naive")
         # Add & Norm
         x = x + self.path_drop(attn_output)
         x = self.norm2(x)
@@ -277,42 +275,111 @@ class EncoderBlock(nn.Module):
         return x
     def flash_forward(self, x, mask, skip_masks=False):  # previous implementation: pack kept tokens with boolean indexing, run the block, add the result back
         binary_mask = mask >= self.mask_threshold  # True where a token's mask value reaches the threshold (token is kept)
-        sel_mask = mask[binary_mask]  # mask values of the kept tokens, packed across the batch
-        seq_lengths = binary_mask.sum(1)  # number of kept tokens per sample
-        cum_lengths = torch.zeros(binary_mask.shape[0]+1, dtype=torch.int, device=binary_mask.device)  # leading 0 followed by one boundary per sample
-        cum_lengths[1:] = seq_lengths.cumsum(-1)  # running totals of the kept-token counts
-        max_len = seq_lengths.amax()  # largest kept-token count in the batch (still a tensor here)
-        x1 = x  # keep the unpacked input for the residual at the end
-        x = x[binary_mask]  # packed kept tokens
-        x = self.embed(x)
-        x = self.norm1(x)
-        # Attention over the packed tokens via the "fa" method of self_attn
         attn_output = self.self_attn(
-                x, x, x,
-                cu_seq_q=cum_lengths,
-                cu_seq_k=cum_lengths,
-                max_q=max_len,
-                max_k=max_len,
-                attn_mask=sel_mask if not skip_masks else None,
-                method="fa"
-                )
-        # Residual add around attention, then norm2
-        x = x + self.path_drop(attn_output)
-        x = self.norm2(x)
-        # Feed-forward network
-        mlp_output = self.mlp(x)
-        # Residual add around the MLP, projection, path drop, then norm3
-        x = self.path_drop(self.project(x + mlp_output))
-        x = self.norm3(x)
-        if mask is not None:  # mask was already indexed above, so it cannot be None at this point
-            x = x * sel_mask.unsqueeze(-1)  # weight each kept token's output by its mask value
-        x_out = x1.clone()  # copy so the caller's tensor is not modified
-        x_out[binary_mask] = x_out[binary_mask] + x  # add block output back at the kept positions only
         return x_out
     def get_groups(self, mask, full=False):
@@ -328,7 +395,7 @@ class EncoderBlock(nn.Module):
             groups[-1][0].append(ii)
         return groups
-    def naive_forward(self, x, mask, full=False, skip_masks=False):
         # Step 1: Threshold the mask without in-place ops
         mask_thresholded = mask * (mask >= self.mask_threshold)
         # Step 2: Prepare output tensor (copy of x)
@@ -369,11 +436,11 @@ class EncoderBlock(nn.Module):
                 x = self.flash_forward(x, attn_mask, skip_masks)
             else:
                 warnings.warn(
-                    "Flash Attention requirements not met, falling back to naive attention.",
                     category=UserWarning,
                     stacklevel=2,
                 )
-                x = self.naive_forward(x, attn_mask, full, skip_masks)
         else:
             x = self.forward_common(x, attn_mask, skip_masks)
         return x, attn_mask
@@ -522,7 +589,6 @@ class VisionTransformer(nn.Module):
             dis_cls_token = hidden_states[:, 1]
             dis_logits = self.dis_head(dis_cls_token)
-            # Inference-time averaging (same as original)
             if not self.training:
                 logits = (logits + dis_logits) / 2

             nn.init.constant_(self.out_proj.bias, 0.)
+    def eager_forward(self, query, key, value, key_padding_mask=None,
             attn_mask=None, average_attn_weights=True):
         batch_size, tgt_len, embed_dim = query.size()
         batch_size, src_len, _ = key.size()
         cu_seq_q, cu_seq_k,
         max_q, max_k,
         attn_mask=None,
     ):
         """
         FlashAttention-compatible soft-masked attention using varlen_attn
             cu_seq_k=cu_seq_k,
             max_q=max_q,
             max_k=max_k,
             scale=scale_attn,
         )
         return out
+    def forward(self, query, key, value, method="eager", **kwargs):
+        if method == 'eager':
+            out = self.eager_forward(query, key, value, **kwargs)
         elif method == "fa":
             out = self.flash_forward(query, key, value, **kwargs)
         else:
         x = self.embed(x)
         x = self.norm1(x)
         # Apply attention mechanism
+        attn_output = self.self_attn(x, x, x, attn_mask=mask if not skip_masks else None, method="eager")
         # Add & Norm
         x = x + self.path_drop(attn_output)
         x = self.norm2(x)
         return x
     def flash_forward(self, x, mask, skip_masks=False):
         """Run the block on packed tokens through the varlen ("fa") attention path.

         Tokens are packed into one flat sequence, pushed through
         embed -> norm1 -> attention -> norm2 -> mlp -> project -> norm3,
         optionally weighted by their mask value, and added back onto ``x``.

         One of three packing strategies is chosen from the per-sample number
         of tokens whose mask value reaches ``self.mask_threshold``:

         * dense   - RMS kept length above 90% of N: every token is processed.
         * regular - largest kept length above 32: exactly the kept tokens.
         * top-k   - otherwise: the k highest-mask tokens of each sample,
           with k the largest kept length in the batch.

         Args:
             x: token tensor of shape [B, N, C].
             mask: per-token mask values of shape [B, N].
             skip_masks: when True, no mask is passed to attention and the
                 block output is not weighted by the mask.

         Returns:
             Tensor of shape [B, N, C]: ``x`` plus the block output at the
             processed positions. ``x`` itself is returned when no token
             reaches the threshold.

         NOTE(review): with autograd disabled the scatter below writes through
         ``x.reshape(...)``, which presumably aliases a contiguous ``x``, so the
         input may be modified in place on that path - confirm callers do not
         reuse ``x`` afterwards.
         """
         B, N, C = x.shape
         x_res = x  # residual branch
         binary_mask = mask >= self.mask_threshold
         seq_lengths = binary_mask.sum(dim=1, dtype=torch.int32)
         max_len = seq_lengths.amax().item()

         # Nothing selected anywhere: the block contributes nothing.
         if max_len == 0:
             return x

         # Root-mean-square of the kept lengths; emphasises the longer samples.
         rms_len = seq_lengths.float().square().mean().sqrt().item()

         if rms_len / N > 0.90:
             # Dense path: almost everything is kept, so skip the gather and
             # process all B * N tokens as B sequences of length N.
             # NOTE: tokens below the threshold are processed here as well.
             x_sel = x.flatten(0, 1)
             flat_idx = None
             sel_mask = None if skip_masks else mask.flatten(0, 1)
             cu_seqlens = torch.arange(
                 0, (B + 1) * N, step=N, dtype=torch.int32, device=x.device
             )
             # Every packed sequence has length N on this path, so the maximum
             # handed to the varlen kernel must be N, not the thresholded max.
             max_len = N
         elif max_len > 32:
             # Regular selective path: pack exactly the kept tokens.
             idx = binary_mask.nonzero(as_tuple=False)
             b_idx = idx[:, 0]
             t_idx = idx[:, 1]
             flat_idx = b_idx * N + t_idx
             x_sel = x[b_idx, t_idx]
             sel_mask = None if skip_masks else mask[b_idx, t_idx]
             # Sequence boundaries of the packed layout: [0, len_0, len_0 + len_1, ...]
             cu_seqlens = torch.zeros(B + 1, dtype=torch.int32, device=binary_mask.device)
             cu_seqlens[1:] = seq_lengths.cumsum(-1)
         else:
             # Short kept lengths: take a fixed k tokens per sample via top-k so
             # the packed layout is regular, still using the varlen interface.
             # NOTE: samples with fewer than k kept tokens are padded with their
             # next-highest-mask tokens; those entries are not filtered out.
             k = max_len
             top_vals, top_idx = mask.topk(k, dim=1, largest=True, sorted=False)  # [B, k]
             b_idx = torch.arange(B, device=mask.device)[:, None].expand_as(top_idx)
             flat_idx = (b_idx * N + top_idx).reshape(-1)
             gather_idx = top_idx.unsqueeze(-1).expand(-1, -1, C)  # [B, k, C]
             x_sel = x.gather(1, gather_idx).flatten(0, 1)  # [B * k, C]
             sel_mask = None if skip_masks else top_vals.flatten(0, 1)
             cu_seqlens = torch.arange(
                 0, (B + 1) * k, step=k, dtype=torch.int32, device=x.device
             )

         # Block
         x_sel = self.embed(x_sel)
         x_sel = self.norm1(x_sel)
         attn_output = self.self_attn(
             x_sel, x_sel, x_sel,
             cu_seq_q=cu_seqlens,
             cu_seq_k=cu_seqlens,
             max_q=max_len,
             max_k=max_len,
             attn_mask=sel_mask,  # already None when skip_masks is set
             method="fa",
         )
         x_sel = x_sel + self.path_drop(attn_output)
         x_sel = self.norm2(x_sel)
         mlp_output = self.mlp(x_sel)
         x_sel = self.path_drop(self.project(x_sel + mlp_output))
         x_sel = self.norm3(x_sel)
         if sel_mask is not None:
             # Weight each processed token's output by its mask value.
             x_sel.mul_(sel_mask.unsqueeze(-1))

         # Dense path: the layouts already line up, a plain residual add suffices.
         if flat_idx is None:
             return x_res + x_sel.view(*x_res.shape)

         # Selective paths: scatter-add the packed outputs onto the residual.
         flat_out = x_res.reshape(B * N, C)
         if torch.is_grad_enabled():
             # Work on a copy while autograd is on so the input is left untouched.
             flat_out = flat_out.clone()
         flat_out.index_add_(0, flat_idx, x_sel)
         return flat_out.view(B, N, C)
     def get_groups(self, mask, full=False):
             groups[-1][0].append(ii)
         return groups
+    def eager_forward(self, x, mask, full=False, skip_masks=False):
         # Step 1: Threshold the mask without in-place ops
         mask_thresholded = mask * (mask >= self.mask_threshold)
         # Step 2: Prepare output tensor (copy of x)
                 x = self.flash_forward(x, attn_mask, skip_masks)
             else:
                 warnings.warn(
+                    "Flash Attention requirements not met, falling back to eager attention.",
                     category=UserWarning,
                     stacklevel=2,
                 )
+                x = self.eager_forward(x, attn_mask, full, skip_masks)
         else:
             x = self.forward_common(x, attn_mask, skip_masks)
         return x, attn_mask
             dis_cls_token = hidden_states[:, 1]
             dis_logits = self.dis_head(dis_cls_token)
             if not self.training:
                 logits = (logits + dis_logits) / 2