Update implementation

selective_vit.py  (+8, -9)
@@ -63,7 +63,7 @@ class SoftMaskedMultiheadAttention(nn.Module):
         nn.init.constant_(self.out_proj.bias, 0.)
 
 
-    def
+    def eager_forward(self, query, key, value, key_padding_mask=None,
                       attn_mask=None, average_attn_weights=True):
         batch_size, tgt_len, embed_dim = query.size()
         batch_size, src_len, _ = key.size()
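The renamed eager_forward is the explicit (unfused) attention path, as opposed to the fused flash_forward. A minimal, self-contained sketch of what an eager multi-head attention forward of this shape computes, assuming batch-first (B, N, C) tensors and an additive attention mask; the class and helper names below are illustrative, not the file's actual implementation:

import torch
import torch.nn as nn
import torch.nn.functional as F

class EagerMHA(nn.Module):
    # Illustrative sketch only, not SoftMaskedMultiheadAttention from this file.
    def __init__(self, embed_dim, num_heads):
        super().__init__()
        assert embed_dim % num_heads == 0
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.qkv_proj = nn.Linear(embed_dim, 3 * embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)

    def eager_forward(self, query, key, value, attn_mask=None):
        # Self-attention case (query is key is value), shape (B, N, C).
        B, N, C = query.shape
        qkv = self.qkv_proj(query).reshape(B, N, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.permute(2, 0, 3, 1, 4)            # each (B, H, N, head_dim)
        attn = (q @ k.transpose(-2, -1)) / self.head_dim ** 0.5
        if attn_mask is not None:                       # additive mask, broadcast over heads
            attn = attn + attn_mask
        attn = F.softmax(attn, dim=-1)
        out = (attn @ v).transpose(1, 2).reshape(B, N, C)
        return self.out_proj(out)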
@@ -191,9 +191,9 @@ class SoftMaskedMultiheadAttention(nn.Module):
 
         return out
 
-    def forward(self, query, key, value, method="
-        if method == '
-            out = self.
+    def forward(self, query, key, value, method="eager", **kwargs):
+        if method == 'eager':
+            out = self.eager_forward(query, key, value, **kwargs)
         elif method == "fa":
             out = self.flash_forward(query, key, value, **kwargs)
         else:
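The updated dispatcher keys the implementation on a string and forwards any extra keyword arguments to the selected path; both paths compute the same softmax(QK^T / sqrt(d))V, the "fa" branch just uses a fused kernel. A small self-contained sketch of the pattern; the error handling in the final branch is an assumption, since the hunk cuts off before it:

import torch
import torch.nn as nn
import torch.nn.functional as F

class DispatchingAttention(nn.Module):
    # Illustrative single-head sketch of the method-keyed dispatch.
    def eager_forward(self, query, key, value, **kwargs):
        scale = query.size(-1) ** -0.5
        attn = torch.softmax(query @ key.transpose(-2, -1) * scale, dim=-1)
        return attn @ value

    def flash_forward(self, query, key, value, **kwargs):
        # Fused scaled-dot-product attention (PyTorch >= 2.0).
        return F.scaled_dot_product_attention(query, key, value)

    def forward(self, query, key, value, method="eager", **kwargs):
        if method == "eager":
            return self.eager_forward(query, key, value, **kwargs)
        elif method == "fa":
            return self.flash_forward(query, key, value, **kwargs)
        raise ValueError(f"unknown attention method: {method!r}")

x = torch.randn(2, 197, 64)
attn = DispatchingAttention()
out_eager = attn(x, x, x, method="eager")   # explicit softmax path
out_flash = attn(x, x, x, method="fa")      # fused path; numerically equivalent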
@@ -260,7 +260,7 @@ class EncoderBlock(nn.Module):
         x = self.embed(x)
         x = self.norm1(x)
         # Apply attention mechanism
-        attn_output = self.self_attn(x, x, x, attn_mask=mask if not skip_masks else None, method="
+        attn_output = self.self_attn(x, x, x, attn_mask=mask if not skip_masks else None, method="eager")
         # Add & Norm
         x = x + self.path_drop(attn_output)
         x = self.norm2(x)
@@ -395,7 +395,7 @@ class EncoderBlock(nn.Module):
                 groups[-1][0].append(ii)
         return groups
 
-    def
+    def eager_forward(self, x, mask, full=False, skip_masks=False):
         # Step 1: Threshold the mask without in-place ops
         mask_thresholded = mask * (mask >= self.mask_threshold)
         # Step 2: Prepare output tensor (copy of x)
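EncoderBlock.eager_forward begins by zeroing mask entries below self.mask_threshold. Multiplying by the boolean comparison, rather than assigning into the tensor, avoids in-place modification so autograd can still differentiate through the surviving mask values. A short sketch; the threshold value and mask shape are assumptions:

import torch

mask = torch.rand(2, 197, requires_grad=True)   # per-token soft mask (shape assumed)
mask_threshold = 0.5                            # assumed stand-in for self.mask_threshold

# (mask >= mask_threshold) is a bool tensor; multiplying promotes it to float,
# zeroing sub-threshold entries without writing into `mask` in place.
mask_thresholded = mask * (mask >= mask_threshold)

mask_thresholded.sum().backward()               # gradients flow only through kept entries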
@@ -436,11 +436,11 @@ class EncoderBlock(nn.Module):
                 x = self.flash_forward(x, attn_mask, skip_masks)
             else:
                 warnings.warn(
-                    "Flash Attention requirements not met, falling back to
+                    "Flash Attention requirements not met, falling back to eager attention.",
                     category=UserWarning,
                     stacklevel=2,
                 )
-                x = self.
+                x = self.eager_forward(x, attn_mask, full, skip_masks)
         else:
             x = self.forward_common(x, attn_mask, skip_masks)
         return x, attn_mask
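When the Flash Attention path cannot be used, the block now warns and drops back to the renamed eager_forward. A hedged sketch of the guard-and-fallback pattern; the specific requirement checks (CUDA device, half precision, a PyTorch build with the fused kernel) are assumptions, not the file's actual conditions:

import warnings
import torch

def flash_requirements_met(x: torch.Tensor) -> bool:
    # Assumed checks; fused kernels typically need CUDA, half precision,
    # and a PyTorch build that provides scaled_dot_product_attention (>= 2.0).
    return (
        x.is_cuda
        and x.dtype in (torch.float16, torch.bfloat16)
        and hasattr(torch.nn.functional, "scaled_dot_product_attention")
    )

def run_block_attention(block, x, attn_mask, full=False, skip_masks=False):
    if flash_requirements_met(x):
        return block.flash_forward(x, attn_mask, skip_masks)
    warnings.warn(
        "Flash Attention requirements not met, falling back to eager attention.",
        category=UserWarning,
        stacklevel=2,
    )
    return block.eager_forward(x, attn_mask, full, skip_masks)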
@@ -589,7 +589,6 @@ class VisionTransformer(nn.Module):
         dis_cls_token = hidden_states[:, 1]
         dis_logits = self.dis_head(dis_cls_token)
 
-        # Inference-time averaging (same as original)
         if not self.training:
             logits = (logits + dis_logits) / 2
 
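The final hunk only removes a comment; the behaviour is unchanged: during training the classification and distillation heads are supervised separately, and at inference their logits are averaged, DeiT-style. A minimal sketch of that head pairing; module names and dimensions are illustrative, not the file's actual VisionTransformer:

import torch
import torch.nn as nn

class TwoHeadClassifier(nn.Module):
    # Illustrative DeiT-style head pair.
    def __init__(self, embed_dim=768, num_classes=1000):
        super().__init__()
        self.head = nn.Linear(embed_dim, num_classes)       # fed by the class token
        self.dis_head = nn.Linear(embed_dim, num_classes)   # fed by the distillation token

    def forward(self, cls_token, dis_cls_token):
        logits = self.head(cls_token)
        dis_logits = self.dis_head(dis_cls_token)
        if not self.training:
            # Inference-time averaging of the two heads.
            logits = (logits + dis_logits) / 2
        return logits, dis_logits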