KitsuVp
/

NeoLLM

@@ -1556,8 +1556,8 @@ def eager_attention_forward(
     attn_weights = nn.functional.softmax(
         attn_weights, dim=-1, dtype=torch.float32
     ).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
     attn_output  = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
     return attn_output, attn_weights
@@ -1622,10 +1622,8 @@ def affine_scaled_eager_attention_forward(
     if attn_analysis is not None:
         attn_analysis.attn_weights_post_affine = attn_weights_affine.detach()
-    attn_weights_affine = nn.functional.dropout(
-        attn_weights_affine, p=dropout, training=module.training
-    )
     attn_output  = torch.matmul(attn_weights_affine, value_states).transpose(1, 2).contiguous()
     return attn_output, attn_weights_affine
@@ -1729,6 +1727,7 @@ def affine_scaled_flash_attention_forward(
     # ── Combine and apply dropout to the full affine output ───────────────
     output = alpha_t * flash_out + beta_t * v_cumsum_t           # [B, S, H_q, d_head]
     # attn_weights is None — flash never exposes the softmax weight matrix.
     return output, None
@@ -2471,8 +2470,9 @@ class NeoLLMAttention(nn.Module):
                     **kwargs,
                 )
         else:
-            attn_fn = eager_attention_forward
-            if self.config._attn_implementation != "eager":
                 attn_fn = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
             attn_out, attn_weights = attn_fn(
                 self, q, k, v, attention_mask,

     attn_weights = nn.functional.softmax(
         attn_weights, dim=-1, dtype=torch.float32
     ).to(query.dtype)
     attn_output  = torch.matmul(attn_weights, value_states).transpose(1, 2).contiguous()
+    attn_output  = nn.functional.dropout(attn_output, p=dropout, training=module.training)
     return attn_output, attn_weights
     if attn_analysis is not None:
         attn_analysis.attn_weights_post_affine = attn_weights_affine.detach()
     attn_output  = torch.matmul(attn_weights_affine, value_states).transpose(1, 2).contiguous()
+    attn_output  = nn.functional.dropout(attn_output, p=dropout, training=module.training)
     return attn_output, attn_weights_affine
     # ── Combine and apply dropout to the full affine output ───────────────
     output = alpha_t * flash_out + beta_t * v_cumsum_t           # [B, S, H_q, d_head]
+    output = nn.functional.dropout(output, p=dropout, training=module.training)
     # attn_weights is None — flash never exposes the softmax weight matrix.
     return output, None
                     **kwargs,
                 )
         else:
+            if self.config._attn_implementation == "eager":
+                attn_fn = eager_attention_forward
+            else:
                 attn_fn = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
             attn_out, attn_weights = attn_fn(
                 self, q, k, v, attention_mask,