Update modeling_rcps.py

Add a `prenorm` argument (default `True`) to `RCPSAddNormWrapper.forward`; passing `prenorm=False` prevents the residual from being returned, so only the output tensor is returned.

Files changed (1) hide show

modeling_rcps.py CHANGED Viewed

@@ -101,11 +101,12 @@ class RCPSAddNormWrapper(RCPSWrapper):
     def __init__(self, submodule: nn.Module):
         super().__init__(submodule)
-    def forward(self, x, residual=None):
         """
         Args:
             x: Input tensor of shape (batch_size, seq_len, channels)
             residual: Residual tensor of shape (batch_size, seq_len, channels) or None.
         """
         n_channels = x.shape[-1]
         if residual is None:
@@ -123,7 +124,7 @@ class RCPSAddNormWrapper(RCPSWrapper):
             residual = torch.cat([residual_fwd, self.rc(residual_rc)], dim=-1)
             x = torch.cat([x_fwd, self.rc(x_rc)], dim=-1)
-        return x, residual
 class RCPSMambaBlock(nn.Module):
@@ -159,7 +160,7 @@ class RCPSMambaBlock(nn.Module):
             inference_params: inference parameters for mixer.
         """
         if not self.fused_add_norm:
-            hidden_states, residual = self.norm(hidden_states, residual=residual)
             if self.residual_in_fp32:
                 residual = residual.to(torch.float32)
         else:

     def __init__(self, submodule: nn.Module):
         super().__init__(submodule)
+    def forward(self, x, residual=None, prenorm=True):
         """
         Args:
             x: Input tensor of shape (batch_size, seq_len, channels)
             residual: Residual tensor of shape (batch_size, seq_len, channels) or None.
+            prenorm: Whether to return residual.
         """
         n_channels = x.shape[-1]
         if residual is None:
             residual = torch.cat([residual_fwd, self.rc(residual_rc)], dim=-1)
             x = torch.cat([x_fwd, self.rc(x_rc)], dim=-1)
+        return x if not prenorm else (x, residual)
 class RCPSMambaBlock(nn.Module):
             inference_params: inference parameters for mixer.
         """
         if not self.fused_add_norm:
+            hidden_states, residual = self.norm(hidden_states, residual=residual, prenorm=True)
             if self.residual_in_fp32:
                 residual = residual.to(torch.float32)
         else: