microsoft/phi-2

@@ -525,7 +525,6 @@ class MHA(nn.Module):
         softmax_scale: Optional[float] = None,
         layer_idx: Optional[int] = None,
         return_residual: bool = False,
-        checkpointing: bool = False,
     ) -> None:
         super().__init__()
@@ -585,7 +584,7 @@ class MHA(nn.Module):
         self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
         self.layer_idx = layer_idx
         self.return_residual = return_residual
-        self.checkpointing = checkpointing
     def _forward_self_attn(
         self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]

         softmax_scale: Optional[float] = None,
         layer_idx: Optional[int] = None,
         return_residual: bool = False,
     ) -> None:
         super().__init__()
         self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
         self.layer_idx = layer_idx
         self.return_residual = return_residual
+        self.checkpointing = getattr(config, "checkpointing", False)
     def _forward_self_attn(
         self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]