Commit · 5e8c4af
Parent: 8a5eabe

Removed all flash_attn usage

Files changed:
- config.json +0 -3
- configuration_phi.py +0 -6
- modeling_phi.py +4 -83
config.json
CHANGED
@@ -10,9 +10,6 @@
     "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
   },
   "embd_pdrop": 0.0,
-  "flash_attn": false,
-  "flash_rotary": false,
-  "fused_dense": false,
   "img_processor": null,
   "initializer_range": 0.02,
   "layer_norm_epsilon": 1e-05,
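For completeness, a minimal sketch (an illustration only, assuming the edited config.json is readable from the current directory) that verifies the three removed keys are gone after this commit:

import json

# Hypothetical local path; point this at the repository's config.json.
with open("config.json") as f:
    cfg = json.load(f)

# The commit drops these keys entirely, so they should no longer be present.
for removed_key in ("flash_attn", "flash_rotary", "fused_dense"):
    assert removed_key not in cfg, f"{removed_key} should have been removed"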
configuration_phi.py
CHANGED
@@ -29,9 +29,6 @@ class PhiConfig(PretrainedConfig):
         n_head_kv: Optional[int] = None,
         rotary_dim: Optional[int] = 32,
         activation_function: Optional[str] = "gelu_new",
-        flash_attn: bool = False,
-        flash_rotary: bool = False,
-        fused_dense: bool = False,
         attn_pdrop: float = 0.0,
         embd_pdrop: float = 0.0,
         resid_pdrop: float = 0.0,
@@ -50,9 +47,6 @@ class PhiConfig(PretrainedConfig):
         self.n_head_kv = n_head_kv
         self.rotary_dim = min(rotary_dim, n_embd // n_head)
         self.activation_function = activation_function
-        self.flash_attn = flash_attn
-        self.flash_rotary = flash_rotary
-        self.fused_dense = fused_dense
         self.attn_pdrop = attn_pdrop
         self.embd_pdrop = embd_pdrop
         self.resid_pdrop = resid_pdrop
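As a quick illustration of the configuration change (a sketch only, assuming configuration_phi.py is importable from the repository root; the keyword values shown are the ones visible in this diff), PhiConfig is now constructed without the flash-related parameters:

from configuration_phi import PhiConfig

# Before this commit the constructor also accepted
#   flash_attn=False, flash_rotary=False, fused_dense=False
# and stored them on the config. After it, they are gone from the signature
# and nothing in modeling_phi.py reads them.
config = PhiConfig(
    n_head_kv=None,
    rotary_dim=32,
    activation_function="gelu_new",
    attn_pdrop=0.0,
    embd_pdrop=0.0,
    resid_pdrop=0.0,
)
print(config.rotary_dim, config.activation_function)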
modeling_phi.py
CHANGED
@@ -19,17 +19,6 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 
 from .configuration_phi import PhiConfig
 
-try:
-    from flash_attn.bert_padding import pad_input, unpad_input
-    from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
-    from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
-    from flash_attn.ops.fused_dense import FusedDense
-except:
-    pad_input, unpad_input = None, None
-    FlashRotaryEmbedding = None
-    FlashSelfAttention, FlashCrossAttention = None, None
-    FusedDense = None
-
 
 @dataclass
 class InferenceParams:
@@ -532,7 +521,7 @@ class MHA(nn.Module):
         # Rotary embedding
         self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
         if self.rotary_dim > 0:
-            rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
+            rotary_cls = RotaryEmbedding
             if rotary_cls is None:
                 rotary_cls = RotaryEmbedding
 
@@ -555,7 +544,7 @@ class MHA(nn.Module):
         op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
         hidden_size = config.n_embd
 
-        linear_cls = FusedDense if config.fused_dense else nn.Linear
+        linear_cls = nn.Linear
         if linear_cls is None:
             linear_cls = nn.Linear
 
@@ -563,11 +552,11 @@ class MHA(nn.Module):
         self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)
 
         # Attention
-        attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
+        attn_cls = SelfAttention
         if attn_cls is None:
             attn_cls = SelfAttention
 
-        cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
+        cross_attn_cls = CrossAttention
         if cross_attn_cls is None:
             cross_attn_cls = CrossAttention
 
@@ -582,7 +571,6 @@ class MHA(nn.Module):
             attention_dropout=config.attn_pdrop,
         )
 
-        self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
         self.layer_idx = layer_idx
         self.return_residual = return_residual
         self.checkpointing = checkpointing
@@ -596,25 +584,6 @@
         if self.rotary_dim > 0:
             qkv = self.rotary_emb(qkv)
 
-        if self.flash_attn:
-            batch_size, seqlen = qkv.shape[0], qkv.shape[1]
-
-            cu_seqlens, max_seqlen = None, None
-            if key_padding_mask is not None:
-                # If `key_padding_mask` is supplied, we need to unpad the input and retrieve
-                # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
-                qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)
-
-            if self.checkpointing:
-                attn_output = torch.utils.checkpoint.checkpoint(
-                    self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
-                )
-            else:
-                attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)
-
-            # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
-            return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output
-
         if self.checkpointing:
             return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
 
@@ -644,54 +613,6 @@
         if past_key_values is not None:
             kv = _update_kv_cache(kv, past_key_values, self.layer_idx)
 
-        if self.flash_attn:
-            batch_size, seqlen_q = q.shape[0], q.shape[1]
-            seqlen_k = kv.shape[1]
-
-            cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
-                None,
-                None,
-                None,
-                None,
-            )
-            if key_padding_mask is not None:
-                kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
-
-                if seqlen_q == 1:
-                    key_padding_mask = torch.ones(batch_size, 1, device=q.device)
-                elif seqlen_q != seqlen_k:
-                    key_padding_mask = key_padding_mask[:, -seqlen_q:]
-
-                q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)
-
-            if self.checkpointing:
-                attn_output = torch.utils.checkpoint.checkpoint(
-                    self.inner_cross_attn,
-                    q,
-                    kv,
-                    causal=causal,
-                    cu_seqlens=cu_seqlens_q,
-                    max_seqlen=max_seqlen_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_k=max_seqlen_k,
-                )
-            else:
-                attn_output = self.inner_cross_attn(
-                    q,
-                    kv,
-                    causal=causal,
-                    cu_seqlens=cu_seqlens_q,
-                    max_seqlen=max_seqlen_q,
-                    cu_seqlens_k=cu_seqlens_k,
-                    max_seqlen_k=max_seqlen_k,
-                )
-
-            return (
-                pad_input(attn_output, indices_q, batch_size, max_seqlen_q)
-                if key_padding_mask is not None
-                else attn_output
-            )
-
         if self.checkpointing:
             return torch.utils.checkpoint.checkpoint(
                 self.inner_cross_attn,
|
|
| 19 |
|
| 20 |
from .configuration_phi import PhiConfig
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
@dataclass
|
| 24 |
class InferenceParams:
|
|
|
|
| 521 |
# Rotary embedding
|
| 522 |
self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
|
| 523 |
if self.rotary_dim > 0:
|
| 524 |
+
rotary_cls = RotaryEmbedding
|
| 525 |
if rotary_cls is None:
|
| 526 |
rotary_cls = RotaryEmbedding
|
| 527 |
|
|
|
|
| 544 |
op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
|
| 545 |
hidden_size = config.n_embd
|
| 546 |
|
| 547 |
+
linear_cls = nn.Linear
|
| 548 |
if linear_cls is None:
|
| 549 |
linear_cls = nn.Linear
|
| 550 |
|
|
|
|
| 552 |
self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)
|
| 553 |
|
| 554 |
# Attention
|
| 555 |
+
attn_cls = SelfAttention
|
| 556 |
if attn_cls is None:
|
| 557 |
attn_cls = SelfAttention
|
| 558 |
|
| 559 |
+
cross_attn_cls = CrossAttention
|
| 560 |
if cross_attn_cls is None:
|
| 561 |
cross_attn_cls = CrossAttention
|
| 562 |
|
|
|
|
| 571 |
attention_dropout=config.attn_pdrop,
|
| 572 |
)
|
| 573 |
|
|
|
|
| 574 |
self.layer_idx = layer_idx
|
| 575 |
self.return_residual = return_residual
|
| 576 |
self.checkpointing = checkpointing
|
|
|
|
| 584 |
if self.rotary_dim > 0:
|
| 585 |
qkv = self.rotary_emb(qkv)
|
| 586 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 587 |
if self.checkpointing:
|
| 588 |
return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
|
| 589 |
|
|
|
|
| 613 |
if past_key_values is not None:
|
| 614 |
kv = _update_kv_cache(kv, past_key_values, self.layer_idx)
|
| 615 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
if self.checkpointing:
|
| 617 |
return torch.utils.checkpoint.checkpoint(
|
| 618 |
self.inner_cross_attn,
|