adalbertojunior
/

mrpt

@@ -134,7 +134,7 @@ class RobertaEmbeddings(nn.Module):
         return embeddings
-class RobertaSelfAttention(nn.Module):
     """Performs multi-headed self attention on a batch of unpadded sequences.
     If Triton is installed, this module uses Flash Attention to greatly improve throughput.
     The Flash Attention implementation used in Mosaic BERT supports arbitrary attention biases (which
@@ -158,18 +158,9 @@ class RobertaSelfAttention(nn.Module):
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.p_dropout = config.attention_probs_dropout_prob
-        self.query = nn.Linear(config.hidden_size, self.all_head_size)
-        self.key = nn.Linear(config.hidden_size, self.all_head_size)
-        self.value = nn.Linear(config.hidden_size, self.all_head_size)
-        # self.Wqkv = nn.Linear(self.all_head_size, 3 * config.hidden_size)
-    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(new_x_shape)
-        return x.permute(0, 2, 1, 3)
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
                 max_seqlen_in_batch: int, indices: torch.Tensor,
                 attn_mask: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
@@ -190,17 +181,17 @@ class RobertaSelfAttention(nn.Module):
         Returns:
             attention: (total_nnz, dim)
         """
-        # qkv = self.Wqkv(hidden_states)
-        # qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1,
-        #                 max_seqlen_in_batch)  # batch, max_seqlen_in_batch, thd
-        # qkv = rearrange(qkv,
-        #                 'b s (t h d) -> b s t h d',
-        #                 t=3,
-        #                 h=self.num_attention_heads)
         # if we have nonzero attention dropout (e.g. during fine-tuning) or xformers is unavailable (`xformers_available` is False), compute attention in PyTorch
-        q = self.transpose_for_scores(self.query(hidden_states))#qkv[:, :, 0, :, :].permute(0, 2, 1, 3)  # b h s d
-        k = self.transpose_for_scores(self.key(hidden_states))#qkv[:, :, 1, :, :].permute(0, 2, 3, 1)  # b h d s
-        v = self.transpose_for_scores(self.value(hidden_states))#qkv[:, :, 2, :, :].permute(0, 2, 1, 3)  # b h s d
         if self.p_dropout or xformers_available is False:
@@ -261,12 +252,12 @@ class RobertaSelfOutput(nn.Module):
         return hidden_states
-class RobertaAttention(nn.Module):
     """Chains attention, Dropout, and LayerNorm for Mosaic BERT."""
     def __init__(self, config):
         super().__init__()
-        self.self = RobertaSelfAttention(config)
         self.output = RobertaSelfOutput(config)
     def forward(
@@ -349,7 +340,7 @@ class RobertaLayer(nn.Module):
     def __init__(self, config):
         super(RobertaLayer, self).__init__()
-        self.attention = RobertaAttention(config)
         self.mlp = RobertaGatedLinearUnitMLP(config)
     def forward(

         return embeddings
+class RobertaUnpadSelfAttention(nn.Module):
     """Performs multi-headed self attention on a batch of unpadded sequences.
     If Triton is installed, this module uses Flash Attention to greatly improve throughput.
     The Flash Attention implementation used in Mosaic BERT supports arbitrary attention biases (which
         self.all_head_size = self.num_attention_heads * self.attention_head_size
         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
         self.p_dropout = config.attention_probs_dropout_prob
+        self.Wqkv = nn.Linear(self.all_head_size, 3 * config.hidden_size)
     def forward(self, hidden_states: torch.Tensor, cu_seqlens: torch.Tensor,
                 max_seqlen_in_batch: int, indices: torch.Tensor,
                 attn_mask: torch.Tensor, bias: torch.Tensor) -> torch.Tensor:
         Returns:
             attention: (total_nnz, dim)
         """
+        qkv = self.Wqkv(hidden_states)
+        qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1,
+                        max_seqlen_in_batch)  # batch, max_seqlen_in_batch, thd
+        qkv = rearrange(qkv,
+                        'b s (t h d) -> b s t h d',
+                        t=3,
+                        h=self.num_attention_heads)
         # if we have nonzero attention dropout (e.g. during fine-tuning) or xformers is unavailable (`xformers_available` is False), compute attention in PyTorch
+        q = qkv[:, :, 0, :, :].permute(0, 2, 1, 3)  # b h s d
+        k = qkv[:, :, 1, :, :].permute(0, 2, 3, 1)  # b h d s
+        v = qkv[:, :, 2, :, :].permute(0, 2, 1, 3)  # b h s d
         if self.p_dropout or xformers_available is False:
         return hidden_states
+class RobertaUnpadAttention(nn.Module):
     """Chains attention, Dropout, and LayerNorm for Mosaic BERT."""
     def __init__(self, config):
         super().__init__()
+        self.self = RobertaUnpadSelfAttention(config)
         self.output = RobertaSelfOutput(config)
     def forward(
     def __init__(self, config):
         super(RobertaLayer, self).__init__()
+        self.attention = RobertaUnpadAttention(config)
         self.mlp = RobertaGatedLinearUnitMLP(config)
     def forward(