retrieva-jp
/

bert-1.3b

@@ -34,6 +34,7 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
@@ -49,6 +50,10 @@ from transformers.modeling_outputs import (
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import (
@@ -56,6 +61,7 @@ from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     logging,
     replace_return_docstrings,
 )
@@ -407,6 +413,113 @@ class RetrievaBertSelfAttention(nn.Module):
         return outputs
 # Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to RetrievaBertAttention below.
 class RetrievaBertSelfOutput(nn.Module):
     def __init__(self, config):
@@ -420,12 +533,18 @@ class RetrievaBertSelfOutput(nn.Module):
         return residual + hidden_states
 # Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm.
 class RetrievaBertAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.self = RetrievaBertSelfAttention(config)
         self.output = RetrievaBertSelfOutput(config)
         self.pruned_heads = set()
@@ -808,6 +927,7 @@ class RetrievaBertPreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_megatron_bert
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -953,6 +1073,8 @@ class RetrievaBertModel(RetrievaBertPreTrainedModel):
             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
         )
         # Initialize weights and apply final processing
         self.post_init()
@@ -1046,9 +1168,48 @@ class RetrievaBertModel(RetrievaBertPreTrainedModel):
         if position_ids is None:
             position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
@@ -1057,24 +1218,17 @@ class RetrievaBertModel(RetrievaBertPreTrainedModel):
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
-            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
         else:
             encoder_extended_attention_mask = None
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-        embedding_output = self.embeddings(
-            input_ids=input_ids,
-            position_ids=position_ids,
-            token_type_ids=token_type_ids,
-            inputs_embeds=inputs_embeds,
-            past_key_values_length=past_key_values_length,
-        )
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask=extended_attention_mask,

 import torch
 import torch.utils.checkpoint
+from packaging import version
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
     SequenceClassifierOutput,
     TokenClassifierOutput,
 )
+from transformers.modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask_for_sdpa,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from transformers.modeling_utils import PreTrainedModel
 from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
 from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    get_torch_version,
     logging,
     replace_return_docstrings,
 )
         return outputs
+class RetrievaBertSdpaSelfAttention(RetrievaBertSelfAttention):
+    def __init__(self, config):
+        super().__init__(config)
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor]:
+        if output_attentions or head_mask is not None:
+            logger.warning_once(
+                "RetrievaBertSdpaSelfAttention is used but `torch.nn.fuctional.scaled_dot_product_attention`  does not support "
+                "`output_attentions=True` or `head_mask`. Falling back to the manual attention implementation. "
+            )
+            return super().forward(
+                hidden_states,
+                attention_mask,
+                position_ids,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_value,
+                output_attentions,
+            )
+        bsz, tgt_len, _ = hidden_states.size()
+        mixed_query_layer = self.query(hidden_states)
+        query_layer = self.transpose_for_scores(mixed_query_layer, is_query=True)
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        # The following code is based on the implementation of `transformers.BertSdpaSelfAttention`
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+        if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
+            key_layer, value_layer = past_key_value
+        else:
+            key_layer = self.transpose_for_scores(self.key(current_states), is_query=False)
+            value_layer = self.transpose_for_scores(self.value(current_states), is_query=False)
+            if self.rope_emb is not None:
+                cos, sin = self.rope_emb(hidden_states, position_ids)
+                query_layer, key_layer = apply_rotary_pos_emb(query_layer, key_layer, cos, sin)
+            if past_key_value is not None and not is_cross_attention:
+                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
+                value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
+        # For GQA, we repeat the key/value weights.
+        key_layer = repeat_kv(key_layer, self.num_key_value_groups)
+        value_layer = repeat_kv(value_layer, self.num_key_value_groups)
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+        # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom
+        # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0.
+        # Reference: https://github.com/pytorch/pytorch/issues/112577
+        if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None:
+            query_layer = query_layer.contiguous()
+            key_layer = key_layer.contiguous()
+            value_layer = value_layer.contiguous()
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create
+        # a causal mask in case tgt_len == 1.
+        is_causal = (
+            True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False
+        )
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            attn_mask=attention_mask,
+            is_causal=is_causal,
+            dropout_p=self.dropout_prob if self.training else 0.0,
+        )
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size)
+        outputs = (attn_output,)
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
 # Based transformers.models.bert.modeling_bert.BertSelfOutput. Moved LayerNorm to RetrievaBertAttention below.
 class RetrievaBertSelfOutput(nn.Module):
     def __init__(self, config):
         return residual + hidden_states
+RETRIEVA_BERT_SELF_ATTENTION_CLASSES = {
+    "eager": RetrievaBertSelfAttention,
+    "sdpa": RetrievaBertSdpaSelfAttention,
+}
 # Based transformers.models.bert.modeling_bert.BertAttention. Added LayerNorm.
 class RetrievaBertAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.ln = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.self = RETRIEVA_BERT_SELF_ATTENTION_CLASSES[config._attn_implementation](config)
         self.output = RetrievaBertSelfOutput(config)
         self.pruned_heads = set()
     load_tf_weights = load_tf_weights_in_megatron_bert
     base_model_prefix = "bert"
     supports_gradient_checkpointing = True
+    _supports_sdpa = True
     def _init_weights(self, module):
         """Initialize the weights"""
             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
         )
+        self.attn_implementation = config._attn_implementation
         # Initialize weights and apply final processing
         self.post_init()
         if position_ids is None:
             position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+        use_sdpa_attention_masks = (
+            self.attn_implementation == "adpa"
+            and head_mask is None
+            and not output_attentions
+        )
+        extended_attention_mask: torch.Tensor
+        if use_sdpa_attention_masks:
+            # Expand the attention mask for SDPA.
+            # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+            if self.config.is_decoder:
+                extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    input_shape,
+                    embedding_output,
+                    past_key_values_length,
+                )
+            else:
+                extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    attention_mask,
+                    embedding_output.dtype,
+                    tgt_len=seq_length,
+                )
+        else:
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
         # If a 2D or 3D attention mask is provided for the cross-attention
         # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
             encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
             if encoder_attention_mask is None:
                 encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            if use_sdpa_attention_masks:
+                # Expand the attention mask for SDPA.
+                # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len]
+                encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length
+                )
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
         else:
             encoder_extended_attention_mask = None
         encoder_outputs = self.encoder(
             embedding_output,
             attention_mask=extended_attention_mask,