Commit ff010c9
Parent(s): b97888f
Update modeling_bluelm.py

modeling_bluelm.py  +20 -22
CHANGED
@@ -32,7 +32,12 @@ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutpu
 from transformers.modeling_utils import PreTrainedModel
 from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 from .configuration_bluelm import BlueLMConfig
-
+from flash_attn.flash_attn_interface import (
+    flash_attn_func,
+    flash_attn_kvpacked_func,
+    flash_attn_qkvpacked_func,
+    flash_attn_varlen_kvpacked_func,
+)
 
 try:
     from xformers import ops as xops
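Note: the new flash-attn import is unconditional, whereas the existing xformers import just below it is wrapped in try/except so the model still loads when xformers is absent. For comparison only, a minimal sketch (not part of this commit) of guarding the flash-attn import the same way:

    try:
        from flash_attn.flash_attn_interface import (
            flash_attn_func,
            flash_attn_kvpacked_func,
            flash_attn_qkvpacked_func,
            flash_attn_varlen_kvpacked_func,
        )
    except ImportError:
        # flash-attn not installed; the flash-attention path in BlueLMAttention
        # would then be unavailable and would need a fallback.
        flash_attn_func = flash_attn_kvpacked_func = None
        flash_attn_qkvpacked_func = flash_attn_varlen_kvpacked_func = None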
@@ -213,6 +218,11 @@ class BlueLMAttention(nn.Module):
             hidden_size,
             bias=False,
         )
+        self.register_buffer(
+            "norm_factor",
+            torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
+            persistent=False,
+        )
         self.rotary_emb = BlueLMRotaryEmbedding(self.head_dim)
         if xops is not None:
             self.causal_mask = xops.LowerTriangularMask()
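Note: the norm_factor buffer registered above holds sqrt(head_dim), and the flash-attn call later in this diff passes softmax_scale=1.0/self.norm_factor, i.e. the standard 1/sqrt(head_dim) attention scaling. A self-contained sketch of the same computation (the head_dim value below is illustrative, not taken from the BlueLM config):

    import torch

    head_dim = 128  # illustrative; the real value comes from the model config
    norm_factor = torch.sqrt(torch.tensor(head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
    softmax_scale = 1.0 / norm_factor  # equals 1 / sqrt(head_dim)
    assert torch.isclose(softmax_scale, torch.tensor(1.0 / head_dim ** 0.5))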
@@ -230,7 +240,8 @@ class BlueLMAttention(nn.Module):
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
 
-        bsz, q_len, _ = hidden_states.size()
+        bsz, q_len, h_size = hidden_states.size()
+        has_layer_past = past_key_value is not None
 
         query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)
         key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)
@@ -245,7 +256,7 @@ class BlueLMAttention(nn.Module):
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, offset=offset)
         # [bsz, t, nh, hd]
 
-        if past_key_value is not None:
+        if has_layer_past:
             # reuse k, v, self_attention
             key_states = torch.cat([past_key_value[0], key_states], dim=1)
             value_states = torch.cat([past_key_value[1], value_states], dim=1)
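Note: when a cached layer past is present, the new key/value states are appended to the cache along dim=1, the time axis of the [bsz, t, nh, hd] layout noted in the comments. A small shape sketch under that assumption (all sizes illustrative):

    import torch

    bsz, nh, hd = 2, 32, 128
    past_k = torch.randn(bsz, 10, nh, hd)  # 10 cached timesteps, [bsz, t, nh, hd]
    new_k = torch.randn(bsz, 1, nh, hd)    # 1 new timestep during decoding
    k = torch.cat([past_k, new_k], dim=1)
    print(k.shape)                         # torch.Size([2, 11, 32, 128])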
@@ -260,25 +271,12 @@ class BlueLMAttention(nn.Module):
             )
         else:
             # [bsz, t, nh, hd]
-            attn_weights = torch.einsum("bqnh,bknh->bnqk", query_states, key_states) / math.sqrt(self.head_dim)
-
-            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
-                raise ValueError(
-                    f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
-                    f" {attn_weights.size()}"
-                )
-
-            if attention_mask is not None:
-                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
-                    raise ValueError(
-                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
-                    )
-                attn_weights = attn_weights + attention_mask
-                attn_weights = torch.max(attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min))
+            kv = torch.stack([key_states, value_states], 2)
+            attn_outputs = flash_attn_kvpacked_func(
+                query_states, kv, dropout_p=0.0, softmax_scale=1.0/self.norm_factor, causal=(not has_layer_past), return_attn_probs=output_attentions)
+            attn_output = attn_outputs[0] if output_attentions else attn_outputs
+            attn_weights = attn_outputs[2] if output_attentions else None
 
-            # upcast attention to fp32
-            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-            attn_output = torch.einsum("bnqk,bknh->bqnh", attn_weights, value_states)
 
         if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
             raise ValueError(
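Note: flash_attn_kvpacked_func expects q of shape [bsz, seqlen_q, nheads, head_dim] and a packed kv of shape [bsz, seqlen_k, 2, nheads, head_dim], which is why key and value states are stacked along dim 2 before the call; with return_attn_probs=True it is documented to return a tuple whose first element is the attention output and whose third element holds the attention probabilities, matching the [0]/[2] indexing above. A minimal usage sketch (requires a CUDA device, fp16/bf16 inputs, and the flash-attn package; all sizes illustrative):

    import torch
    from flash_attn.flash_attn_interface import flash_attn_kvpacked_func

    bsz, q_len, nh, hd = 2, 16, 32, 128
    q = torch.randn(bsz, q_len, nh, hd, dtype=torch.float16, device="cuda")
    k = torch.randn(bsz, q_len, nh, hd, dtype=torch.float16, device="cuda")
    v = torch.randn(bsz, q_len, nh, hd, dtype=torch.float16, device="cuda")

    kv = torch.stack([k, v], 2)         # [bsz, seqlen_k, 2, nh, hd]
    out = flash_attn_kvpacked_func(
        q, kv,
        dropout_p=0.0,
        softmax_scale=1.0 / hd ** 0.5,  # same scaling as 1.0 / norm_factor above
        causal=True,                    # causal mask, as in the no-cache (prefill) case
    )
    print(out.shape)                    # torch.Size([2, 16, 32, 128])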
@@ -612,7 +610,7 @@ class BlueLMModel(BlueLMPreTrainedModel):
         seq_length_with_past = seq_length
         past_key_values_length = 0
         if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[1]
+            past_key_values_length = past_key_values[0][0].shape[2]
         seq_length_with_past = seq_length_with_past + past_key_values_length
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
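Note: past_key_values_length is now read from dimension 2 of the first layer's cached key tensor and added to the current sequence length. A minimal sketch of that bookkeeping, assuming a cache layout whose dimension 2 is the past sequence length (all sizes illustrative):

    import torch

    seq_length = 1  # tokens in the current forward pass
    past_key_values = [(torch.randn(2, 32, 10, 128),   # illustrative cached key, past length at dim 2
                        torch.randn(2, 32, 10, 128))]  # illustrative cached value

    past_key_values_length = 0
    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
    seq_length_with_past = seq_length + past_key_values_length
    print(past_key_values_length, seq_length_with_past)  # 10 11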