TaylorAI/Flash-Llama-7B

Text Generation

text-generation-inference

Model card · Files and versions

andersonbcdefg committed on Aug 28, 2023

Commit

422979b

·

1 Parent(s): a9ab032

Upload modeling_flash_llama.py

Files changed (1) hide show

modeling_flash_llama.py +4 -3

modeling_flash_llama.py CHANGED Viewed

@@ -290,9 +290,10 @@ class LlamaAttention(nn.Module):
             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             assert scaling_type == 'linear'
         self.rotary_emb = FlashRotaryEmbedding(
-            self.head_dim, base=10000, interleaved=False, scaling_factor=scaling_factor,
         )
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
@@ -362,7 +363,7 @@ class LlamaAttention(nn.Module):
         past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None
         # no padding tokens, more efficient
-        attn_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         attn_outputs = flash_attn_kvpacked_func(
             q.type(attn_dtype), kv.type(attn_dtype), dropout_p=0.0, softmax_scale=1.0/self.norm_factor, causal=(not has_layer_past), return_attn_probs=output_attentions)

             scaling_type = self.config.rope_scaling["type"]
             scaling_factor = self.config.rope_scaling["factor"]
             assert scaling_type == 'linear'
+        rotary_base = self.config.__dict__.get("rope_theta", 10000.0)
         self.rotary_emb = FlashRotaryEmbedding(
+            self.head_dim, base=rotary_base, interleaved=False, scaling_factor=scaling_factor,
         )
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None
         # no padding tokens, more efficient
+        attn_dtype = self.o_proj.weight.dtype
         attn_outputs = flash_attn_kvpacked_func(
             q.type(attn_dtype), kv.type(attn_dtype), dropout_p=0.0, softmax_scale=1.0/self.norm_factor, causal=(not has_layer_past), return_attn_probs=output_attentions)