Update modeling_opt.py
modeling_opt.py CHANGED (+4 -3)
@@ -38,6 +38,7 @@ from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+
     is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
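For context on this import block: `is_flash_attn_2_available` is the transformers utility (already imported here from `transformers.utils`) that reports whether a usable flash-attn 2 package is installed. A minimal sketch of the usual guard pattern; the "eager" fallback below is an assumption for illustration, not part of this commit:

    from transformers.utils import is_flash_attn_2_available

    # Assumed guard: fall back to the default "eager" implementation when
    # flash-attn 2 is not installed; only the import appears in this commit.
    attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else "eager"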
@@ -725,10 +726,10 @@ class OPTDecoderLayer(nn.Module):
         super().__init__()
         self.embed_dim = config.hidden_size

-        self.self_attn = OPT_ATTENTION_CLASSES[config.
+        self.self_attn = OPT_ATTENTION_CLASSES[config._attn_implementation](
             config=config, is_decoder=True)
         print(self.self_attn)
-        print(config.
+        print(config._attn_implementation)
         self.do_layer_norm_before = config.do_layer_norm_before
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
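The replacement lines key the attention lookup on `config._attn_implementation` (the `-` lines are cut off mid-token in the source view, so only the `+` lines show the full expressions). `OPT_ATTENTION_CLASSES` is a dispatch table mapping an implementation name to an attention class. A self-contained sketch of the pattern, with stub classes standing in for the real implementations; the class names follow the upstream transformers layout and are an assumption here:

    import torch.nn as nn

    class OPTAttention(nn.Module): ...            # stub: eager implementation
    class OptFlashAttention2(OPTAttention): ...   # stub: flash-attn 2 implementation

    # Dispatch table keyed by the configured attention implementation,
    # mirroring what the diff indexes with config._attn_implementation.
    OPT_ATTENTION_CLASSES = {
        "eager": OPTAttention,
        "flash_attention_2": OptFlashAttention2,
    }

    attn_cls = OPT_ATTENTION_CLASSES["flash_attention_2"]  # -> OptFlashAttention2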
@@ -970,7 +971,7 @@ class OPTDecoder(OPTPreTrainedModel):

         self.layers = nn.ModuleList(
             [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
-        self._use_flash_attention_2 = config.
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
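With this change, `_use_flash_attention_2` is derived directly from `config._attn_implementation`, which in recent transformers versions is populated from the `attn_implementation` argument at load time. A hedged usage sketch; "facebook/opt-125m" is just an example checkpoint, not one referenced by this commit:

    import torch
    from transformers import AutoModelForCausalLM

    # attn_implementation flows into config._attn_implementation, which the
    # decoder reads above; flash-attn 2 requires fp16/bf16 weights and an
    # environment with the flash-attn package installed.
    model = AutoModelForCausalLM.from_pretrained(
        "facebook/opt-125m",
        torch_dtype=torch.float16,
        attn_implementation="flash_attention_2",
    )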