Upload modeling_opt.py
modeling_opt.py (+17 -8)
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """PyTorch OPT model."""
-
+import numpy as np
 from typing import List, Optional, Tuple, Union
 from functools import partial
 import torch
@@ -46,10 +46,12 @@ from transformers.utils import (
 )
 from .configuration_opt import OPTConfig
 
+
 def logit(p, eps=1e-16):
     p = np.clip(p, eps, 1 - eps)
     return -np.log(1 / p - 1)
 
+
 class BaseEnumOptions(Flag):
     def __str__(self):
         return self.name
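The `logit` helper added in this hunk (the reason for the new `numpy` import) is the inverse of the sigmoid: logit(p) = log(p / (1 - p)) = -log(1/p - 1), and the clipping keeps p away from 0 and 1, where the log diverges. A minimal standalone sanity check, with illustrative sample values:

import numpy as np

def logit(p, eps=1e-16):
    # Inverse of the sigmoid: logit(p) = log(p / (1 - p)).
    # Clipping keeps p strictly inside (0, 1) so the log stays finite.
    p = np.clip(p, eps, 1 - eps)
    return -np.log(1 / p - 1)

# sigmoid(logit(p)) should recover p.
p = np.array([0.01, 0.5, 0.99])
assert np.allclose(1.0 / (1.0 + np.exp(-logit(p))), p)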
@@ -198,7 +200,8 @@ class OPTAttention(nn.Module):
 
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -368,14 +371,16 @@ class OPTAttention(nn.Module):
 
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f"Attention weights should be of size {
+                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
 
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -396,7 +401,8 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f"Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
@@ -430,7 +436,8 @@ class OPTAttention(nn.Module):
 
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f"`attn_output` should be of size {
+                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )
 
@@ -1088,7 +1095,8 @@ class OPTDecoder(OPTPreTrainedModel):
                 batch_size, mask_seq_length, device=inputs_embeds.device)
         elif attention_mask.shape[1] != mask_seq_length:
             raise ValueError(
-                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
+                f"The provided attention mask has length {
+                    attention_mask.shape[1]}, but its length should be "
                 f"{mask_seq_length} (sum of the lengths of current and past inputs)"
             )
         causal_attention_mask = _prepare_4d_causal_attention_mask(
@@ -1120,7 +1128,8 @@ class OPTDecoder(OPTPreTrainedModel):
             if attn_mask is not None:
                 if attn_mask.size()[0] != (len(self.layers)):
                     raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f"The `{mask_name}` should be specified for {
+                            len(self.layers)} layers, but it is for"
                         f" {head_mask.size()[0]}."
                     )
 
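The remaining hunks all make the same mechanical edit: long f-strings are rewrapped so the replacement field opens with `{` at the end of one line and the expression continues on the next. Worth flagging for downstream users: this wrapping is only valid syntax on Python 3.12+ (PEP 701); on 3.11 and earlier, a newline inside a single-quoted f-string's braces is a SyntaxError. A version-agnostic way to express the same message uses implicit string concatenation instead, sketched below with placeholder values:

embed_dim, num_heads = 768, 12  # placeholder values for illustration

# Works on any supported Python: keep each replacement field on one
# line and let adjacent string literals concatenate implicitly.
msg = (
    f"embed_dim must be divisible by num_heads (got `embed_dim`: {embed_dim}"
    f" and `num_heads`: {num_heads})."
)
print(msg)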