magicslabnu
/

OutEffHop-opt-125m

@@ -157,8 +157,8 @@ class OPTAttention(nn.Module):
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -236,16 +236,16 @@ class OPTAttention(nn.Module):
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
-                    (bsz * self.num_heads, tgt_len, src_len)}, but is"
                 f" {attn_weights.size()}"
             )
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -266,8 +266,8 @@ class OPTAttention(nn.Module):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
@@ -333,8 +333,8 @@ class OPTOutEffHop(OPTAttention):
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {
-                    self.embed_dim}"
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
@@ -412,16 +412,16 @@ class OPTOutEffHop(OPTAttention):
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
-                f"Attention weights should be of size {
                     (bsz * self.num_heads, tgt_len, src_len)}, but is"
-                f" {attn_weights.size()}"
             )
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
-                        attention_mask.size()}"
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
@@ -442,8 +442,8 @@ class OPTOutEffHop(OPTAttention):
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
-                    f"Head mask for a single layer should be of size {
-                        (self.num_heads,)}, but is"
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
@@ -470,8 +470,8 @@ class OPTOutEffHop(OPTAttention):
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
-                f"`attn_output` should be of size {
-                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                 f" {attn_output.size()}"
             )

         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
+                f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}'''
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
+                f'''Attention weights should be of size {
+                    (bsz * self.num_heads, tgt_len, src_len)}, but is'''
                 f" {attn_weights.size()}"
             )
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
+                    f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}'''
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
+                    f'''Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is'''
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
         if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
+                f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
+                    self.embed_dim}'''
                 f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
         if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
             raise ValueError(
+                f'''Attention weights should be of size {
+                    (bsz * self.num_heads, tgt_len, src_len)}, but is'''
                 f" {attn_weights.size()}"
             )
         if attention_mask is not None:
             if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                 raise ValueError(
+                    f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
+                        attention_mask.size()}'''
                 )
             attn_weights = attn_weights.view(
                 bsz, self.num_heads, tgt_len, src_len) + attention_mask
         if layer_head_mask is not None:
             if layer_head_mask.size() != (self.num_heads,):
                 raise ValueError(
+                    f'''Head mask for a single layer should be of size {
+                        (self.num_heads,)}, but is'''
                     f" {layer_head_mask.size()}"
                 )
             attn_weights = layer_head_mask.view(
         if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
             raise ValueError(
+                f'''`attn_output` should be of size {
+                    (bsz, self.num_heads, tgt_len, self.head_dim)}, but is'''
                 f" {attn_output.size()}"
             )