Update modeling_opt.py
Browse files - modeling_opt.py +18 -18
modeling_opt.py
CHANGED
|
@@ -157,8 +157,8 @@ class OPTAttention(nn.Module):
|
|
| 157 |
|
| 158 |
if (self.head_dim * self.num_heads) != self.embed_dim:
|
| 159 |
raise ValueError(
|
| 160 |
-
f
|
| 161 |
-
self.embed_dim}
|
| 162 |
f" and `num_heads`: {self.num_heads})."
|
| 163 |
)
|
| 164 |
self.scaling = self.head_dim**-0.5
|
|
@@ -236,16 +236,16 @@ class OPTAttention(nn.Module):
|
|
| 236 |
|
| 237 |
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 238 |
raise ValueError(
|
| 239 |
-
f
|
| 240 |
-
(bsz * self.num_heads, tgt_len, src_len)}, but is
|
| 241 |
f" {attn_weights.size()}"
|
| 242 |
)
|
| 243 |
|
| 244 |
if attention_mask is not None:
|
| 245 |
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 246 |
raise ValueError(
|
| 247 |
-
f
|
| 248 |
-
attention_mask.size()}
|
| 249 |
)
|
| 250 |
attn_weights = attn_weights.view(
|
| 251 |
bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
|
@@ -266,8 +266,8 @@ class OPTAttention(nn.Module):
|
|
| 266 |
if layer_head_mask is not None:
|
| 267 |
if layer_head_mask.size() != (self.num_heads,):
|
| 268 |
raise ValueError(
|
| 269 |
-
f
|
| 270 |
-
(self.num_heads,)}, but is
|
| 271 |
f" {layer_head_mask.size()}"
|
| 272 |
)
|
| 273 |
attn_weights = layer_head_mask.view(
|
|
@@ -333,8 +333,8 @@ class OPTOutEffHop(OPTAttention):
|
|
| 333 |
|
| 334 |
if (self.head_dim * self.num_heads) != self.embed_dim:
|
| 335 |
raise ValueError(
|
| 336 |
-
f
|
| 337 |
-
self.embed_dim}
|
| 338 |
f" and `num_heads`: {self.num_heads})."
|
| 339 |
)
|
| 340 |
self.scaling = self.head_dim**-0.5
|
|
@@ -412,16 +412,16 @@ class OPTOutEffHop(OPTAttention):
|
|
| 412 |
|
| 413 |
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 414 |
raise ValueError(
|
| 415 |
-
f
|
| 416 |
(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
| 417 |
-
f" {attn_weights.size()}
|
| 418 |
)
|
| 419 |
|
| 420 |
if attention_mask is not None:
|
| 421 |
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 422 |
raise ValueError(
|
| 423 |
-
f
|
| 424 |
-
attention_mask.size()}
|
| 425 |
)
|
| 426 |
attn_weights = attn_weights.view(
|
| 427 |
bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
|
@@ -442,8 +442,8 @@ class OPTOutEffHop(OPTAttention):
|
|
| 442 |
if layer_head_mask is not None:
|
| 443 |
if layer_head_mask.size() != (self.num_heads,):
|
| 444 |
raise ValueError(
|
| 445 |
-
f
|
| 446 |
-
(self.num_heads,)}, but is
|
| 447 |
f" {layer_head_mask.size()}"
|
| 448 |
)
|
| 449 |
attn_weights = layer_head_mask.view(
|
|
@@ -470,8 +470,8 @@ class OPTOutEffHop(OPTAttention):
|
|
| 470 |
|
| 471 |
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
| 472 |
raise ValueError(
|
| 473 |
-
f
|
| 474 |
-
(bsz, self.num_heads, tgt_len, self.head_dim)}, but is
|
| 475 |
f" {attn_output.size()}"
|
| 476 |
)
|
| 477 |
|
|
|
|
| 157 |
|
| 158 |
if (self.head_dim * self.num_heads) != self.embed_dim:
|
| 159 |
raise ValueError(
|
| 160 |
+
f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
|
| 161 |
+
self.embed_dim}'''
|
| 162 |
f" and `num_heads`: {self.num_heads})."
|
| 163 |
)
|
| 164 |
self.scaling = self.head_dim**-0.5
|
|
|
|
| 236 |
|
| 237 |
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 238 |
raise ValueError(
|
| 239 |
+
f'''Attention weights should be of size {
|
| 240 |
+
(bsz * self.num_heads, tgt_len, src_len)}, but is'''
|
| 241 |
f" {attn_weights.size()}"
|
| 242 |
)
|
| 243 |
|
| 244 |
if attention_mask is not None:
|
| 245 |
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 246 |
raise ValueError(
|
| 247 |
+
f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
|
| 248 |
+
attention_mask.size()}'''
|
| 249 |
)
|
| 250 |
attn_weights = attn_weights.view(
|
| 251 |
bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
|
|
|
| 266 |
if layer_head_mask is not None:
|
| 267 |
if layer_head_mask.size() != (self.num_heads,):
|
| 268 |
raise ValueError(
|
| 269 |
+
f'''Head mask for a single layer should be of size {
|
| 270 |
+
(self.num_heads,)}, but is'''
|
| 271 |
f" {layer_head_mask.size()}"
|
| 272 |
)
|
| 273 |
attn_weights = layer_head_mask.view(
|
|
|
|
| 333 |
|
| 334 |
if (self.head_dim * self.num_heads) != self.embed_dim:
|
| 335 |
raise ValueError(
|
| 336 |
+
f'''embed_dim must be divisible by num_heads (got `embed_dim`: {
|
| 337 |
+
self.embed_dim}'''
|
| 338 |
f" and `num_heads`: {self.num_heads})."
|
| 339 |
)
|
| 340 |
self.scaling = self.head_dim**-0.5
|
|
|
|
| 412 |
|
| 413 |
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 414 |
raise ValueError(
|
| 415 |
+
f'''Attention weights should be of size {
|
| 416 |
(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
| 417 |
+
f" {attn_weights.size()}'''
|
| 418 |
)
|
| 419 |
|
| 420 |
if attention_mask is not None:
|
| 421 |
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 422 |
raise ValueError(
|
| 423 |
+
f'''Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {
|
| 424 |
+
attention_mask.size()}'''
|
| 425 |
)
|
| 426 |
attn_weights = attn_weights.view(
|
| 427 |
bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
|
|
|
| 442 |
if layer_head_mask is not None:
|
| 443 |
if layer_head_mask.size() != (self.num_heads,):
|
| 444 |
raise ValueError(
|
| 445 |
+
f'''Head mask for a single layer should be of size {
|
| 446 |
+
(self.num_heads,)}, but is'''
|
| 447 |
f" {layer_head_mask.size()}"
|
| 448 |
)
|
| 449 |
attn_weights = layer_head_mask.view(
|
|
|
|
| 470 |
|
| 471 |
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
| 472 |
raise ValueError(
|
| 473 |
+
f'''`attn_output` should be of size {
|
| 474 |
+
(bsz, self.num_heads, tgt_len, self.head_dim)}, but is'''
|
| 475 |
f" {attn_output.size()}"
|
| 476 |
)
|
| 477 |
|