soiz1 committed on
Commit
de2b10a
·
verified ·
1 Parent(s): 35a78c5

Create transformer.py

Browse files
Files changed (1) hide show
  1. modules/transformer.py +683 -0
modules/transformer.py ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import numbers
3
+ from functools import partial
4
+ from typing import Any, Callable, List, Optional, Tuple, Union
5
+
6
+ import torch
7
+ from torch import Tensor, nn
8
+ from torch.nn import functional as F
9
+
10
+ from .activation import MultiheadAttention
11
+ from .scaling import ActivationBalancer, BalancedDoubleSwish
12
+ from .scaling import BasicNorm as _BasicNorm
13
+
14
+ _shape_t = Union[int, List[int], torch.Size]
15
+
16
+
17
class LayerNorm(nn.Module):
    """Layer normalization that can pass an auxiliary embedding through.

    The module accepts either a plain tensor or a ``(tensor, embedding)``
    tuple.  For a tuple only the tensor is normalized and the embedding is
    returned untouched next to it, so the module can sit in a pipeline
    that threads an embedding alongside the activations.
    """

    __constants__ = ["normalized_shape", "eps", "elementwise_affine"]
    normalized_shape: Tuple[int, ...]
    eps: float
    elementwise_affine: bool

    def __init__(
        self,
        normalized_shape: "_shape_t",
        eps: float = 1e-5,
        elementwise_affine: bool = True,
        device=None,
        dtype=None,
    ) -> None:
        """Create the layer.

        Args:
            normalized_shape: an integer or a sequence of integers giving
                the shape passed to ``F.layer_norm``.
            eps: value passed to ``F.layer_norm`` as ``eps``.
            elementwise_affine: when true, learnable ``weight`` and ``bias``
                of shape ``normalized_shape`` are created; otherwise both
                are registered as ``None``.
            device: device for the created parameters.
            dtype: dtype for the created parameters.
        """
        super().__init__()

        # A bare integer means a one-dimensional normalized shape.
        if isinstance(normalized_shape, numbers.Integral):
            shape = (normalized_shape,)
        else:
            shape = tuple(normalized_shape)
        self.normalized_shape = shape
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if not self.elementwise_affine:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)
        else:
            tensor_opts = {"device": device, "dtype": dtype}
            self.weight = nn.Parameter(
                torch.empty(self.normalized_shape, **tensor_opts)
            )
            self.bias = nn.Parameter(
                torch.empty(self.normalized_shape, **tensor_opts)
            )

        self.reset_parameters()

    def reset_parameters(self) -> None:
        """Set ``weight`` to ones and ``bias`` to zeros when they exist."""
        if not self.elementwise_affine:
            return
        nn.init.ones_(self.weight)
        nn.init.zeros_(self.bias)

    def _normalize(self, hidden: Tensor) -> Tensor:
        """Apply ``F.layer_norm`` with this module's shape, affine and eps."""
        return F.layer_norm(
            hidden, self.normalized_shape, self.weight, self.bias, self.eps
        )

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        """Normalize ``input``.

        Args:
            input: a tensor, or a ``(tensor, embedding)`` tuple.
            embedding: must be ``None`` when ``input`` is a plain tensor.

        Returns:
            The normalized tensor, or ``(normalized tensor, embedding)``
            when ``input`` was a tuple.
        """
        if isinstance(input, tuple):
            hidden, embedding = input
            return (self._normalize(hidden), embedding)

        assert embedding is None
        return self._normalize(input)

    def extra_repr(self) -> str:
        """Return the configuration summary shown in the module's repr."""
        return (
            f"{self.normalized_shape}, eps={self.eps}, "
            f"elementwise_affine={self.elementwise_affine}"
        )
81
+
82
+
83
class AdaptiveLayerNorm(nn.Module):
    r"""Adaptive Layer Normalization.

    Wraps an inner normalization module and modulates its output with a
    scale and a shift that are linearly projected from an embedding:
    ``weight * norm(input) + bias`` where ``weight, bias`` are the two
    halves (along the last dimension) of ``project_layer(embedding)``.
    """

    def __init__(self, d_model, norm) -> None:
        """Create the layer.

        Args:
            d_model: size of the last dimension of the embedding; the
                projection produces ``2 * d_model`` features.
            norm: the inner normalization module applied to the input.
        """
        super(AdaptiveLayerNorm, self).__init__()
        self.project_layer = nn.Linear(d_model, 2 * d_model)
        self.norm = norm
        self.d_model = d_model
        # Not every norm used in this file exposes ``eps`` (IdentityNorm and
        # BalancedBasicNorm do not), so read it defensively instead of
        # crashing at construction time.  ``eps`` is informational only: it
        # is never used in ``forward``.
        self.eps = getattr(norm, "eps", None)

    def _modulate(self, hidden: Tensor, embedding: Tensor) -> Tensor:
        """Scale and shift ``norm(hidden)`` with projections of ``embedding``."""
        weight, bias = torch.split(
            self.project_layer(embedding),
            split_size_or_sections=self.d_model,
            dim=-1,
        )
        return weight * self.norm(hidden) + bias

    def forward(self, input: Tensor, embedding: Tensor = None) -> Tensor:
        """Apply the adaptive normalization.

        Args:
            input: a tensor, or a ``(tensor, embedding)`` tuple.
            embedding: the conditioning embedding when ``input`` is a plain
                tensor; ignored when ``input`` is a tuple.

        Returns:
            The modulated tensor, or ``(modulated tensor, embedding)`` when
            ``input`` was a tuple.
        """
        if isinstance(input, tuple):
            hidden, embedding = input
            return (self._modulate(hidden, embedding), embedding)

        return self._modulate(input, embedding)
109
+
110
+
111
class BasicNorm(_BasicNorm):
    """Tuple-aware wrapper around the ``BasicNorm`` from ``.scaling``.

    Accepts either a plain tensor or a ``(tensor, embedding)`` tuple; for a
    tuple the parent norm is applied to the tensor only and the embedding
    is returned unchanged.
    """

    def __init__(
        self,
        d_model: int,
        eps: float = 1e-5,
        device=None,
        dtype=None,
    ):
        """Create the layer.

        Args:
            d_model: forwarded positionally to the parent constructor.
            eps: forwarded to the parent constructor as ``eps``.
            device: accepted for signature compatibility; not used here.
            dtype: accepted for signature compatibility; not used here.
        """
        super().__init__(d_model, eps=eps)

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        """Apply the parent norm.

        Args:
            input: a tensor, or a ``(tensor, embedding)`` tuple.
            embedding: must be ``None`` when ``input`` is a plain tensor.

        Returns:
            The normalized tensor, or ``(normalized tensor, embedding)``
            when ``input`` was a tuple.
        """
        if isinstance(input, tuple):
            hidden, embedding = input
            normalized = super().forward(hidden)
            return (normalized, embedding)

        assert embedding is None
        return super().forward(input)
131
+
132
+
133
class BalancedBasicNorm(nn.Module):
    """An ``ActivationBalancer`` followed by a ``BasicNorm``.

    Accepts either a plain tensor or a ``(tensor, embedding)`` tuple; the
    balancer only ever sees the tensor, while the embedding is handed on
    to the norm.
    """

    def __init__(
        self,
        d_model: int,
        eps: float = 1e-5,
        device=None,
        dtype=None,
    ):
        """Create the layer.

        Args:
            d_model: channel count given to both the balancer and the norm.
            eps: forwarded to ``BasicNorm``.
            device: forwarded to ``BasicNorm``.
            dtype: forwarded to ``BasicNorm``.
        """
        super().__init__()
        balancer_config = {
            "channel_dim": -1,
            "min_positive": 0.45,
            "max_positive": 0.55,
            "max_abs": 6.0,
        }
        self.balancer = ActivationBalancer(d_model, **balancer_config)
        self.norm = BasicNorm(d_model, eps, device=device, dtype=dtype)

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        """Balance then normalize.

        Args:
            input: a tensor, or a ``(tensor, embedding)`` tuple.
            embedding: must be ``None`` when ``input`` is a plain tensor.

        Returns:
            Whatever ``self.norm`` returns for the balanced input: a tensor
            for tensor input, a ``(tensor, embedding)`` tuple for tuple
            input.
        """
        if isinstance(input, tuple):
            hidden, embedding = input
            balanced_pair = (self.balancer(hidden), embedding)
            return self.norm(balanced_pair)

        assert embedding is None
        balanced = self.balancer(input)
        return self.norm(balanced)
158
+
159
+
160
class IdentityNorm(nn.Module):
    """A no-op stand-in for a normalization layer.

    It shares the constructor signature of the other norms in this file so
    it can be swapped in as ``layer_norm_cls``, but it returns its input
    unchanged.
    """

    def __init__(
        self,
        d_model: int,
        eps: float = 1e-5,
        device=None,
        dtype=None,
    ) -> None:
        """Create the layer; every argument is accepted and ignored."""
        super().__init__()

    def forward(self, input: Tensor, embedding: Any = None) -> Tensor:
        """Return ``input`` as is.

        Args:
            input: a tensor, or a ``(tensor, embedding)`` tuple.
            embedding: must be ``None`` when ``input`` is a plain tensor.

        Returns:
            The very same object that was passed as ``input``.
        """
        if not isinstance(input, tuple):
            # A separate embedding is only tolerated in the tuple form.
            assert embedding is None
        return input
176
+
177
+
178
class TransformerEncoderLayer(nn.Module):
    """One encoder layer: self-attention followed by a feed-forward block.

    The layer works on either a plain tensor or a ``(tensor,
    stage_embedding)`` tuple.  The stage embedding is handed to the
    normalization layers and, for tuple input, returned next to the output.
    ``norm_first`` selects pre-norm (normalize before each sub-block) or
    post-norm (normalize after each residual sum).
    """

    __constants__ = ["batch_first", "norm_first"]

    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
        batch_first: bool = False,
        norm_first: bool = False,
        device=None,
        dtype=None,
        linear1_self_attention_cls: nn.Module = nn.Linear,
        linear2_self_attention_cls: nn.Module = nn.Linear,
        linear1_feedforward_cls: nn.Module = nn.Linear,
        linear2_feedforward_cls: nn.Module = nn.Linear,
        layer_norm_cls: nn.Module = LayerNorm,
        layer_norm_eps: float = 1e-5,
        adaptive_layer_norm=False,
    ) -> None:
        """Build the layer.

        Args:
            d_model: model width; input and output size of the feed-forward
                block and the size given to attention and norms.
            nhead: number of heads passed to ``MultiheadAttention``.
            dim_feedforward: hidden width of the feed-forward block.
            dropout: dropout probability used for attention and for all
                dropout modules of this layer.
            activation: ``"relu"``/``"gelu"``, a callable, a
                ``functools.partial`` (called with ``d_model`` to build the
                activation), or the ``BalancedDoubleSwish`` class itself
                (instantiated with ``d_model``).
            batch_first: forwarded to ``MultiheadAttention``.
            norm_first: use pre-norm when true, post-norm otherwise.
            device: forwarded to the created sub-modules.
            dtype: forwarded to the created sub-modules.
            linear1_self_attention_cls: passed to ``MultiheadAttention`` as
                ``linear1_cls``.
            linear2_self_attention_cls: passed to ``MultiheadAttention`` as
                ``linear2_cls``.
            linear1_feedforward_cls: class of the first feed-forward linear.
            linear2_feedforward_cls: class of the second feed-forward linear.
            layer_norm_cls: class used to build the norms.  When it is
                ``IdentityNorm`` the second norm is a ``BalancedBasicNorm``
                instead.
            layer_norm_eps: ``eps`` given to the norms.
            adaptive_layer_norm: wrap both norms in ``AdaptiveLayerNorm``.
        """
        factory_kwargs = {"device": device, "dtype": dtype}
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiheadAttention(
            d_model,
            nhead,
            dropout=dropout,
            batch_first=batch_first,
            linear1_cls=linear1_self_attention_cls,
            linear2_cls=linear2_self_attention_cls,
            **factory_kwargs,
        )

        # Feed-forward block.
        self.linear1 = linear1_feedforward_cls(
            d_model, dim_feedforward, **factory_kwargs
        )
        self.dropout = nn.Dropout(dropout)
        self.linear2 = linear2_feedforward_cls(
            dim_feedforward, d_model, **factory_kwargs
        )

        self.norm_first = norm_first
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

        # Legacy string support plus factories for the activation function.
        if isinstance(activation, str):
            activation = _get_activation_fn(activation)
        elif isinstance(activation, partial):
            activation = activation(d_model)
        elif activation == BalancedDoubleSwish:
            activation = BalancedDoubleSwish(d_model)
        self.activation = activation

        norm1 = layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
        if layer_norm_cls == IdentityNorm:
            norm2 = BalancedBasicNorm(
                d_model, eps=layer_norm_eps, **factory_kwargs
            )
        else:
            norm2 = layer_norm_cls(
                d_model, eps=layer_norm_eps, **factory_kwargs
            )

        if adaptive_layer_norm:
            self.norm1 = AdaptiveLayerNorm(d_model, norm1)
            self.norm2 = AdaptiveLayerNorm(d_model, norm2)
        else:
            self.norm1 = norm1
            self.norm2 = norm2

    def __setstate__(self, state):
        """Restore state, defaulting ``activation`` to ReLU when absent."""
        super(TransformerEncoderLayer, self).__setstate__(state)
        if not hasattr(self, "activation"):
            self.activation = F.relu

    @staticmethod
    def _unpack_src(src):
        """Split ``src`` into ``(x, stage_embedding, was_tuple)``."""
        if isinstance(src, tuple):
            x, stage_embedding = src
            return x, stage_embedding, True
        return src, None, False

    @staticmethod
    def _check_key_padding_mask(src_key_padding_mask) -> None:
        """Reject key-padding masks that are neither bool nor floating."""
        if src_key_padding_mask is None:
            return
        _skpm_dtype = src_key_padding_mask.dtype
        if _skpm_dtype != torch.bool and not torch.is_floating_point(
            src_key_padding_mask
        ):
            raise AssertionError(
                "only bool and floating types of key_padding_mask are supported"
            )

    def forward(
        self,
        src: Tensor,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Pass the input through the encoder layer.

        Args:
            src: the sequence to the encoder layer (required); a tensor or
                a ``(tensor, stage_embedding)`` tuple.
            src_mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).

        Returns:
            The layer output, or ``(output, stage_embedding)`` when ``src``
            was a tuple.

        Raises:
            AssertionError: if ``src_key_padding_mask`` is neither a bool
                nor a floating-point tensor.

        Shape:
            see the docs in Transformer class.
        """
        x, stage_embedding, is_src_tuple = self._unpack_src(src)
        self._check_key_padding_mask(src_key_padding_mask)

        if self.norm_first:
            x = x + self._sa_block(
                self.norm1(x, stage_embedding),
                src_mask,
                src_key_padding_mask,
            )
            x = x + self._ff_block(self.norm2(x, stage_embedding))
        else:
            x = self.norm1(
                x + self._sa_block(x, src_mask, src_key_padding_mask),
                stage_embedding,
            )
            x = self.norm2(x + self._ff_block(x), stage_embedding)

        if is_src_tuple:
            return (x, stage_embedding)
        return x

    def infer(
        self,
        src: Tensor,
        src_mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        past_kv: Optional[Tensor] = None,
        use_cache: bool = False,
    ):
        """Run the layer through ``self_attn.infer`` with optional KV cache.

        Args:
            src: a tensor or a ``(tensor, stage_embedding)`` tuple.
            src_mask: passed to ``self_attn.infer`` as ``attn_mask``.
            src_key_padding_mask: passed as ``key_padding_mask``.
            past_kv: passed through to ``self_attn.infer``.
            use_cache: passed through to ``self_attn.infer``.

        Returns:
            ``(output, kv)`` for tensor input, where ``kv`` is the second
            value returned by ``self_attn.infer``; ``(output,
            stage_embedding)`` for tuple input.

        Raises:
            AssertionError: if ``src_key_padding_mask`` is neither a bool
                nor a floating-point tensor.
        """
        x, stage_embedding, is_src_tuple = self._unpack_src(src)
        self._check_key_padding_mask(src_key_padding_mask)

        attn_kwargs = {
            "attn_mask": src_mask,
            "key_padding_mask": src_key_padding_mask,
            "need_weights": False,
            "past_kv": past_kv,
            "use_cache": use_cache,
        }
        if self.norm_first:
            x_attn_out, kv = self.self_attn.infer(
                self.norm1(x, stage_embedding), **attn_kwargs
            )
            x = x + x_attn_out
            x = x + self._ff_block(self.norm2(x, stage_embedding))
        else:
            # Post-norm path, mirroring forward().  Previously this branch
            # was missing: the layer skipped attention and feed-forward and
            # then failed on the undefined ``kv``.  As in the pre-norm path
            # above, dropout1 is not applied to the attention output.
            x_attn_out, kv = self.self_attn.infer(x, **attn_kwargs)
            x = self.norm1(x + x_attn_out, stage_embedding)
            x = self.norm2(x + self._ff_block(x), stage_embedding)

        if is_src_tuple:
            # NOTE(review): the tuple form returns the stage embedding in
            # place of ``kv``, so the cache is dropped for tuple input.
            # Kept as is for backward compatibility - confirm with callers.
            return (x, stage_embedding)
        return (x, kv)

    # self-attention block
    def _sa_block(
        self,
        x: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """Self-attend ``x`` to itself and apply ``dropout1``."""
        x = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )[0]
        return self.dropout1(x)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        """linear1 -> activation -> dropout -> linear2 -> dropout2."""
        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
        return self.dropout2(x)
374
+
375
+
376
class TransformerEncoder(nn.Module):
    r"""TransformerEncoder is a stack of N encoder layers. Users can build the
    BERT(https://arxiv.org/abs/1810.04805) model with corresponding parameters.

    Args:
        encoder_layer: an instance of the TransformerEncoderLayer() class (required).
        num_layers: the number of sub-encoder-layers in the encoder (required).
        norm: the layer normalization component (optional).

    Examples::
        >>> encoder_layer = TransformerEncoderLayer(d_model=512, nhead=8)
        >>> transformer_encoder = TransformerEncoder(encoder_layer, num_layers=6)
        >>> src = torch.rand(10, 32, 512)
        >>> out = transformer_encoder(src)
    """
    __constants__ = ["norm"]

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        # Each layer is an independent deep copy of ``encoder_layer``.
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(
        self,
        src: Tensor,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        return_layer_states: bool = False,
    ) -> Tensor:
        r"""Pass the input through the encoder layers in turn.

        Args:
            src: the sequence to the encoder (required); a tensor or a
                ``(tensor, stage_embedding)`` tuple.
            mask: the mask for the src sequence (optional).
            src_key_padding_mask: the mask for the src keys per batch (optional).
            return_layer_states: return layers' state (optional).

        Returns:
            The output of the last layer (after ``norm`` when one is set).
            With ``return_layer_states=True`` the result is
            ``(layer_states, output)`` where ``layer_states`` holds the
            hidden tensor produced by every layer, before the final norm.

        Shape:
            see the docs in Transformer class.
        """
        layer_states = [] if return_layer_states else None  # layers' output
        output = src
        for mod in self.layers:
            output = mod(
                output,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
            )
            if return_layer_states:
                # Layers return (hidden, stage_embedding) for tuple input
                # and a bare tensor otherwise.  Indexing a bare tensor with
                # [0] would silently slice its first row, so only unpack
                # real tuples.
                hidden = output[0] if isinstance(output, tuple) else output
                layer_states.append(hidden)

        if self.norm is not None:
            output = self.norm(output)

        if return_layer_states:
            return layer_states, output
        return output

    def infer(
        self,
        src: Tensor,
        mask: Optional[Tensor] = None,
        src_key_padding_mask: Optional[Tensor] = None,
        return_layer_states: bool = False,
        past_kv: Optional[Tensor] = None,
        use_cache: bool = False,
    ):
        """Run every layer's ``infer`` with an optional per-layer KV cache.

        Args:
            src: the input handed to the first layer.
            mask: passed to every layer as ``src_mask``.
            src_key_padding_mask: passed to every layer unchanged.
            return_layer_states: accepted for signature parity with
                ``forward``; not used here.
            past_kv: one cache entry per layer, or ``None`` for no cache.
            use_cache: when true, collect the second value returned by each
                layer's ``infer``.

        Returns:
            ``(output, new_kv)`` where ``new_kv`` is a tuple with one entry
            per layer when ``use_cache`` is true, else ``None``.
        """
        if past_kv is None:
            past_kv = tuple([None] * self.num_layers)

        collected = []
        output = src
        for mod, past_layer_kv in zip(self.layers, past_kv):
            output, kv = mod.infer(
                output,
                src_mask=mask,
                src_key_padding_mask=src_key_padding_mask,
                past_kv=past_layer_kv,
                use_cache=use_cache,
            )
            if use_cache:
                collected.append(kv)
        new_kv = tuple(collected) if use_cache else None

        if self.norm is not None:
            output = self.norm(output)

        return output, new_kv
474
+
475
+
476
class TransformerDecoderLayer(nn.Module):
    """One decoder layer: self-attention, attention over memory, feed-forward.

    The target may be a plain tensor or a ``(tensor, stage_embedding)``
    tuple; the stage embedding is given to every norm and, for tuple input,
    returned next to the output.  ``norm_first`` selects pre-norm or
    post-norm ordering.
    """

    __constants__ = ["batch_first", "norm_first"]

    def __init__(
        self,
        d_model: int,
        nhead: int,
        dim_feedforward: int = 2048,
        dropout: float = 0.1,
        activation: Union[str, Callable[[Tensor], Tensor]] = F.relu,
        linear1_self_attention_cls: nn.Module = nn.Linear,
        linear2_self_attention_cls: nn.Module = nn.Linear,
        linear1_feedforward_cls: nn.Module = nn.Linear,
        linear2_feedforward_cls: nn.Module = nn.Linear,
        batch_first: bool = False,
        norm_first: bool = False,
        device=None,
        dtype=None,
        layer_norm_cls: nn.Module = LayerNorm,
        layer_norm_eps: float = 1e-5,
        adaptive_layer_norm=False,
    ) -> None:
        """Build the layer.

        Args:
            d_model: model width.
            nhead: number of heads for both attention modules.
            dim_feedforward: hidden width of the feed-forward block.
            dropout: dropout probability used throughout the layer.
            activation: ``"relu"``/``"gelu"``, a callable, a
                ``functools.partial`` (called with ``d_model``), or the
                ``BalancedDoubleSwish`` class (instantiated with
                ``d_model``).
            linear1_self_attention_cls: ``linear1_cls`` for both attention
                modules.
            linear2_self_attention_cls: ``linear2_cls`` for both attention
                modules.
            linear1_feedforward_cls: class of the first feed-forward linear.
            linear2_feedforward_cls: class of the second feed-forward linear.
            batch_first: forwarded to both attention modules.
            norm_first: use pre-norm when true, post-norm otherwise.
            device: forwarded to the created sub-modules.
            dtype: forwarded to the created sub-modules.
            layer_norm_cls: class used to build the norms.
            layer_norm_eps: ``eps`` given to the norms.
            adaptive_layer_norm: wrap all three norms in
                ``AdaptiveLayerNorm``.
        """
        super().__init__()
        tensor_opts = {"device": device, "dtype": dtype}

        def build_attention():
            # Self- and cross-attention share exactly the same configuration.
            return MultiheadAttention(
                d_model,
                nhead,
                dropout=dropout,
                batch_first=batch_first,
                linear1_cls=linear1_self_attention_cls,
                linear2_cls=linear2_self_attention_cls,
                **tensor_opts,
            )

        def build_norm(norm_cls):
            return norm_cls(d_model, eps=layer_norm_eps, **tensor_opts)

        self.self_attn = build_attention()
        self.multihead_attn = build_attention()

        # Feed-forward block.
        self.linear1 = linear1_feedforward_cls(
            d_model, dim_feedforward, **tensor_opts
        )
        self.dropout = nn.Dropout(dropout)
        self.linear2 = linear2_feedforward_cls(
            dim_feedforward, d_model, **tensor_opts
        )

        self.norm_first = norm_first
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

        # Resolve the activation: legacy string names, factories, or a
        # ready-to-use callable.
        if isinstance(activation, str):
            resolved_activation = _get_activation_fn(activation)
        elif isinstance(activation, partial):
            resolved_activation = activation(d_model)
        elif activation == BalancedDoubleSwish:
            resolved_activation = BalancedDoubleSwish(d_model)
        else:
            resolved_activation = activation
        self.activation = resolved_activation

        if adaptive_layer_norm:
            # Build every inner norm first, then wrap them, so construction
            # order stays norm1, norm2, norm3 followed by the wrappers.
            inner1 = build_norm(layer_norm_cls)
            inner2 = build_norm(layer_norm_cls)
            inner3 = build_norm(layer_norm_cls)

            self.norm1 = AdaptiveLayerNorm(d_model, inner1)
            self.norm2 = AdaptiveLayerNorm(d_model, inner2)
            self.norm3 = AdaptiveLayerNorm(d_model, inner3)
        else:
            self.norm1 = build_norm(layer_norm_cls)
            self.norm2 = build_norm(layer_norm_cls)
            # With IdentityNorm the last norm is a BalancedBasicNorm.
            if layer_norm_cls == IdentityNorm:
                self.norm3 = build_norm(BalancedBasicNorm)
            else:
                self.norm3 = build_norm(layer_norm_cls)

    def forward(
        self,
        tgt: Tensor,
        memory: Tensor,
        tgt_mask: Optional[Tensor] = None,
        memory_mask: Optional[Tensor] = None,
        tgt_key_padding_mask: Optional[Tensor] = None,
        memory_key_padding_mask: Optional[Tensor] = None,
    ) -> Tensor:
        r"""Pass the inputs (and mask) through the decoder layer.

        Args:
            tgt: the sequence to the decoder layer (required); a tensor or
                a ``(tensor, stage_embedding)`` tuple.
            memory: the sequence from the last layer of the encoder (required).
            tgt_mask: the mask for the tgt sequence (optional).
            memory_mask: the mask for the memory sequence (optional).
            tgt_key_padding_mask: the mask for the tgt keys per batch (optional).
            memory_key_padding_mask: the mask for the memory keys per batch (optional).

        Returns:
            The layer output, or ``(output, stage_embedding)`` when ``tgt``
            was a tuple.

        Shape:
            see the docs in Transformer class.
        """
        tgt_is_tuple = isinstance(tgt, tuple)
        if tgt_is_tuple:
            x, stage_embedding = tgt
        else:
            x, stage_embedding = tgt, None

        if self.norm_first:
            # Pre-norm: normalize the input of each sub-block.
            sa_in = self.norm1(x, stage_embedding)
            x = x + self._sa_block(sa_in, tgt_mask, tgt_key_padding_mask)
            mha_in = self.norm2(x, stage_embedding)
            x = x + self._mha_block(
                mha_in, memory, memory_mask, memory_key_padding_mask
            )
            ff_in = self.norm3(x, stage_embedding)
            x = x + self._ff_block(ff_in)
        else:
            # Post-norm: normalize each residual sum.
            sa_sum = x + self._sa_block(x, tgt_mask, tgt_key_padding_mask)
            x = self.norm1(sa_sum, stage_embedding)
            mha_sum = x + self._mha_block(
                x, memory, memory_mask, memory_key_padding_mask
            )
            x = self.norm2(mha_sum, stage_embedding)
            ff_sum = x + self._ff_block(x)
            x = self.norm3(ff_sum, stage_embedding)

        return (x, stage_embedding) if tgt_is_tuple else x

    # self-attention block
    def _sa_block(
        self,
        x: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """Self-attend ``x`` to itself and apply ``dropout1``."""
        attended = self.self_attn(
            x,
            x,
            x,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )[0]
        return self.dropout1(attended)

    # multihead attention block
    def _mha_block(
        self,
        x: Tensor,
        mem: Tensor,
        attn_mask: Optional[Tensor],
        key_padding_mask: Optional[Tensor],
    ) -> Tensor:
        """Attend from ``x`` over ``mem`` and apply ``dropout2``."""
        attended = self.multihead_attn(
            x,
            mem,
            mem,
            attn_mask=attn_mask,
            key_padding_mask=key_padding_mask,
            need_weights=False,
        )[0]
        return self.dropout2(attended)

    # feed forward block
    def _ff_block(self, x: Tensor) -> Tensor:
        """linear1 -> activation -> dropout -> linear2 -> dropout3."""
        hidden = self.activation(self.linear1(x))
        projected = self.linear2(self.dropout(hidden))
        return self.dropout3(projected)
669
+
670
+
671
def _get_clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` deep copies of ``module``."""
    clones = []
    for _ in range(N):
        # Deep copies, so the clones do not share parameters.
        clones.append(copy.deepcopy(module))
    return nn.ModuleList(clones)
673
+
674
+
675
def _get_activation_fn(activation: str) -> Callable[[Tensor], Tensor]:
    """Map a legacy activation name to the matching functional.

    Args:
        activation: ``"relu"`` or ``"gelu"``.

    Returns:
        ``F.relu`` or ``F.gelu``.

    Raises:
        RuntimeError: for any other name.
    """
    known = {"relu": F.relu, "gelu": F.gelu}
    for name, fn in known.items():
        # Compare with ``==`` rather than hashing, so unhashable or
        # non-string input still ends in the RuntimeError below.
        if activation == name:
            return fn

    raise RuntimeError(
        "activation should be relu/gelu, not {}".format(activation)
    )