soiz1 committed on
Commit 966ebd7 · verified · 1 Parent(s): c9eaca3

Create activation.py

Files changed (1)
  1. modules/activation.py +612 -0
modules/activation.py ADDED
@@ -0,0 +1,612 @@
+ from typing import Optional, Tuple, List
+ import math
+
+ import torch
+ from torch import Tensor
+ from torch.nn import Linear, Module
+ from torch.nn import functional as F
+ from torch.nn.init import constant_, xavier_normal_, xavier_uniform_
+ from torch.nn.modules.linear import NonDynamicallyQuantizableLinear
+ from torch.nn.parameter import Parameter
+
+ def _in_projection_packed(
+     q: Tensor,
+     k: Tensor,
+     v: Tensor,
+     w: Tensor,
+     b: Optional[Tensor] = None,
+ ) -> List[Tensor]:
+     r"""
+     Performs the in-projection step of the attention operation, using packed weights.
+     Output is a triple containing projection tensors for query, key and value.
+
+     Args:
+         q, k, v: query, key and value tensors to be projected. For self-attention,
+             these are typically the same tensor; for encoder-decoder attention,
+             k and v are typically the same tensor. (We take advantage of these
+             identities for performance if they are present.) Regardless, q, k and v
+             must share a common embedding dimension; otherwise their shapes may vary.
+         w: projection weights for q, k and v, packed into a single tensor. Weights
+             are packed along dimension 0, in q, k, v order.
+         b: optional projection biases for q, k and v, packed into a single tensor
+             in q, k, v order.
+
+     Shape:
+         Inputs:
+         - q: :math:`(..., E)` where E is the embedding dimension
+         - k: :math:`(..., E)` where E is the embedding dimension
+         - v: :math:`(..., E)` where E is the embedding dimension
+         - w: :math:`(E * 3, E)` where E is the embedding dimension
+         - b: :math:`E * 3` where E is the embedding dimension
+
+         Output:
+         - in output list :math:`[q', k', v']`, each output tensor will have the
+             same shape as the corresponding input tensor.
+     """
+     E = q.size(-1)
+     if k is v:
+         if q is k:
+             # self-attention
+             return F.linear(q, w, b).chunk(3, dim=-1)
+         else:
+             # encoder-decoder attention
+             w_q, w_kv = w.split([E, E * 2])
+             if b is None:
+                 b_q = b_kv = None
+             else:
+                 b_q, b_kv = b.split([E, E * 2])
+             return (F.linear(q, w_q, b_q),) + F.linear(k, w_kv, b_kv).chunk(2, dim=-1)
+     else:
+         w_q, w_k, w_v = w.chunk(3)
+         if b is None:
+             b_q = b_k = b_v = None
+         else:
+             b_q, b_k, b_v = b.chunk(3)
+         return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
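+
+ # Example (minimal sketch; shapes are illustrative, E = 8): in the self-attention
+ # branch the packed (3 * E, E) weight gives one matmul followed by a chunk.
+ #
+ # >>> x = torch.randn(2, 4, 8)
+ # >>> w = torch.randn(3 * 8, 8)
+ # >>> b = torch.zeros(3 * 8)
+ # >>> q, k, v = _in_projection_packed(x, x, x, w, b)
+ # >>> q.shape, k.shape, v.shape
+ # (torch.Size([2, 4, 8]), torch.Size([2, 4, 8]), torch.Size([2, 4, 8]))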
+
+ def _scaled_dot_product_attention(
+     q: Tensor,
+     k: Tensor,
+     v: Tensor,
+     attn_mask: Optional[Tensor] = None,
+     dropout_p: float = 0.0,
+ ) -> Tuple[Tensor, Tensor]:
+     r"""
+     Computes scaled dot product attention on query, key and value tensors, using
+     an optional attention mask if passed, and applying dropout if a probability
+     greater than 0.0 is specified.
+     Returns a tensor pair containing attended values and attention weights.
+
+     Args:
+         q, k, v: query, key and value tensors. See Shape section for shape details.
+         attn_mask: optional tensor containing mask values to be added to calculated
+             attention. May be 2D or 3D; see Shape section for details.
+         dropout_p: dropout probability. If greater than 0.0, dropout is applied.
+
+     Shape:
+         - q: :math:`(B, Nt, E)` where B is batch size, Nt is the target sequence length,
+             and E is embedding dimension.
+         - k: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
+             and E is embedding dimension.
+         - v: :math:`(B, Ns, E)` where B is batch size, Ns is the source sequence length,
+             and E is embedding dimension.
+         - attn_mask: either a 3D tensor of shape :math:`(B, Nt, Ns)` or a 2D tensor of
+             shape :math:`(Nt, Ns)`.
+
+         - Output: attention values have shape :math:`(B, Nt, E)`; attention weights
+             have shape :math:`(B, Nt, Ns)`
+     """
+     B, Nt, E = q.shape
+     q = q / math.sqrt(E)
+     # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
+     if attn_mask is not None:
+         attn = torch.baddbmm(attn_mask, q, k.transpose(-2, -1))
+     else:
+         attn = torch.bmm(q, k.transpose(-2, -1))
+
+     attn = F.softmax(attn, dim=-1)
+     if dropout_p > 0.0:
+         attn = F.dropout(attn, p=dropout_p)
+     # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
+     output = torch.bmm(attn, v)
+     return output, attn
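+
+ # Example (minimal sketch, additive float mask): a 2D causal mask is broadcast over
+ # the batch by baddbmm; masked positions receive -inf before the softmax.
+ #
+ # >>> q = k = v = torch.randn(2, 4, 8)
+ # >>> mask = torch.triu(torch.full((4, 4), float("-inf")), diagonal=1)
+ # >>> out, weights = _scaled_dot_product_attention(q, k, v, attn_mask=mask)
+ # >>> out.shape, weights.shape
+ # (torch.Size([2, 4, 8]), torch.Size([2, 4, 4]))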
+
+ def multi_head_attention_forward(
+     x,
+     ipw,
+     ipb,
+     opw,
+     opb,
+     n_head,
+     attn_mask,
+     past_kv=None,
+     use_cache=False,
+ ):
+     """Self-attention forward pass over a batch-first input ``x`` of shape
+     (B, T, C), using packed in-projection weights/bias (ipw, ipb), output
+     projection weights/bias (opw, opb), and an optional key/value cache
+     for incremental decoding."""
+     B, T, C = x.size()
+
+     q, k, v = F.linear(x, ipw, ipb).chunk(3, dim=-1)
+     k = k.view(B, T, n_head, C // n_head).transpose(1, 2)  # (B, nh, T, hs)
+     q = q.view(B, T, n_head, C // n_head).transpose(1, 2)  # (B, nh, T, hs)
+     v = v.view(B, T, n_head, C // n_head).transpose(1, 2)  # (B, nh, T, hs)
+     if past_kv is not None:
+         # prepend cached keys/values along the time dimension
+         past_key, past_value = past_kv
+         k = torch.cat((past_key, k), dim=-2)
+         v = torch.cat((past_value, v), dim=-2)
+
+     FULL_T = k.shape[-2]
+
+     present = (k, v) if use_cache else None
+
+     att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
+     # attn_mask is a boolean mask; slice out the rows belonging to the T new queries
+     att = att.masked_fill(attn_mask[FULL_T - T:FULL_T, :FULL_T], float('-inf'))
+     att = F.softmax(att, dim=-1)
+     y = att @ v  # (B, nh, T, FULL_T) x (B, nh, FULL_T, hs) -> (B, nh, T, hs)
+     y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
+     y = F.linear(y, opw, opb)
+     return (y, present)
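+
+ # Sketch of incremental decoding with the cache (illustrative shapes; assumes a
+ # persistent boolean causal mask of shape (max_len, max_len)):
+ #
+ # >>> C, nh, maxlen = 8, 2, 16
+ # >>> ipw, ipb = torch.randn(3 * C, C), torch.zeros(3 * C)
+ # >>> opw, opb = torch.randn(C, C), torch.zeros(C)
+ # >>> mask = torch.ones(maxlen, maxlen, dtype=torch.bool).triu(1)
+ # >>> x0 = torch.randn(1, 4, C)  # 4-token prompt
+ # >>> y0, kv = multi_head_attention_forward(x0, ipw, ipb, opw, opb, nh, mask, use_cache=True)
+ # >>> x1 = torch.randn(1, 1, C)  # one new token re-uses the cached keys/values
+ # >>> y1, kv = multi_head_attention_forward(x1, ipw, ipb, opw, opb, nh, mask, past_kv=kv, use_cache=True)
+ # >>> y1.shape, kv[0].shape  # cache now holds 5 key positions
+ # (torch.Size([1, 1, 8]), torch.Size([1, 2, 5, 4]))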
+
+
+ class MultiheadAttention(Module):
+     r"""Allows the model to jointly attend to information
+     from different representation subspaces as described in the paper:
+     `Attention Is All You Need <https://arxiv.org/abs/1706.03762>`_.
+
+     Multi-Head Attention is defined as:
+
+     .. math::
+         \text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
+
+     where :math:`head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)`.
+
+     ``forward()`` will use a special optimized implementation if all of the following
+     conditions are met:
+
+     - self attention is being computed (i.e., ``query``, ``key``, and ``value`` are the same tensor;
+       this restriction will be loosened in the future)
+     - either autograd is disabled (using ``torch.inference_mode`` or ``torch.no_grad``) or no tensor
+       argument ``requires_grad``
+     - training is disabled (using ``.eval()``)
+     - dropout is 0
+     - ``add_bias_kv`` is ``False``
+     - ``add_zero_attn`` is ``False``
+     - ``batch_first`` is ``True`` and the input is batched
+     - ``kdim`` and ``vdim`` are equal to ``embed_dim``
+     - at most one of ``key_padding_mask`` or ``attn_mask`` is passed
+     - if a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ is passed, neither ``key_padding_mask``
+       nor ``attn_mask`` is passed
+
+     If the optimized implementation is in use, a
+     `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_ can be passed for
+     ``query``/``key``/``value`` to represent padding more efficiently than using a
+     padding mask. In this case, a `NestedTensor <https://pytorch.org/docs/stable/nested.html>`_
+     will be returned, and an additional speedup proportional to the fraction of the input
+     that is padding can be expected.
+
+     Args:
+         embed_dim: Total dimension of the model.
+         num_heads: Number of parallel attention heads. Note that ``embed_dim`` will be split
+             across ``num_heads`` (i.e. each head will have dimension ``embed_dim // num_heads``).
+         dropout: Dropout probability on ``attn_output_weights``. Default: ``0.0`` (no dropout).
+         bias: If specified, adds bias to input / output projection layers. Default: ``True``.
+         add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
+         add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
+             Default: ``False``.
+         kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
+         vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
+         batch_first: If ``True``, then the input and output tensors are provided
+             as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+
+     Examples::
+
+         >>> # xdoctest: +SKIP
+         >>> multihead_attn = MultiheadAttention(embed_dim, num_heads)
+         >>> attn_output, attn_output_weights = multihead_attn(query, key, value)
+
+     """
+     __constants__ = ["batch_first"]
+     bias_k: Optional[torch.Tensor]
+     bias_v: Optional[torch.Tensor]
+
+     def __init__(
+         self,
+         embed_dim,
+         num_heads,
+         dropout=0.0,
+         bias=True,
+         add_bias_kv=False,
+         add_zero_attn=False,
+         kdim=None,
+         vdim=None,
+         batch_first=False,
+         linear1_cls=Linear,
+         linear2_cls=Linear,
+         device=None,
+         dtype=None,
+     ) -> None:
+         factory_kwargs = {"device": device, "dtype": dtype}
+         super(MultiheadAttention, self).__init__()
+         self.embed_dim = embed_dim
+         self.kdim = kdim if kdim is not None else embed_dim
+         self.vdim = vdim if vdim is not None else embed_dim
+         self._qkv_same_embed_dim = (
+             self.kdim == embed_dim and self.vdim == embed_dim
+         )
+
+         self.num_heads = num_heads
+         self.dropout = dropout
+         self.batch_first = batch_first
+         self.head_dim = embed_dim // num_heads
+         assert (
+             self.head_dim * num_heads == self.embed_dim
+         ), "embed_dim must be divisible by num_heads"
+
+         if add_bias_kv:
+             self.bias_k = Parameter(
+                 torch.empty((1, 1, embed_dim), **factory_kwargs)
+             )
+             self.bias_v = Parameter(
+                 torch.empty((1, 1, embed_dim), **factory_kwargs)
+             )
+         else:
+             self.bias_k = self.bias_v = None
+
+         if linear1_cls == Linear:
+             if not self._qkv_same_embed_dim:
+                 self.q_proj_weight = Parameter(
+                     torch.empty((embed_dim, embed_dim), **factory_kwargs)
+                 )
+                 self.k_proj_weight = Parameter(
+                     torch.empty((embed_dim, self.kdim), **factory_kwargs)
+                 )
+                 self.v_proj_weight = Parameter(
+                     torch.empty((embed_dim, self.vdim), **factory_kwargs)
+                 )
+                 self.register_parameter("in_proj_weight", None)
+             else:
+                 self.in_proj_weight = Parameter(
+                     torch.empty((3 * embed_dim, embed_dim), **factory_kwargs)
+                 )
+                 self.register_parameter("q_proj_weight", None)
+                 self.register_parameter("k_proj_weight", None)
+                 self.register_parameter("v_proj_weight", None)
+
+             if bias:
+                 self.in_proj_bias = Parameter(
+                     torch.empty(3 * embed_dim, **factory_kwargs)
+                 )
+             else:
+                 self.register_parameter("in_proj_bias", None)
+             self.out_proj = NonDynamicallyQuantizableLinear(
+                 embed_dim, embed_dim, bias=bias, **factory_kwargs
+             )
+
+             self._reset_parameters()
+         else:
+             if not self._qkv_same_embed_dim:
+                 raise NotImplementedError
+             else:
+                 self.in_proj_linear = linear1_cls(
+                     embed_dim, 3 * embed_dim, bias=bias, **factory_kwargs
+                 )
+                 self.in_proj_weight = self.in_proj_linear.weight
+
+                 self.register_parameter("q_proj_weight", None)
+                 self.register_parameter("k_proj_weight", None)
+                 self.register_parameter("v_proj_weight", None)
+
+                 if bias:
+                     self.in_proj_bias = self.in_proj_linear.bias
+                 else:
+                     self.register_parameter("in_proj_bias", None)
+
+                 self.out_proj = linear2_cls(
+                     embed_dim, embed_dim, bias=bias, **factory_kwargs
+                 )
+
+                 if self.bias_k is not None:
+                     xavier_normal_(self.bias_k)
+                 if self.bias_v is not None:
+                     xavier_normal_(self.bias_v)
+
+         self.add_zero_attn = add_zero_attn
+
+     def _reset_parameters(self):
+         if self._qkv_same_embed_dim:
+             xavier_uniform_(self.in_proj_weight)
+         else:
+             xavier_uniform_(self.q_proj_weight)
+             xavier_uniform_(self.k_proj_weight)
+             xavier_uniform_(self.v_proj_weight)
+
+         if self.in_proj_bias is not None:
+             constant_(self.in_proj_bias, 0.0)
+             constant_(self.out_proj.bias, 0.0)
+
+         if self.bias_k is not None:
+             xavier_normal_(self.bias_k)
+         if self.bias_v is not None:
+             xavier_normal_(self.bias_v)
+
+     def __setstate__(self, state):
+         # Support loading old MultiheadAttention checkpoints generated by v1.1.0
+         if "_qkv_same_embed_dim" not in state:
+             state["_qkv_same_embed_dim"] = True
+
+         super(MultiheadAttention, self).__setstate__(state)
+
+     def forward(
+         self,
+         query: Tensor,
+         key: Tensor,
+         value: Tensor,
+         key_padding_mask: Optional[Tensor] = None,
+         need_weights: bool = True,
+         attn_mask: Optional[Tensor] = None,
+         average_attn_weights: bool = True,
+     ) -> Tuple[Tensor, Optional[Tensor]]:
+         r"""
+         Args:
+             query: Query embeddings of shape :math:`(L, E_q)` for unbatched input, :math:`(L, N, E_q)` when ``batch_first=False``
+                 or :math:`(N, L, E_q)` when ``batch_first=True``, where :math:`L` is the target sequence length,
+                 :math:`N` is the batch size, and :math:`E_q` is the query embedding dimension ``embed_dim``.
+                 Queries are compared against key-value pairs to produce the output.
+                 See "Attention Is All You Need" for more details.
+             key: Key embeddings of shape :math:`(S, E_k)` for unbatched input, :math:`(S, N, E_k)` when ``batch_first=False``
+                 or :math:`(N, S, E_k)` when ``batch_first=True``, where :math:`S` is the source sequence length,
+                 :math:`N` is the batch size, and :math:`E_k` is the key embedding dimension ``kdim``.
+                 See "Attention Is All You Need" for more details.
+             value: Value embeddings of shape :math:`(S, E_v)` for unbatched input, :math:`(S, N, E_v)` when
+                 ``batch_first=False`` or :math:`(N, S, E_v)` when ``batch_first=True``, where :math:`S` is the source
+                 sequence length, :math:`N` is the batch size, and :math:`E_v` is the value embedding dimension ``vdim``.
+                 See "Attention Is All You Need" for more details.
+             key_padding_mask: If specified, a mask of shape :math:`(N, S)` indicating which elements within ``key``
+                 to ignore for the purpose of attention (i.e. treat as "padding"). For unbatched `query`, shape should be :math:`(S)`.
+                 Binary and float masks are supported.
+                 For a binary mask, a ``True`` value indicates that the corresponding ``key`` value will be ignored for
+                 the purpose of attention. For a float mask, it will be directly added to the corresponding ``key`` value.
+             need_weights: If specified, returns ``attn_output_weights`` in addition to ``attn_outputs``.
+                 Default: ``True``.
+             attn_mask: If specified, a 2D or 3D mask preventing attention to certain positions. Must be of shape
+                 :math:`(L, S)` or :math:`(N\cdot\text{num\_heads}, L, S)`, where :math:`N` is the batch size,
+                 :math:`L` is the target sequence length, and :math:`S` is the source sequence length. A 2D mask will be
+                 broadcasted across the batch while a 3D mask allows for a different mask for each entry in the batch.
+                 Binary, byte, and float masks are supported. For a binary mask, a ``True`` value indicates that the
+                 corresponding position is not allowed to attend. For a byte mask, a non-zero value indicates that the
+                 corresponding position is not allowed to attend. For a float mask, the mask values will be added to
+                 the attention weight.
+             average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across
+                 heads. Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an
+                 effect when ``need_weights=True``. Default: ``True`` (i.e. average weights across heads)
+
+         Outputs:
+             - **attn_output** - Attention outputs of shape :math:`(L, E)` when input is unbatched,
+               :math:`(L, N, E)` when ``batch_first=False`` or :math:`(N, L, E)` when ``batch_first=True``,
+               where :math:`L` is the target sequence length, :math:`N` is the batch size, and :math:`E` is the
+               embedding dimension ``embed_dim``.
+             - **attn_output_weights** - Only returned when ``need_weights=True``. If ``average_attn_weights=True``,
+               returns attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
+               :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
+               :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
+               head of shape :math:`(\text{num\_heads}, L, S)` when input is unbatched or :math:`(N, \text{num\_heads}, L, S)`.
+
+         .. note::
+             ``batch_first`` argument is ignored for unbatched inputs.
+         """
+         is_batched = query.dim() == 3
+         if key_padding_mask is not None:
+             _kpm_dtype = key_padding_mask.dtype
+             if _kpm_dtype != torch.bool and not torch.is_floating_point(
+                 key_padding_mask
+             ):
+                 raise AssertionError(
+                     "only bool and floating types of key_padding_mask are supported"
+                 )
+         why_not_fast_path = ""
+         if not is_batched:
+             why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
+         elif query is not key or key is not value:
+             # When lifting this restriction, don't forget to either
+             # enforce that the dtypes all match or test cases where
+             # they don't!
+             why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
+         elif (
+             self.in_proj_bias is not None
+             and query.dtype != self.in_proj_bias.dtype
+         ):
+             why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
+         elif (
+             self.in_proj_weight is not None
+             and query.dtype != self.in_proj_weight.dtype
+         ):
+             # this case will fail anyway, but at least they'll get a useful error message.
+             why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
+         elif self.training:
+             why_not_fast_path = "training is enabled"
+         elif not self.batch_first:
+             why_not_fast_path = "batch_first was not True"
+         elif self.bias_k is not None:
+             why_not_fast_path = "self.bias_k was not None"
+         elif self.bias_v is not None:
+             why_not_fast_path = "self.bias_v was not None"
+         elif self.dropout:
+             why_not_fast_path = f"dropout was {self.dropout}, required zero"
+         elif self.add_zero_attn:
+             why_not_fast_path = "add_zero_attn was enabled"
+         elif not self._qkv_same_embed_dim:
+             why_not_fast_path = "_qkv_same_embed_dim was not True"
+         elif attn_mask is not None:
+             why_not_fast_path = "attn_mask was not None"
+         elif query.is_nested and key_padding_mask is not None:
+             why_not_fast_path = (
+                 "key_padding_mask is not supported with NestedTensor input"
+             )
+         elif self.num_heads % 2 == 1:
+             why_not_fast_path = "num_heads is odd"
+         elif torch.is_autocast_enabled():
+             why_not_fast_path = "autocast is enabled"
+
+         if not why_not_fast_path:
+             tensor_args = (
+                 query,
+                 key,
+                 value,
+                 self.in_proj_weight,
+                 self.in_proj_bias,
+                 self.out_proj.weight,
+                 self.out_proj.bias,
+             )
+             # We have to use list comprehensions below because TorchScript does not support
+             # generator expressions.
+             if torch.overrides.has_torch_function(tensor_args):
+                 why_not_fast_path = "some Tensor argument has_torch_function"
+             elif not all(
+                 [
+                     (x is None or x.is_cuda or "cpu" in str(x.device))
+                     for x in tensor_args
+                 ]
+             ):
+                 why_not_fast_path = (
+                     "some Tensor argument is neither CUDA nor CPU"
+                 )
+             elif torch.is_grad_enabled() and any(
+                 [x is not None and x.requires_grad for x in tensor_args]
+             ):
+                 why_not_fast_path = (
+                     "grad is enabled and at least one of query or the "
+                     "input/output projection weights or biases requires_grad"
+                 )
+             if not why_not_fast_path:
+                 # last argument is the mask type: 1 for key_padding_mask,
+                 # 0 for attn_mask, None for no mask
+                 return torch._native_multi_head_attention(
+                     query,
+                     key,
+                     value,
+                     self.embed_dim,
+                     self.num_heads,
+                     self.in_proj_weight,
+                     self.in_proj_bias,
+                     self.out_proj.weight,
+                     self.out_proj.bias,
+                     key_padding_mask
+                     if key_padding_mask is not None
+                     else attn_mask,
+                     need_weights,
+                     average_attn_weights,
+                     1
+                     if key_padding_mask is not None
+                     else 0
+                     if attn_mask is not None
+                     else None,
+                 )
+
+         any_nested = query.is_nested or key.is_nested or value.is_nested
+         assert not any_nested, (
+             "MultiheadAttention does not support NestedTensor outside of its fast path. "
+             + f"The fast path was not hit because {why_not_fast_path}"
+         )
+
+         if self.batch_first and is_batched:
+             # make sure that the transpose op does not affect the "is" property
+             if key is value:
+                 if query is key:
+                     query = key = value = query.transpose(1, 0)
+                 else:
+                     query, key = [x.transpose(1, 0) for x in (query, key)]
+                     value = key
+             else:
+                 query, key, value = [
+                     x.transpose(1, 0) for x in (query, key, value)
+                 ]
+
+         if not self._qkv_same_embed_dim:
+             attn_output, attn_output_weights = F.multi_head_attention_forward(
+                 query,
+                 key,
+                 value,
+                 self.embed_dim,
+                 self.num_heads,
+                 self.in_proj_weight,
+                 self.in_proj_bias,
+                 self.bias_k,
+                 self.bias_v,
+                 self.add_zero_attn,
+                 self.dropout,
+                 self.out_proj.weight,
+                 self.out_proj.bias,
+                 training=self.training,
+                 key_padding_mask=key_padding_mask,
+                 need_weights=need_weights,
+                 attn_mask=attn_mask,
+                 use_separate_proj_weight=True,
+                 q_proj_weight=self.q_proj_weight,
+                 k_proj_weight=self.k_proj_weight,
+                 v_proj_weight=self.v_proj_weight,
+                 average_attn_weights=average_attn_weights,
+             )
+         else:
+             attn_output, attn_output_weights = F.multi_head_attention_forward(
+                 query,
+                 key,
+                 value,
+                 self.embed_dim,
+                 self.num_heads,
+                 self.in_proj_weight,
+                 self.in_proj_bias,
+                 self.bias_k,
+                 self.bias_v,
+                 self.add_zero_attn,
+                 self.dropout,
+                 self.out_proj.weight,
+                 self.out_proj.bias,
+                 training=self.training,
+                 key_padding_mask=key_padding_mask,
+                 need_weights=need_weights,
+                 attn_mask=attn_mask,
+                 average_attn_weights=average_attn_weights,
+             )
+         if self.batch_first and is_batched:
+             return attn_output.transpose(1, 0), attn_output_weights
+         else:
+             return attn_output, attn_output_weights
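+
+     # Example (minimal sketch, batch-first self-attention with padding):
+     #
+     # >>> mha = MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)
+     # >>> x = torch.randn(2, 5, 8)
+     # >>> pad = torch.zeros(2, 5, dtype=torch.bool)
+     # >>> pad[1, 3:] = True  # ignore the tail of the second sample
+     # >>> out, w = mha(x, x, x, key_padding_mask=pad)
+     # >>> out.shape, w.shape
+     # (torch.Size([2, 5, 8]), torch.Size([2, 5, 5]))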
+
+     def infer(
+         self,
+         x: Tensor,
+         key_padding_mask: Optional[Tensor] = None,
+         need_weights: bool = True,
+         attn_mask: Optional[Tensor] = None,
+         average_attn_weights: bool = True,
+         past_kv=None,
+         use_cache=False,
+     ):
+         """Batch-first self-attention for incremental inference. Returns the
+         attention output and, when ``use_cache`` is set, the updated (k, v)
+         cache. ``key_padding_mask``, ``need_weights`` and ``average_attn_weights``
+         are accepted for interface compatibility but are unused on this path."""
+         y, kv = multi_head_attention_forward(
+             x=x,
+             ipw=self.in_proj_weight,
+             ipb=self.in_proj_bias,
+             opw=self.out_proj.weight,
+             opb=self.out_proj.bias,
+             n_head=self.num_heads,
+             attn_mask=attn_mask,
+             past_kv=past_kv,
+             use_cache=use_cache,
+         )
+         return (y, kv)
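+
+
+ # A minimal smoke test of the cached inference path (illustrative sketch; the
+ # sizes below are arbitrary, and ``infer`` expects a batch-first (B, T, C) input).
+ if __name__ == "__main__":
+     torch.manual_seed(0)
+     mha = MultiheadAttention(embed_dim=8, num_heads=2)
+     max_len = 16
+     # persistent boolean causal mask: True above the diagonal marks disallowed positions
+     causal = torch.ones(max_len, max_len, dtype=torch.bool).triu(1)
+
+     prompt = torch.randn(1, 4, 8)
+     y, kv = mha.infer(prompt, attn_mask=causal, use_cache=True)
+     assert y.shape == (1, 4, 8) and kv[0].shape[-2] == 4
+
+     step = torch.randn(1, 1, 8)  # decode one token against the cached keys/values
+     y, kv = mha.infer(step, attn_mask=causal, past_kv=kv, use_cache=True)
+     assert y.shape == (1, 1, 8) and kv[0].shape[-2] == 5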