# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import math
from typing import Optional, Tuple

import numpy as np
import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention Is All You Need paper, generalized to work on images.
    """

    def __init__(
            self,
            num_pos_feats,
            temperature: int = 10000,
            normalize: bool = True,
            scale: Optional[float] = None,
            # The following settings are only relevant
            # for warming up the cache for compilation
            warmup_cache: bool = True,
            image_size: int = 1024,
            strides: Tuple[int, ...] = (4, 8, 16, 32),
    ):
        super().__init__()
        assert num_pos_feats % 2 == 0, "Expecting even model width"
        self.num_pos_feats = num_pos_feats // 2
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

        self.cache = {}
        if warmup_cache:
            # Warmup cache for cuda and npu, to help with compilation
            try:
                import torch_npu
                has_npu = torch_npu.npu.is_available()
            except ImportError:
                has_npu = False
            if torch.cuda.is_available() or has_npu:
                device = torch.device("cuda" if torch.cuda.is_available() else "npu")
                for stride in strides:
                    cache_key = (image_size // stride, image_size // stride)
                    self._pe(1, device, None, *cache_key)

    def _encode_xy(self, x, y):
        # NOTE: autocasting should be disabled here (force fp32), as in _pe below.
        # This method is disabled; the unreachable code below is kept for reference.
        raise NotImplementedError
        # The positions are expected to be normalized
        assert len(x) == len(y) and x.ndim == y.ndim == 1
        x_embed = x * self.scale
        y_embed = y * self.scale

        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, None] / dim_t
        pos_y = y_embed[:, None] / dim_t
        pos_x = torch.stack((pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2).flatten(1)
        pos_y = torch.stack((pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2).flatten(1)
        return pos_x, pos_y

    @torch.no_grad()
    def encode_boxes(self, x, y, w, h):
        # NOTE: autocasting should be disabled here (force fp32), as in _pe below.
        # This method is disabled; the unreachable code below is kept for reference.
        raise NotImplementedError
        pos_x, pos_y = self._encode_xy(x, y)
        pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1)
        return pos

    encode = encode_boxes  # Backwards compatibility

    @torch.no_grad()
    def encode_points(self, x, y, labels):
        # NOTE: autocasting should be disabled here (force fp32), as in _pe below.
        # This method is disabled; the unreachable code below is kept for reference.
        raise NotImplementedError
        (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape
        assert bx == by and nx == ny and bx == bl and nx == nl
        pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten())
        pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1)
        pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2)
        return pos

    @torch.no_grad()
    def _pe(self, B, device, dtype, *cache_key):
        H, W = cache_key
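        # Reuse the cached (C, H, W) encoding for this spatial size when available,
        # expanding it to the requested batch size.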
        if cache_key in self.cache:
            return self.cache[cache_key].to(device)[None].repeat(B, 1, 1, 1)

        # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
        with torch.autocast(device_type=device.type, enabled=False):
            y_embed = torch.arange(1, H + 1, dtype=torch.float32, device=device).view(1, -1, 1).repeat(B, 1, W)
            x_embed = torch.arange(1, W + 1, dtype=torch.float32, device=device).view(1, 1, -1).repeat(B, H, 1)

            if self.normalize:
                eps = 1e-6
                y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
                x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

            dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=device)
            dim_t = self.temperature**(2 * (dim_t // 2) / self.num_pos_feats)

            pos_x = x_embed[:, :, :, None] / dim_t
            pos_y = y_embed[:, :, :, None] / dim_t
            pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
            pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
            pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)

        if dtype is not None:
            pos = pos.to(dtype)

        self.cache[cache_key] = pos[0]
        return pos

    @torch.no_grad()
    def forward(self, x: torch.Tensor):
        B = x.shape[0]
        cache_key = (x.shape[-2], x.shape[-1])
        return self._pe(B, x.device, x.dtype, *cache_key)
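
# Usage sketch (shapes assumed): given a (B, C, H, W) feature map, the module
# returns a (B, num_pos_feats, H, W) positional encoding, e.g.:
#   pe_layer = PositionEmbeddingSine(num_pos_feats=256, warmup_cache=False)
#   feats = torch.zeros(2, 256, 64, 64)
#   pos = pe_layer(feats)  # torch.Size([2, 256, 64, 64])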


class PositionEmbeddingRandom(nn.Module):
    """
    Positional encoding using random spatial frequencies.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((2, num_pos_feats)),
        )

    @torch.no_grad()
    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix.to(coords.dtype)
        coords = 2 * np.pi * coords
        # outputs d_1 x ... x d_n x C shape
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    @torch.no_grad()
    def forward(self, size: Tuple[int, int]) -> torch.Tensor:
        """Generate positional encoding for a grid of the specified size."""
        h, w = size
        device = self.positional_encoding_gaussian_matrix.device

        # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
        with torch.autocast(device_type=device.type, enabled=False):
            grid = torch.ones((h, w), device=device, dtype=torch.float32)
            y_embed = grid.cumsum(dim=0) - 0.5
            x_embed = grid.cumsum(dim=1) - 0.5
            y_embed = y_embed / h
            x_embed = x_embed / w
            pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))

        pe = pe.to(self.positional_encoding_gaussian_matrix.dtype)
        return pe.permute(2, 0, 1)  # C x H x W

    @torch.no_grad()
    def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
        """Positionally encode points that are not normalized to [0,1]."""
        assert coords_input.dtype == torch.float, 'coords_input must be in float32'

        # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
        with torch.autocast(device_type=coords_input.device.type, enabled=False):
            coords = coords_input.clone()
            coords[:, :, 0] = coords[:, :, 0] / image_size[1]
            coords[:, :, 1] = coords[:, :, 1] / image_size[0]
            pe = self._pe_encoding(coords.to(torch.float))  # B x N x C

        pe = pe.to(self.positional_encoding_gaussian_matrix.dtype)
        return pe
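
# Usage sketch (sizes assumed): a 2D grid or raw pixel coordinates are mapped to
# a 2 * num_pos_feats channel Fourier-feature encoding, e.g.:
#   pe_layer = PositionEmbeddingRandom(num_pos_feats=64)
#   grid_pe = pe_layer((32, 32))  # torch.Size([128, 32, 32])
#   pts = torch.rand(1, 5, 2) * 1024  # pixel coords in a 1024x1024 image
#   pts_pe = pe_layer.forward_with_coords(pts, (1024, 1024))  # torch.Size([1, 5, 128])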


class PositionEmbedding1DRandom(nn.Module):
    """
    Positional encoding using random frequencies for 1D inputs.
    """

    def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
        super().__init__()
        if scale is None or scale <= 0.0:
            scale = 1.0
        self.register_buffer(
            "positional_encoding_gaussian_matrix",
            scale * torch.randn((1, num_pos_feats)),
        )

    @torch.no_grad()
    def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
        """Positionally encode points that are normalized to [0,1]."""
        coords = 2 * coords - 1
        coords = coords @ self.positional_encoding_gaussian_matrix.to(coords.dtype)
        coords = 2 * np.pi * coords
        return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

    @torch.no_grad()
    def forward(self, size: int) -> torch.Tensor:
        """Generate positional encoding for a sequence of the specified length."""
        device = self.positional_encoding_gaussian_matrix.device

        # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
        with torch.autocast(device_type=device.type, enabled=False):
            positions = torch.arange(size, device=device, dtype=torch.float32)
            positions = positions / (size - 1)
            positions = positions.unsqueeze(-1)
            pe = self._pe_encoding(positions)

        pe = pe.to(self.positional_encoding_gaussian_matrix.dtype)
        return pe.permute(1, 0)  # C x L

    @torch.no_grad()
    def forward_with_coords(self, coords_input: torch.Tensor, seq_length: int) -> torch.Tensor:
        """Positionally encode raw coordinates by normalizing to [0,1]."""
        assert coords_input.dtype == torch.float, 'coords_input must be in float32'

        # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
        with torch.autocast(device_type=coords_input.device.type, enabled=False):
            coords = coords_input.clone()
            coords = coords / (seq_length - 1)
            if coords.dim() == 2:
                coords = coords.unsqueeze(-1)
            pe = self._pe_encoding(coords.to(torch.float))  # B x N x C

        pe = pe.to(self.positional_encoding_gaussian_matrix.dtype)
        return pe
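
# Usage sketch (length assumed): 1D positions are mapped to a 2 * num_pos_feats
# channel Fourier-feature encoding, e.g.:
#   pe_layer = PositionEmbedding1DRandom(num_pos_feats=64)
#   seq_pe = pe_layer(16)  # torch.Size([128, 16])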


# Rotary Positional Encoding, adapted from:
# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py
# 2. https://github.com/naver-ai/rope-vit
# 3. https://github.com/lucidrains/rotary-embedding-torch


@torch.no_grad()
def init_t_xy(end_x: int, end_y: int):
    t = torch.arange(end_x * end_y, dtype=torch.float32)
    t_x = (t % end_x).float()
    t_y = torch.div(t, end_x, rounding_mode="floor").float()
    return t_x, t_y
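
# For example, init_t_xy(end_x=3, end_y=2) enumerates a 3x2 grid in row-major
# order: t_x = [0, 1, 2, 0, 1, 2], t_y = [0, 0, 0, 1, 1, 1].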


@torch.no_grad()
def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0):
    # Force fp32 on CPU (see https://github.com/huggingface/transformers/pull/29285)
    with torch.autocast(device_type='cpu', enabled=False):
        freqs_x = 1.0 / (theta**(torch.arange(0, dim, 4)[:(dim // 4)].float() / dim))
        freqs_y = 1.0 / (theta**(torch.arange(0, dim, 4)[:(dim // 4)].float() / dim))

        t_x, t_y = init_t_xy(end_x, end_y)
        freqs_x = torch.outer(t_x, freqs_x)
        freqs_y = torch.outer(t_y, freqs_y)
        freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x)
        freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y)

    return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1)
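
# Shape sketch: for a head dimension of 64 over a 4x4 token grid,
# compute_axial_cis(64, 4, 4) returns a (16, 32) complex tensor: one row per
# grid position, holding 16 x-frequencies followed by 16 y-frequencies.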


@torch.no_grad()
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
    ndim = x.ndim
    assert ndim >= 2
    assert freqs_cis.shape == (x.shape[-2], x.shape[-1])
    shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)]
    return freqs_cis.view(*shape)


@torch.no_grad()
def apply_rotary_enc(
    xq: torch.Tensor,
    xk: torch.Tensor,
    freqs_cis: torch.Tensor,
    repeat_freqs_k: bool = False,
):
    # Force fp32 (https://github.com/huggingface/transformers/pull/29285)
    with torch.autocast(device_type=freqs_cis.device.type, enabled=False):
        xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
        xk_ = (torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) if xk.shape[-2] != 0 else None)
        freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
        xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
        if xk_ is None:
            # no keys to rotate, due to dropout
            return xq_out.type_as(xq).to(xq.device), xk
        # repeat freqs along seq_len dim to match k seq_len
        if repeat_freqs_k:
            r = xk_.shape[-2] // xq_.shape[-2]
            if freqs_cis.is_cuda:
                freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1)
            else:
                # torch.repeat on complex numbers may not be supported on non-CUDA devices
                # (freqs_cis has 4 dims and we repeat on dim 2) so we use expand + flatten
                freqs_cis = freqs_cis.unsqueeze(2).expand(-1, -1, r, -1, -1).flatten(2, 3)
        xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)

    return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device)
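
# Usage sketch (head count, head dim and grid size assumed): queries and keys of
# shape (B, n_heads, H * W, head_dim) are rotated with the axial frequencies above:
#   freqs_cis = compute_axial_cis(dim=64, end_x=8, end_y=8)  # (64, 32) complex
#   xq = torch.randn(1, 2, 64, 64)
#   xk = torch.randn(1, 2, 64, 64)
#   xq_rot, xk_rot = apply_rotary_enc(xq, xk, freqs_cis)  # same shapes as xq and xk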