# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Variable size position embedding utils for handling different image dimensions
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn.functional as F


def get_2d_sincos_pos_embed_variable(embed_dim, grid_h, grid_w, cls_token=False):
    """
    Create 2D sine-cosine position embeddings for variable grid sizes.

    Args:
        embed_dim: embedding dimension
        grid_h: height of the grid (number of patches in height)
        grid_w: width of the grid (number of patches in width)
        cls_token: whether to include a class token

    Returns:
        pos_embed: [grid_h*grid_w, embed_dim] or [1+grid_h*grid_w, embed_dim]
            (w/o or w/ cls_token)
    """
    grid_h_coords = np.arange(grid_h, dtype=np.float32)
    grid_w_coords = np.arange(grid_w, dtype=np.float32)
    grid = np.meshgrid(grid_w_coords, grid_h_coords)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h, grid_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of the dimensions to encode each grid axis
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    # np.float was removed in NumPy 1.24; use np.float64 instead
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
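

# --------------------------------------------------------
# Illustrative sanity check (added for exposition; not part of the original
# module). It uses only the helpers above; the 14x10 grid and 8-dim embedding
# are arbitrary example values.
# --------------------------------------------------------
def _example_sincos_shapes():
    # with a class token, one all-zero row is prepended
    pos = get_2d_sincos_pos_embed_variable(8, grid_h=14, grid_w=10, cls_token=True)
    assert pos.shape == (1 + 14 * 10, 8)
    assert np.allclose(pos[0], 0.0)

    # without a class token, the row count is exactly grid_h * grid_w
    pos = get_2d_sincos_pos_embed_variable(8, grid_h=14, grid_w=10, cls_token=False)
    assert pos.shape == (14 * 10, 8)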


def interpolate_pos_embed_variable(original_pos_embed, target_h, target_w, cls_token=True):
    """
    Interpolate position embeddings for arbitrary target sizes.

    Args:
        original_pos_embed: original positional embeddings [1, N, D]
        target_h: target height in patches
        target_w: target width in patches
        cls_token: whether the first token is a class token

    Returns:
        interpolated_pos_embed: [1, target_h*target_w + cls_token, D]
    """
    embed_dim = original_pos_embed.shape[-1]

    if cls_token:
        class_pos_embed = original_pos_embed[:, 0:1]  # [1, 1, D]
        patch_pos_embed = original_pos_embed[:, 1:]   # [1, N-1, D]
    else:
        class_pos_embed = None
        patch_pos_embed = original_pos_embed
    orig_num_patches = patch_pos_embed.shape[1]

    # Determine the original grid size (assume the original grid is square)
    orig_h = orig_w = int(np.sqrt(orig_num_patches))
    if orig_h * orig_w != orig_num_patches:
        raise ValueError(
            f"Original number of patches {orig_num_patches} is not a perfect square"
        )

    # Reshape to spatial dimensions
    patch_pos_embed = patch_pos_embed.reshape(1, orig_h, orig_w, embed_dim)
    patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # [1, D, orig_h, orig_w]

    # Interpolate to the target size
    patch_pos_embed = F.interpolate(
        patch_pos_embed,
        size=(target_h, target_w),
        mode='bicubic',
        align_corners=False,
    )

    # Reshape back to a token sequence
    patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1)  # [1, target_h, target_w, D]
    patch_pos_embed = patch_pos_embed.flatten(1, 2)        # [1, target_h*target_w, D]

    if cls_token:
        new_pos_embed = torch.cat([class_pos_embed, patch_pos_embed], dim=1)
    else:
        new_pos_embed = patch_pos_embed

    return new_pos_embed


def create_variable_pos_embed(embed_dim, height_patches, width_patches, cls_token=True):
    """
    Create positional embeddings for specific patch grid dimensions.

    Args:
        embed_dim: embedding dimension
        height_patches: number of patches in height
        width_patches: number of patches in width
        cls_token: whether to include a class token

    Returns:
        pos_embed: positional embeddings tensor of shape
            [1, height_patches*width_patches (+1 if cls_token), embed_dim]
    """
    pos_embed_np = get_2d_sincos_pos_embed_variable(
        embed_dim, height_patches, width_patches, cls_token=cls_token
    )
    pos_embed = torch.from_numpy(pos_embed_np).float().unsqueeze(0)
    return pos_embed
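

# --------------------------------------------------------
# Minimal usage sketch (added for exposition; not part of the original
# module). Builds sincos embeddings for a square 14x14 grid, then bicubically
# resizes them to a non-square 16x12 grid; all sizes are arbitrary examples
# (a ViT-B/16 pretrained at 224px happens to use a 14x14 patch grid).
# --------------------------------------------------------
if __name__ == "__main__":
    embed_dim = 768

    # fixed-size embeddings with a class token: 14*14 + 1 = 197 tokens
    pos_embed = create_variable_pos_embed(embed_dim, 14, 14, cls_token=True)
    print(pos_embed.shape)  # torch.Size([1, 197, 768])

    # resize the square grid to a 16x12 target: 16*12 + 1 = 193 tokens
    resized = interpolate_pos_embed_variable(pos_embed, target_h=16, target_w=12, cls_token=True)
    print(resized.shape)  # torch.Size([1, 193, 768])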