# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Variable size position embedding utils for handling different image dimensions
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn.functional as F
def get_2d_sincos_pos_embed_variable(embed_dim, grid_h, grid_w, cls_token=False):
    """
    Create 2D sine-cosine position embeddings for variable grid sizes

    Args:
        embed_dim: embedding dimension
        grid_h: height of the grid (number of patches in height)
        grid_w: width of the grid (number of patches in width)
        cls_token: whether to include class token

    Returns:
        pos_embed: [grid_h*grid_w, embed_dim] or [1+grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
    """
    grid_h_coords = np.arange(grid_h, dtype=np.float32)
    grid_w_coords = np.arange(grid_w, dtype=np.float32)
    grid = np.meshgrid(grid_w_coords, grid_h_coords)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h, grid_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed
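
# Illustrative example (added note, not from the original file): for a 224x320
# input with 16x16 patches, grid_h=14 and grid_w=20, so the function above
# returns an array of shape (280, embed_dim), or (281, embed_dim) when
# cls_token=True.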
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb
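
# Note (added for clarity, describing the code above rather than quoting the
# source): each half of embed_dim encodes positions along one grid axis; the
# two halves are concatenated so every token carries both coordinates.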
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)  # np.float was removed in NumPy >= 1.24
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
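
# Worked example (illustrative): with embed_dim = 8 the frequencies are
# 1/10000**(0/4), 1/10000**(1/4), 1/10000**(2/4), 1/10000**(3/4)
# = 1.0, 0.1, 0.01, 0.001, the standard transformer sinusoidal schedule;
# each position m is then encoded as [sin(m * omega), cos(m * omega)].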
def interpolate_pos_embed_variable(original_pos_embed, target_h, target_w, cls_token=True):
    """
    Interpolate position embeddings for arbitrary target sizes

    Args:
        original_pos_embed: original positional embeddings [1, N, D]
        target_h: target height in patches
        target_w: target width in patches
        cls_token: whether the first token is a class token

    Returns:
        interpolated_pos_embed: [1, target_h*target_w + cls_token, D]
    """
    embed_dim = original_pos_embed.shape[-1]

    if cls_token:
        class_pos_embed = original_pos_embed[:, 0:1]  # [1, 1, D]
        patch_pos_embed = original_pos_embed[:, 1:]  # [1, N-1, D]
        orig_num_patches = patch_pos_embed.shape[1]
    else:
        class_pos_embed = None
        patch_pos_embed = original_pos_embed
        orig_num_patches = patch_pos_embed.shape[1]

    # Determine original grid size (assume square for original)
    orig_h = orig_w = int(np.sqrt(orig_num_patches))
    if orig_h * orig_w != orig_num_patches:
        raise ValueError(f"Original number of patches {orig_num_patches} is not a perfect square")

    # Reshape to spatial dimensions
    patch_pos_embed = patch_pos_embed.reshape(1, orig_h, orig_w, embed_dim)
    patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # [1, D, orig_h, orig_w]

    # Interpolate to target size
    patch_pos_embed = F.interpolate(
        patch_pos_embed,
        size=(target_h, target_w),
        mode='bicubic',
        align_corners=False
    )

    # Reshape back to token sequence
    patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1)  # [1, target_h, target_w, D]
    patch_pos_embed = patch_pos_embed.flatten(1, 2)  # [1, target_h*target_w, D]

    if cls_token:
        new_pos_embed = torch.cat([class_pos_embed, patch_pos_embed], dim=1)
    else:
        new_pos_embed = patch_pos_embed

    return new_pos_embed
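
# Hypothetical usage sketch (names like `model.pos_embed` are assumptions, not
# from this file): adapting a ViT checkpoint trained on 224x224 inputs
# (14x14 patches + cls token) to a 192x256 input (12x16 patches):
#
#   new_pos = interpolate_pos_embed_variable(model.pos_embed, 12, 16, cls_token=True)
#   # new_pos.shape == torch.Size([1, 1 + 12*16, embed_dim])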
def create_variable_pos_embed(embed_dim, height_patches, width_patches, cls_token=True):
    """
    Create positional embeddings for specific patch grid dimensions

    Args:
        embed_dim: embedding dimension
        height_patches: number of patches in height
        width_patches: number of patches in width
        cls_token: whether to include class token

    Returns:
        pos_embed: positional embeddings tensor
    """
    pos_embed_np = get_2d_sincos_pos_embed_variable(
        embed_dim, height_patches, width_patches, cls_token=cls_token
    )
    pos_embed = torch.from_numpy(pos_embed_np).float().unsqueeze(0)
    return pos_embed
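

if __name__ == "__main__":
    # Minimal self-check, added as an illustrative sketch rather than part of the
    # original module: build a square sin-cos table, then interpolate it to a
    # non-square patch grid.
    embed_dim = 768

    # 14x14 grid with a class token, e.g. a 224x224 image with 16x16 patches.
    pos_embed = create_variable_pos_embed(embed_dim, 14, 14, cls_token=True)
    print(pos_embed.shape)  # torch.Size([1, 197, 768])

    # Resize to a 12x16 patch grid, e.g. a 192x256 input.
    resized = interpolate_pos_embed_variable(pos_embed, 12, 16, cls_token=True)
    print(resized.shape)  # torch.Size([1, 193, 768])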