# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# Variable size position embedding utils for handling different image dimensions
# --------------------------------------------------------

import numpy as np
import torch
import torch.nn.functional as F
def get_2d_sincos_pos_embed_variable(embed_dim, grid_h, grid_w, cls_token=False):
    """
    Create 2D sine-cosine position embeddings for variable grid sizes

    Args:
        embed_dim: embedding dimension
        grid_h: height of the grid (number of patches in height)
        grid_w: width of the grid (number of patches in width)
        cls_token: whether to include class token

    Returns:
        pos_embed: [grid_h*grid_w, embed_dim] or [1+grid_h*grid_w, embed_dim] (w/ or w/o cls_token)
    """
    grid_h_coords = np.arange(grid_h, dtype=np.float32)
    grid_w_coords = np.arange(grid_w, dtype=np.float32)
    grid = np.meshgrid(grid_w_coords, grid_h_coords)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_h, grid_w])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed
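
# Illustrative example (added note, not from the original file): for a 224x320
# input with 16x16 patches, grid_h=14 and grid_w=20, so the function above
# returns an array of shape (280, embed_dim), or (281, embed_dim) when
# cls_token=True.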
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    assert embed_dim % 2 == 0

    # use half of dimensions to encode grid_h
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb
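
# Note (added for clarity, describing the code above rather than quoting the
# source): each half of embed_dim encodes positions along one grid axis; the
# two halves are concatenated so every token carries both coordinates.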
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float64)  # np.float was removed in NumPy >= 1.24
    omega /= embed_dim / 2.
    omega = 1. / 10000**omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum('m,d->md', pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
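
# Worked example (illustrative): with embed_dim = 8 the frequencies are
# 1/10000**(0/4), 1/10000**(1/4), 1/10000**(2/4), 1/10000**(3/4)
# = 1.0, 0.1, 0.01, 0.001, the standard transformer sinusoidal schedule;
# each position m is then encoded as [sin(m * omega), cos(m * omega)].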
def interpolate_pos_embed_variable(original_pos_embed, target_h, target_w, cls_token=True):
    """
    Interpolate position embeddings for arbitrary target sizes

    Args:
        original_pos_embed: original positional embeddings [1, N, D]
        target_h: target height in patches
        target_w: target width in patches
        cls_token: whether the first token is a class token

    Returns:
        interpolated_pos_embed: [1, target_h*target_w + cls_token, D]
    """
    embed_dim = original_pos_embed.shape[-1]

    if cls_token:
        class_pos_embed = original_pos_embed[:, 0:1]  # [1, 1, D]
        patch_pos_embed = original_pos_embed[:, 1:]  # [1, N-1, D]
        orig_num_patches = patch_pos_embed.shape[1]
    else:
        class_pos_embed = None
        patch_pos_embed = original_pos_embed
        orig_num_patches = patch_pos_embed.shape[1]

    # Determine original grid size (assume square for original)
    orig_h = orig_w = int(np.sqrt(orig_num_patches))
    if orig_h * orig_w != orig_num_patches:
        raise ValueError(f"Original number of patches {orig_num_patches} is not a perfect square")

    # Reshape to spatial dimensions
    patch_pos_embed = patch_pos_embed.reshape(1, orig_h, orig_w, embed_dim)
    patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)  # [1, D, orig_h, orig_w]

    # Interpolate to target size
    patch_pos_embed = F.interpolate(
        patch_pos_embed,
        size=(target_h, target_w),
        mode='bicubic',
        align_corners=False
    )

    # Reshape back to token sequence
    patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1)  # [1, target_h, target_w, D]
    patch_pos_embed = patch_pos_embed.flatten(1, 2)  # [1, target_h*target_w, D]

    if cls_token:
        new_pos_embed = torch.cat([class_pos_embed, patch_pos_embed], dim=1)
    else:
        new_pos_embed = patch_pos_embed

    return new_pos_embed
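
# Hypothetical usage sketch (names like `model.pos_embed` are assumptions, not
# from this file): adapting a ViT checkpoint trained on 224x224 inputs
# (14x14 patches + cls token) to a 192x256 input (12x16 patches):
#
#   new_pos = interpolate_pos_embed_variable(model.pos_embed, 12, 16, cls_token=True)
#   # new_pos.shape == torch.Size([1, 1 + 12*16, embed_dim])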
def create_variable_pos_embed(embed_dim, height_patches, width_patches, cls_token=True):
    """
    Create positional embeddings for specific patch grid dimensions

    Args:
        embed_dim: embedding dimension
        height_patches: number of patches in height
        width_patches: number of patches in width
        cls_token: whether to include class token

    Returns:
        pos_embed: positional embeddings tensor
    """
    pos_embed_np = get_2d_sincos_pos_embed_variable(
        embed_dim, height_patches, width_patches, cls_token=cls_token
    )
    pos_embed = torch.from_numpy(pos_embed_np).float().unsqueeze(0)
    return pos_embed
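

if __name__ == "__main__":
    # Minimal self-check, added as an illustrative sketch rather than part of the
    # original module: build a square sin-cos table, then interpolate it to a
    # non-square patch grid.
    embed_dim = 768

    # 14x14 grid with a class token, e.g. a 224x224 image with 16x16 patches.
    pos_embed = create_variable_pos_embed(embed_dim, 14, 14, cls_token=True)
    print(pos_embed.shape)  # torch.Size([1, 197, 768])

    # Resize to a 12x16 patch grid, e.g. a 192x256 input.
    resized = interpolate_pos_embed_variable(pos_embed, 12, 16, cls_token=True)
    print(resized.shape)  # torch.Size([1, 193, 768])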