init commit

03ae676 10 months ago

14.3 kB

	from collections import OrderedDict
	import math
	from typing import Callable, List, Optional, Sequence, Tuple, Union

	import torch
	from torch import nn
	from torch.nn import functional as F

	from einops import pack, repeat

	from .flex_attn import Flex_Attention



	class LayerNormFp32(nn.LayerNorm):
	    """LayerNorm that always normalizes in float32.

	    The input is upcast to float32 before normalization and the result is
	    cast back to the input's original dtype (e.g. for fp16 activations).
	    """

	    def forward(self, x: torch.Tensor):
	        input_dtype = x.dtype
	        x_fp32 = x.to(torch.float32)
	        normed = F.layer_norm(
	            x_fp32,
	            self.normalized_shape,
	            self.weight,
	            self.bias,
	            self.eps,
	        )
	        return normed.to(input_dtype)


	class LayerNorm(nn.LayerNorm):
	    """torch LayerNorm whose result is explicitly cast to the input's dtype."""

	    def forward(self, x: torch.Tensor):
	        input_dtype = x.dtype
	        normed = F.layer_norm(
	            x,
	            self.normalized_shape,
	            self.weight,
	            self.bias,
	            self.eps,
	        )
	        return normed.to(input_dtype)


	class QuickGELU(nn.Module):
	    """Activation computing ``x * sigmoid(1.702 * x)`` element-wise."""

	    # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
	    def forward(self, x: torch.Tensor):
	        gate = torch.sigmoid(1.702 * x)
	        return x * gate


	class LayerScale(nn.Module):
	    """Scale the input by a learnable vector ``gamma``.

	    ``gamma`` has shape ``dim`` and starts at ``init_values`` everywhere.
	    With ``inplace=True`` the input tensor itself is multiplied and returned.
	    """

	    def __init__(self, dim, init_values=1e-5, inplace=False):
	        super().__init__()
	        self.inplace = inplace
	        initial_gamma = init_values * torch.ones(dim)
	        self.gamma = nn.Parameter(initial_gamma)

	    def forward(self, x):
	        if self.inplace:
	            return x.mul_(self.gamma)
	        return x * self.gamma


	class PatchDropout(nn.Module):
	    """
	    Randomly keep a subset of tokens per sample while training.

	    https://arxiv.org/abs/2212.00794

	    Args:
	        prob: fraction of tokens to drop, in ``[0, 1)``.
	        exclude_first_token: if True, the first token along dim 1 is never
	            dropped and is re-attached in front of the kept tokens.
	    """

	    def __init__(self, prob, exclude_first_token=True):
	        super().__init__()
	        assert 0 <= prob < 1.
	        self.prob = prob
	        self.exclude_first_token = exclude_first_token  # exclude CLS token

	    def forward(self, x):
	        """Return ``x`` with tokens dropped along dim 1.

	        In eval mode, or when ``prob == 0``, ``x`` is returned unchanged.
	        Otherwise ``max(1, int(num_tokens * (1 - prob)))`` tokens are kept per
	        sample, chosen independently for each sample. Kept tokens come back in
	        the order produced by ``topk`` on random scores, not in input order.
	        """
	        if not self.training or self.prob == 0.:
	            return x

	        if self.exclude_first_token:
	            cls_tokens, patches = x[:, :1], x[:, 1:]
	        else:
	            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
	            patches = x

	        batch = patches.size()[0]
	        num_tokens = patches.size()[1]

	        # Nothing droppable (e.g. only the excluded first token is present):
	        # topk(1) on an empty dimension would raise, so pass the input through.
	        if num_tokens == 0:
	            return x

	        keep_prob = 1 - self.prob
	        num_patches_keep = max(1, int(num_tokens * keep_prob))

	        # Build the index tensors on the input's device so that indexing does
	        # not need a host-to-device transfer on every forward pass.
	        batch_indices = torch.arange(batch, device=x.device)[..., None]
	        rand = torch.randn(batch, num_tokens, device=x.device)
	        patch_indices_keep = rand.topk(num_patches_keep, dim=-1).indices

	        patches = patches[batch_indices, patch_indices_keep]

	        if self.exclude_first_token:
	            patches = torch.cat((cls_tokens, patches), dim=1)

	        return patches


	class Attention(nn.Module):
	    """Multi-head self-attention with optional scaled-cosine logits.

	    Args:
	        dim: embedding width; must be divisible by ``num_heads``.
	        num_heads: number of attention heads.
	        qkv_bias: add a bias to the packed q/k/v projection.
	        scaled_cosine: if True, attention logits are cosine similarities of
	            q and k multiplied by a learned, clamped per-head scale; if False,
	            plain dot-product logits scaled by ``head_dim ** -0.5`` are used.
	        scale_heads: if True, each head's output is multiplied by a learned
	            per-head scalar before the output projection.
	        logit_scale_max: upper clamp (in log space) for the cosine logit scale.
	        batch_first: if True inputs are ``(N, L, C)``, otherwise ``(L, N, C)``.
	        attn_drop: dropout probability on the attention weights.
	        proj_drop: dropout probability after the output projection.
	    """

	    def __init__(
	        self,
	        dim: int,
	        num_heads: int = 8,
	        qkv_bias: bool = True,
	        scaled_cosine: bool = True,
	        scale_heads: bool = False,
	        logit_scale_max: float = math.log(1. / 0.01),
	        batch_first: bool = True,
	        attn_drop: float = 0.,
	        proj_drop: float = 0.
	    ):
	        super().__init__()
	        self.scaled_cosine = scaled_cosine
	        self.scale_heads = scale_heads
	        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
	        self.num_heads = num_heads
	        self.head_dim = dim // num_heads
	        self.scale = self.head_dim ** -0.5
	        self.logit_scale_max = logit_scale_max
	        self.batch_first = batch_first
	        # Recorded for callers; the forward pass below does not consult it.
	        self.use_fsdpa = hasattr(nn.functional, 'scaled_dot_product_attention')

	        # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
	        self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
	        if qkv_bias:
	            self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
	        else:
	            self.in_proj_bias = None

	        if self.scaled_cosine:
	            self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
	        else:
	            self.logit_scale = None
	        self.attn_drop = nn.Dropout(attn_drop)
	        if self.scale_heads:
	            self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
	        else:
	            self.head_scale = None
	        self.out_proj = nn.Linear(dim, dim)
	        self.out_drop = nn.Dropout(proj_drop)

	    def forward(self, x, coords, attn_mask: Optional[torch.Tensor] = None):
	        """Apply self-attention to ``x``.

	        Args:
	            x: ``(N, L, C)`` if ``batch_first`` else ``(L, N, C)``.
	            coords: accepted for interface compatibility; not used here.
	            attn_mask: optional per-key mask of shape ``(N, L)``. A boolean
	                mask marks keys to ignore with ``True``; any other dtype is
	                added to the attention logits as-is.

	        Returns:
	            Tensor with the same layout and shape as ``x``.
	        """
	        if self.batch_first:
	            x = x.transpose(0, 1)

	        L, N, C = x.shape
	        q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
	        # (L, N, C) -> (N * heads, L, head_dim)
	        q = q.reshape(L, N * self.num_heads, -1).transpose(0, 1)
	        k = k.reshape(L, N * self.num_heads, -1).transpose(0, 1)
	        v = v.reshape(L, N * self.num_heads, -1).transpose(0, 1)

	        if attn_mask is not None and attn_mask.dtype == torch.bool:
	            # Convert the boolean mask into an additive one (-inf where True).
	            new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
	            new_attn_mask.masked_fill_(attn_mask, float("-inf"))
	            attn_mask = new_attn_mask

	        if self.logit_scale is not None:
	            attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
	            logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
	            attn = attn.view(N, self.num_heads, L, L) * logit_scale
	        else:
	            # scaled_cosine=False: standard scaled dot-product logits.
	            attn = torch.bmm(q * self.scale, k.transpose(-1, -2))
	            attn = attn.view(N, self.num_heads, L, L)

	        if attn_mask is not None:
	            # Broadcast the (N, L) key mask over heads and query positions.
	            attn = attn + attn_mask[:, None, None, :]
	        attn = attn.view(-1, L, L)
	        attn = attn.softmax(dim=-1)
	        attn = self.attn_drop(attn)

	        x = torch.bmm(attn, v)

	        if self.head_scale is not None:
	            # Per-head output is head_dim wide (not C), so view with head_dim.
	            x = x.view(N, self.num_heads, L, self.head_dim) * self.head_scale
	            x = x.view(-1, L, self.head_dim)

	        # (N * heads, L, head_dim) -> (L, N, C)
	        x = x.transpose(0, 1).reshape(L, N, C)

	        if self.batch_first:
	            x = x.transpose(0, 1)

	        x = self.out_proj(x)
	        x = self.out_drop(x)
	        return x


	class AttentionalPooler(nn.Module):
	    """Cross-attend a set of learned queries to an input sequence.

	    ``n_queries`` learned query vectors of width ``d_model`` attend (via
	    ``nn.MultiheadAttention``) to the layer-normed input of width
	    ``context_dim``; the attention output for the queries is returned.
	    """

	    def __init__(
	        self,
	        d_model: int,
	        context_dim: int,
	        n_head: int = 8,
	        n_queries: int = 256,
	        norm_layer: Callable = LayerNorm,
	    ):
	        super().__init__()
	        self.query = nn.Parameter(torch.randn(n_queries, d_model))
	        self.attn = nn.MultiheadAttention(
	            d_model,
	            n_head,
	            kdim=context_dim,
	            vdim=context_dim,
	            batch_first=True,
	        )
	        self.ln_q = norm_layer(d_model)
	        self.ln_k = norm_layer(context_dim)

	    def forward(self, x: torch.Tensor):
	        batch_size = x.shape[0]
	        context = self.ln_k(x)
	        # Share the same learned queries across the batch (expand, no copy).
	        queries = self.ln_q(self.query)
	        queries = queries.unsqueeze(0).expand(batch_size, -1, -1)
	        pooled, _ = self.attn(queries, context, context, need_weights=False)
	        return pooled


	class ResidualAttentionBlock(nn.Module):
	    """Pre-norm transformer block: attention and MLP, each with a residual add.

	    Args:
	        d_model: embedding width.
	        n_head: number of attention heads.
	        mlp_ratio: hidden width of the MLP as a multiple of ``d_model``.
	        ls_init_value: if not None, LayerScale with this initial value is
	            applied to both residual branches; otherwise no scaling is used.
	        act_layer: factory for the MLP activation.
	        norm_layer: factory for the normalization layers.
	        is_cross_attention: if True, an extra norm ``ln_1_kv`` is created and
	            applied to ``k_x`` / ``v_x`` in ``forward``.
	        batch_first: forwarded to ``Attention`` (non-flex path only).
	        use_flex: use ``Flex_Attention`` instead of ``Attention``.
	        dropout: projection dropout for either attention type; also used as
	            attention dropout on the non-flex path.
	        use_rel_bias: forwarded to ``Flex_Attention`` (flex path only).
	    """

	    def __init__(
	        self,
	        d_model: int,
	        n_head: int,
	        mlp_ratio: float = 4.0,
	        ls_init_value: Optional[float] = None,
	        act_layer: Callable = nn.GELU,
	        norm_layer: Callable = LayerNorm,
	        is_cross_attention: bool = False,
	        batch_first: bool = True,
	        use_flex: bool = False,
	        dropout: float = 0.2,
	        use_rel_bias: bool = True,
	    ):
	        super().__init__()

	        self.ln_1 = norm_layer(d_model)

	        if use_flex:
	            print("Flex_Attention!")
	            self.attn = Flex_Attention(dim=d_model, num_heads=n_head, proj_drop=dropout, use_rel_bias=use_rel_bias)
	        else:
	            self.attn = Attention(dim=d_model, num_heads=n_head, batch_first=batch_first, proj_drop=dropout, attn_drop=dropout)

	        self.ls_1 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()
	        if is_cross_attention:
	            self.ln_1_kv = norm_layer(d_model)

	        self.ln_2 = norm_layer(d_model)
	        mlp_width = int(d_model * mlp_ratio)

	        self.mlp = nn.Sequential(OrderedDict([
	            ("c_fc", nn.Linear(d_model, mlp_width)),
	            ("gelu", act_layer()),
	            ("c_proj", nn.Linear(mlp_width, d_model))
	        ]))
	        self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity()

	    def attention(
	        self,
	        q_x: torch.Tensor,
	        k_x: Optional[torch.Tensor] = None,
	        v_x: Optional[torch.Tensor] = None,
	        coords=None,
	        attn_mask: Optional[torch.Tensor] = None,
	        key_padding_mask=None,
	    ):
	        """Run the attention module on ``q_x``.

	        Only ``q_x``, ``coords`` and ``key_padding_mask`` reach the attention
	        module; ``key_padding_mask`` is passed as its ``attn_mask`` argument.
	        ``k_x``, ``v_x`` and ``attn_mask`` are accepted for signature
	        compatibility but are not forwarded.
	        """
	        return self.attn(
	            q_x, coords=coords, attn_mask=key_padding_mask
	        )

	    def forward(
	        self,
	        q_x: torch.Tensor,
	        k_x: Optional[torch.Tensor] = None,
	        v_x: Optional[torch.Tensor] = None,
	        coords=None,
	        attn_mask: Optional[torch.Tensor] = None,
	        key_padding_mask=None,
	    ):
	        """Return ``q_x`` after the attention and MLP residual branches."""
	        # k_x / v_x are normalized only when the block was built with
	        # is_cross_attention=True (which creates ln_1_kv); otherwise dropped.
	        k_x = self.ln_1_kv(k_x) if hasattr(self, "ln_1_kv") and k_x is not None else None
	        v_x = self.ln_1_kv(v_x) if hasattr(self, "ln_1_kv") and v_x is not None else None
	        x = q_x + self.ls_1(self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, coords=coords, attn_mask=attn_mask, key_padding_mask=key_padding_mask))
	        x = x + self.ls_2(self.mlp(self.ln_2(x)))
	        return x


	def _expand_token(token, batch_size: int):
	    """Reshape ``token`` to ``(1, 1, -1)`` and expand it to ``batch_size`` rows."""
	    as_single_row = token.view(1, 1, -1)
	    return as_single_row.expand(batch_size, -1, -1)


	class Transformer(nn.Module):
	    """Stack of ``layers`` identical ``ResidualAttentionBlock`` modules.

	    Args:
	        width: embedding width of every block.
	        layers: number of blocks.
	        heads: attention heads per block.
	        mlp_ratio: MLP hidden width as a multiple of ``width``.
	        ls_init_value: LayerScale init value, or None to disable LayerScale.
	        act_layer: factory for the MLP activation.
	        norm_layer: factory for the normalization layers.
	        batch_first: if False, ``forward`` swaps dims 0 and 1 of the input
	            before the blocks and swaps them back afterwards.
	        use_flex: forwarded to every block.
	        dropout: dropout probability forwarded to every block.
	        use_rel_bias: forwarded to every block.
	    """

	    def __init__(
	        self,
	        width: int,
	        layers: int,
	        heads: int,
	        mlp_ratio: float = 4.0,
	        ls_init_value: Optional[float] = None,
	        act_layer: Callable = nn.GELU,
	        norm_layer: Callable = LayerNorm,
	        batch_first: bool = True,
	        use_flex: bool = False,
	        dropout: float = 0.0,  # was `False`; same numeric value, correct type
	        use_rel_bias: bool = True,
	    ):
	        super().__init__()
	        self.width = width
	        self.layers = layers
	        self.batch_first = batch_first
	        self.grad_checkpointing = False

	        self.resblocks = nn.ModuleList([
	            ResidualAttentionBlock(
	                width,
	                heads,
	                mlp_ratio,
	                ls_init_value=ls_init_value,
	                act_layer=act_layer,
	                norm_layer=norm_layer,
	                batch_first=batch_first,
	                use_flex=use_flex,
	                dropout=dropout,
	                use_rel_bias=use_rel_bias
	            )
	            for _ in range(layers)
	        ])

	    def get_cast_dtype(self) -> torch.dtype:
	        """Return the dtype of the first block's ``mlp.c_fc`` layer.

	        If that layer carries an ``int8_original_dtype`` attribute, it takes
	        precedence over the weight's current dtype.
	        """
	        first_fc = self.resblocks[0].mlp.c_fc
	        if hasattr(first_fc, 'int8_original_dtype'):
	            return first_fc.int8_original_dtype
	        return first_fc.weight.dtype

	    def forward(self, x: torch.Tensor, coords=None, attn_mask: Optional[torch.Tensor] = None, key_padding_mask=None):
	        """Run ``x`` through every block in order and return the result.

	        ``coords``, ``attn_mask`` and ``key_padding_mask`` are handed unchanged
	        to each block.
	        """
	        if not self.batch_first:
	            x = x.transpose(0, 1).contiguous()  # NLD -> LND
	        for r in self.resblocks:
	            x = r(x, attn_mask=attn_mask, key_padding_mask=key_padding_mask, coords=coords)
	        if not self.batch_first:
	            x = x.transpose(0, 1).contiguous()  # LND -> NLD
	        return x



	class VisionTransformer(nn.Module):
	    """Transformer encoder over pre-extracted feature tokens with global pooling.

	    The input is linearly projected from ``input_dim`` to ``width``, optionally
	    extended with learned register tokens, passed through patch dropout, a
	    pre-norm, the transformer stack and a post-norm, and then pooled.

	    Args:
	        width: transformer embedding width.
	        layers: number of transformer blocks.
	        heads: attention heads per block.
	        mlp_ratio: MLP hidden width as a multiple of ``width``.
	        ls_init_value: LayerScale init value, or None to disable LayerScale.
	        output_dim: stored on the instance; not otherwise used in this class.
	        patch_dropout: token drop probability; ``0.`` disables patch dropout.
	        no_ln_pre: replace the pre-transformer norm with identity.
	        pool_type: ``'tok'`` (first token), ``'avg'`` (mean of tokens after
	            the first) or ``'none'`` (no pooling).
	        final_ln_after_pool: apply ``ln_post`` after pooling instead of before.
	        act_layer: factory for the MLP activation.
	        norm_layer: factory for the normalization layers.
	        output_tokens: stored on the instance; ``forward`` returns only the
	            pooled output regardless.
	        img_embed: stored on the instance; not otherwise used in this class.
	        use_flex: forwarded to the transformer.
	        dropout: dropout probability forwarded to the transformer.
	        num_registers: number of learned register tokens appended to the
	            sequence (``0`` disables them).
	        use_rel_bias: forwarded to the transformer.
	        input_dim: feature width of the incoming tokens (default 768).
	    """

	    def __init__(
	        self,
	        width: int,
	        layers: int,
	        heads: int,
	        mlp_ratio: float,
	        ls_init_value: Optional[float] = None,
	        output_dim: int = 512,
	        patch_dropout: float = 0.,
	        no_ln_pre: bool = False,
	        pool_type: str = 'tok',
	        final_ln_after_pool: bool = False,
	        act_layer: Callable = nn.GELU,
	        norm_layer: Callable = LayerNorm,
	        output_tokens: bool = False,
	        img_embed: bool = False,
	        use_flex: bool = False,
	        dropout: float = 0.1,
	        num_registers: int = 0,
	        use_rel_bias: bool = True,
	        input_dim: int = 768,
	    ):
	        super().__init__()
	        assert pool_type in ('tok', 'avg', 'none')
	        self.output_tokens = output_tokens

	        self.final_ln_after_pool = final_ln_after_pool  # currently ignored w/ attn pool enabled
	        self.output_dim = output_dim
	        self.img_embed = img_embed
	        self.num_registers = num_registers
	        # No positional embedding is created by this class.
	        self.positional_embedding = None
	        self.pre_linear = nn.Linear(input_dim, width)

	        if num_registers > 0:
	            self.register_token = nn.Parameter(torch.empty(num_registers, width))
	            nn.init.normal_(self.register_token, std=0.02)

	        # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn
	        self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. else nn.Identity()

	        self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width)
	        self.transformer = Transformer(
	            width,
	            layers,
	            heads,
	            mlp_ratio,
	            ls_init_value=ls_init_value,
	            act_layer=act_layer,
	            norm_layer=norm_layer,
	            use_flex=use_flex,
	            dropout=dropout,
	            use_rel_bias=use_rel_bias,
	        )

	        pool_dim = width
	        self.pool_type = pool_type

	        self.ln_post = norm_layer(pool_dim)

	    def _global_pool(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Split ``x`` into ``(pooled, tokens)`` according to ``pool_type``.

	        ``'avg'``: mean over tokens after the first, plus those tokens.
	        ``'tok'``: the first token, plus the remaining tokens.
	        otherwise: ``x`` is returned for both.
	        """
	        if self.pool_type == 'avg':
	            pooled, tokens = x[:, 1:].mean(dim=1), x[:, 1:]
	        elif self.pool_type == 'tok':
	            pooled, tokens = x[:, 0], x[:, 1:]
	        else:
	            pooled = tokens = x

	        return pooled, tokens

	    def forward(self, x: torch.Tensor, coords=None, mask=None, key_padding_mask=None):
	        """Encode ``x`` and return the pooled representation.

	        Args:
	            x: tokens with batch on dim 0 and ``input_dim`` features on the
	                last dim.
	            coords: passed through to the transformer.
	            mask: passed to the transformer as ``attn_mask``.
	            key_padding_mask: passed through to the transformer.
	        """
	        x = self.pre_linear(x)

	        if self.num_registers > 0:
	            # Append one copy of the learned register tokens per sample.
	            # NOTE(review): coords / key_padding_mask are forwarded unchanged,
	            # so they do not cover the appended registers - confirm the
	            # attention implementation accounts for the extra tokens.
	            r = repeat(self.register_token, 'n d -> b n d', b=x.size(0))
	            x, _ = pack([x, r], 'b * d')

	        x = self.patch_dropout(x)
	        x = self.ln_pre(x)
	        x = self.transformer(x, coords, mask, key_padding_mask=key_padding_mask)

	        if self.final_ln_after_pool:
	            pooled, _ = self._global_pool(x)
	            pooled = self.ln_post(pooled)
	        else:
	            x = self.ln_post(x)
	            pooled, _ = self._global_pool(x)

	        return pooled