Spaces:

genzoo-org
/

genzoo

Running

App Files Files Community

genzoo / src /models /components /pose_transformer.py

x-tomasz

initial commit

d8b3f49 over 1 year ago

raw

history blame contribute delete

7.51 kB

	from inspect import isfunction
	from typing import Callable, Optional

	import torch
	from einops import rearrange
	from torch import nn


	def exists(val):
	return val is not None


	def default(val, d):
	if exists(val):
	return val
	return d() if isfunction(d) else d


	def normalization_layer(norm: Optional[str], dim: int, norm_cond_dim: int = -1):
	if norm == "batch":
	return torch.nn.BatchNorm1d(dim)
	elif norm == "layer":
	return torch.nn.LayerNorm(dim)
	elif norm is None:
	return torch.nn.Identity()
	else:
	raise ValueError(f"Unknown norm: {norm}")


	class PreNorm(nn.Module):
	def __init__(self, dim: int, fn: Callable, norm: str = "layer", norm_cond_dim: int = -1):
	super().__init__()
	self.norm = normalization_layer(norm, dim, norm_cond_dim)
	self.fn = fn

	def forward(self, x: torch.Tensor, args, *kwargs):
	return self.fn(self.norm(x), **kwargs)


	class FeedForward(nn.Module):
	def __init__(self, dim, hidden_dim, dropout=0.0):
	super().__init__()
	self.net = nn.Sequential(
	nn.Linear(dim, hidden_dim),
	nn.GELU(),
	nn.Dropout(dropout),
	nn.Linear(hidden_dim, dim),
	nn.Dropout(dropout),
	)

	def forward(self, x):
	return self.net(x)


	class Attention(nn.Module):
	def __init__(self, dim, heads=8, dim_head=64, dropout=0.0):
	super().__init__()
	inner_dim = dim_head * heads
	project_out = not (heads == 1 and dim_head == dim)

	self.heads = heads
	self.scale = dim_head**-0.5

	self.attend = nn.Softmax(dim=-1)
	self.dropout = nn.Dropout(dropout)

	self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

	self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout)) if project_out else nn.Identity()

	def forward(self, x):
	qkv = self.to_qkv(x).chunk(3, dim=-1)
	q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), qkv)

	dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

	attn = self.attend(dots)
	attn = self.dropout(attn)

	out = torch.matmul(attn, v)
	out = rearrange(out, "b h n d -> b n (h d)")
	return self.to_out(out)


	class CrossAttention(nn.Module):
	def __init__(self, dim, context_dim=None, heads=8, dim_head=64, dropout=0.0):
	super().__init__()
	inner_dim = dim_head * heads
	project_out = not (heads == 1 and dim_head == dim)

	self.heads = heads
	self.scale = dim_head**-0.5

	self.attend = nn.Softmax(dim=-1)
	self.dropout = nn.Dropout(dropout)

	context_dim = default(context_dim, dim)
	self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=False)
	self.to_q = nn.Linear(dim, inner_dim, bias=False)

	self.to_out = nn.Sequential(nn.Linear(inner_dim, dim), nn.Dropout(dropout)) if project_out else nn.Identity()

	def forward(self, x, context=None):
	context = default(context, x)
	k, v = self.to_kv(context).chunk(2, dim=-1)
	q = self.to_q(x)
	q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.heads), [q, k, v])

	dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

	attn = self.attend(dots)
	attn = self.dropout(attn)

	out = torch.matmul(attn, v)
	out = rearrange(out, "b h n d -> b n (h d)")
	return self.to_out(out)


	class TransformerCrossAttn(nn.Module):
	def __init__(
	self,
	dim: int,
	depth: int,
	heads: int,
	dim_head: int,
	mlp_dim: int,
	dropout: float = 0.0,
	norm: str = "layer",
	norm_cond_dim: int = -1,
	context_dim: Optional[int] = None,
	):
	super().__init__()
	self.layers = nn.ModuleList([])
	for _ in range(depth):
	sa = Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout)
	ca = CrossAttention(
	dim,
	context_dim=context_dim,
	heads=heads,
	dim_head=dim_head,
	dropout=dropout,
	)
	ff = FeedForward(dim, mlp_dim, dropout=dropout)
	self.layers.append(
	nn.ModuleList(
	[
	PreNorm(dim, sa, norm=norm, norm_cond_dim=norm_cond_dim),
	PreNorm(dim, ca, norm=norm, norm_cond_dim=norm_cond_dim),
	PreNorm(dim, ff, norm=norm, norm_cond_dim=norm_cond_dim),
	]
	)
	)

	def forward(self, x: torch.Tensor, *args, context=None, context_list=None):
	if context_list is None:
	context_list = [context] * len(self.layers)
	if len(context_list) != len(self.layers):
	raise ValueError(f"len(context_list) != len(self.layers) ({len(context_list)} != {len(self.layers)})")

	for i, (self_attn, cross_attn, ff) in enumerate(self.layers):
	x = self_attn(x, *args) + x
	x = cross_attn(x, *args, context=context_list[i]) + x
	x = ff(x, *args) + x
	return x


	class DropTokenDropout(nn.Module):
	def __init__(self, p: float = 0.1):
	super().__init__()
	if p < 0 or p > 1:
	raise ValueError("dropout probability has to be between 0 and 1, " "but got {}".format(p))
	self.p = p

	def forward(self, x: torch.Tensor):
	# x: (batch_size, seq_len, dim)
	if self.training and self.p > 0:
	zero_mask = torch.full_like(x[0, :, 0], self.p).bernoulli().bool()
	# TODO: permutation idx for each batch using torch.argsort
	if zero_mask.any():
	x = x[:, ~zero_mask, :]
	return x


	class TransformerDecoder(nn.Module):
	def __init__(
	self,
	num_tokens: int,
	token_dim: int,
	dim: int,
	depth: int,
	heads: int,
	mlp_dim: int,
	dim_head: int = 64,
	dropout: float = 0.0,
	emb_dropout: float = 0.0,
	emb_dropout_type: str = "drop",
	norm: str = "layer",
	norm_cond_dim: int = -1,
	context_dim: Optional[int] = None,
	skip_token_embedding: bool = False,
	):
	super().__init__()
	if not skip_token_embedding:
	self.to_token_embedding = nn.Linear(token_dim, dim)
	else:
	self.to_token_embedding = nn.Identity()
	if token_dim != dim:
	raise ValueError(f"token_dim ({token_dim}) != dim ({dim}) when skip_token_embedding is True")

	self.pos_embedding = nn.Parameter(torch.randn(1, num_tokens, dim))
	if emb_dropout_type == "drop":
	self.dropout = DropTokenDropout(emb_dropout)
	elif emb_dropout_type == "normal":
	self.dropout = nn.Dropout(emb_dropout)

	self.transformer = TransformerCrossAttn(
	dim,
	depth,
	heads,
	dim_head,
	mlp_dim,
	dropout,
	norm=norm,
	norm_cond_dim=norm_cond_dim,
	context_dim=context_dim,
	)

	def forward(self, inp: torch.Tensor, *args, context=None, context_list=None):
	x = self.to_token_embedding(inp)
	b, n, _ = x.shape

	x = self.dropout(x)
	x += self.pos_embedding[:, :n]

	x = self.transformer(x, *args, context=context, context_list=context_list)
	return x