| | import torch |
| | import torch.nn as nn |
| | from math import log2, pi |
| | from typing import Tuple |
| | import torch.nn.functional as F |
| | from einops import rearrange |
| | from functools import partial |
| |
|
| |
|
def fourier_dimension_expansion(
    x: torch.Tensor,
    dim: int = 512,
    max_freq: int = 64,
    use_cos: bool = True,
    use_log: bool = True,
):
    """Expand the last axis of ``x`` into sinusoidal features.

    Each entry of the last axis is multiplied by ``num_bands`` frequency
    scales and by pi, then passed through ``sin`` (and ``cos`` when
    ``use_cos`` is set). The per-entry features are finally merged back into
    the last axis.

    Args:
        x: Input tensor; its last axis (size ``input_dim``) is expanded.
        dim: Feature budget. ``num_bands`` is ``dim // (2 * input_dim)`` with
            cosines and ``dim // input_dim`` without, so the output width only
            equals ``dim`` when that division is exact.
        max_freq: Upper bound used to build the scales.
        use_cos: Emit cosine features next to the sine features.
        use_log: Space the scales as powers of two from ``1`` to ``max_freq``;
            otherwise space them linearly from ``1`` to ``max_freq / 2``.

    Returns:
        Tensor with the leading axes of ``x`` and a last axis of size
        ``input_dim * num_bands * (2 if use_cos else 1)``. For every input
        entry the sine features come first, followed by the cosine features.
    """
    input_dim = x.shape[-1]
    # Sine and cosine share the budget, so each band costs two features.
    features_per_band = 2 * input_dim if use_cos else input_dim
    num_bands = dim // features_per_band

    # Build the scales directly on the device / dtype of the input.
    factory = dict(device=x.device, dtype=x.dtype)
    if use_log:
        exponents = torch.linspace(0.0, log2(max_freq), steps=num_bands, **factory)
        scales = 2.0 ** exponents
    else:
        scales = torch.linspace(1.0, max_freq / 2, num_bands, **factory)

    # (..., input_dim, 1) * (num_bands,) broadcasts to (..., input_dim, num_bands).
    angles = x.unsqueeze(-1) * scales * pi

    features = [angles.sin()]
    if use_cos:
        features.append(angles.cos())

    # Concatenate per entry, then fold the band axis into the last axis.
    return torch.cat(features, dim=-1).flatten(-2)
| |
|
def flatten(
    flat_tensor: torch.Tensor,
    old: Tuple[int, int],
    new: Tuple[int, int],
) -> torch.Tensor:
    """Resample a flattened 2D grid of features from size ``old`` to ``new``.

    Args:
        flat_tensor: Tensor whose first axis is kept and whose remaining
            elements are viewed as ``(old[0], old[1], channels)``.
            Assumed to be laid out as (batch, old[0] * old[1], channels);
            confirm against callers.
        old: ``(height, width)`` of the grid currently stored in
            ``flat_tensor``.
        new: ``(height, width)`` of the requested grid.

    Returns:
        ``flat_tensor`` itself when both sizes match; otherwise a contiguous
        tensor of shape ``(batch, new[0] * new[1], channels)`` produced with
        nearest-neighbour interpolation.
    """
    # Same grid on both sides: hand the input back untouched.
    if (old[0], old[1]) == (new[0], new[1]):
        return flat_tensor

    batch = flat_tensor.shape[0]
    target_h, target_w = new[0], new[1]

    # Unfold the token axis into the grid and move channels to axis 1,
    # which is the layout F.interpolate works on.
    grid = flat_tensor.view(batch, old[0], old[1], -1).permute(0, 3, 1, 2)
    resized = F.interpolate(grid, size=(target_h, target_w), mode='nearest')

    # Fold the grid back into a single token axis, channels last.
    tokens = resized.view(batch, -1, target_h * target_w).permute(0, 2, 1)
    return tokens.contiguous()
| |
|
| |
|
| | class DimensionAligner(nn.Module): |
| | def __init__(self, input_dims: list[int], hidden_dim: int): |
| | super().__init__() |
| | self.aligners = nn.ModuleList([]) |
| | self.num_chunks = len(input_dims) |
| | self.checkpoint = True |
| | for input_dim in input_dims: |
| | self.aligners.append(nn.Linear(input_dim, hidden_dim)) |
| |
|
| | def forward(self, xs: torch.Tensor) -> torch.Tensor: |
| | outs = [self.aligners[i](x) for i, x in enumerate(xs)] |
| | return outs |
| |
|
| |
|
| | class LayerScale(nn.Module): |
| | def __init__( |
| | self, |
| | dim: int, |
| | init_values: float | torch.Tensor = 1e-5, |
| | inplace: bool = False, |
| | ) -> None: |
| | super().__init__() |
| | self.inplace = inplace |
| | self.gamma = nn.Parameter(init_values * torch.ones(dim)) |
| |
|
| | def forward(self, x: torch.Tensor) -> torch.Tensor: |
| | return x.mul_(self.gamma) if self.inplace else x * self.gamma |
| |
|
| |
|
def exists(val):
    """Return ``True`` for any value other than ``None``."""
    if val is None:
        return False
    return True
| |
|
def default(val, d):
    """Return ``val`` unless it is ``None``, else fall back to ``d``.

    A callable ``d`` is treated as a factory and called with no arguments;
    any other ``d`` is returned as is.
    """
    if val is not None:
        return val
    if callable(d):
        return d()
    return d
| |
|
| |
|
class SwiGLU(nn.Module):
    """Gated activation: first half of the last axis times SiLU of the second."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last axis in two and gate one half with the other.

        Args:
            x: Tensor whose last axis is split into two chunks.

        Returns:
            ``values * silu(gates)``, where ``values`` is the first chunk and
            ``gates`` the second; the last axis is half as wide as the input
            when its size is even.
        """
        values, gates = torch.chunk(x, 2, dim=-1)
        return values * F.silu(gates)
| |
|
| |
|
| | class MLP(nn.Module): |
| | def __init__( |
| | self, |
| | input_dim: int, |
| | expansion: int = 4, |
| | dropout: float = 0.0, |
| | gated: bool = False, |
| | output_dim: int | None = None, |
| | ): |
| | super().__init__() |
| | if gated: |
| | expansion = int(expansion * 2 / 3) |
| | hidden_dim = int(input_dim * expansion) |
| | output_dim = default(output_dim, input_dim) |
| | self.norm = nn.LayerNorm(input_dim) |
| | self.proj1 = nn.Linear(input_dim, hidden_dim) |
| | self.proj2 = nn.Linear(hidden_dim, output_dim) |
| | self.act = nn.GELU() if not gated else SwiGLU() |
| | self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity() |
| |
|
| | def forward(self, x: torch.Tensor) -> torch.Tensor: |
| | x = self.norm(x) |
| | x = self.proj1(x) |
| | x = self.act(x) |
| | x = self.proj2(x) |
| | x = self.dropout(x) |
| | return x |
| |
|
| |
|
class AttentionBlock(nn.Module):
    """Multi-head attention followed by an MLP, each with a residual add.

    Queries come from ``x``; keys and values come from ``context`` (``x``
    itself when no context is given). Both inputs are layer-normalised before
    the projections.

    Args:
        dim: Feature size of ``x`` and of the block output.
        num_heads: Number of attention heads; ``dim`` is split across them.
        expansion: Expansion ratio forwarded to the MLP.
        dropout: Dropout probability, used for the attention weights (only in
            training mode) and forwarded to the MLP.
        cosine: L2-normalise queries and keys along the head dimension before
            attention.
        gated: Forwarded to the MLP.
        layer_scale: Initial value of the ``LayerScale`` modules; values
            ``<= 0`` replace them with identities.
        context_dim: Feature size of ``context``; defaults to ``dim``.
        detach_query: Detach ``x`` from the autograd graph before it is used
            to build the queries. The residual path still uses the
            undetached ``x``.
        residual_ls: When true the first ``LayerScale`` is applied to the
            residual input instead of to the attention output.
    """

    def __init__(
        self,
        dim: int,
        num_heads: int = 4,
        expansion: int = 4,
        dropout: float = 0.0,
        cosine: bool = False,
        gated: bool = False,
        layer_scale: float = 1.0,
        context_dim: int | None = None,
        detach_query: bool = False,
        residual_ls: bool = False,
    ):
        super().__init__()
        self.dropout = dropout
        self.num_heads = num_heads
        self.hidden_dim = dim
        context_dim = dim if context_dim is None else context_dim
        self.mlp = MLP(dim, expansion=expansion, dropout=dropout, gated=gated)
        # A single projection yields keys and values; they are split in attn().
        self.kv = nn.Linear(context_dim, dim * 2, bias=False)
        self.q = nn.Linear(dim, dim, bias=False)
        self.norm_attnx = nn.LayerNorm(dim)
        self.norm_attnctx = nn.LayerNorm(context_dim)
        self.cosine = cosine
        self.out = nn.Linear(dim, dim, bias=False)
        # Exactly one of ls1_1 / ls1_2 is a LayerScale (when layer_scale > 0):
        # ls1_1 scales the attention output, ls1_2 scales the residual input.
        self.ls1_1 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and not residual_ls
            else nn.Identity()
        )
        self.ls1_2 = (
            LayerScale(dim, layer_scale)
            if layer_scale > 0.0 and residual_ls
            else nn.Identity()
        )
        self.ls2 = LayerScale(dim, layer_scale) if layer_scale > 0.0 else nn.Identity()
        self.detach_query = detach_query

    def attn(
        self,
        x: torch.Tensor,
        attn_bias: torch.Tensor | None = None,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Compute the attention output, without residual or layer scale.

        Args:
            x: Query source, rearranged as ``(b, n, h * d)``.
            attn_bias: Passed as ``attn_mask`` to
                ``F.scaled_dot_product_attention``.
            context: Key/value source; falls back to ``x`` when ``None``.
            pos_embed: Optional tensor ``(b, n, h * d)`` added to the queries
                when no ``rope`` is given.
            pos_embed_context: Optional tensor ``(b, n, h * d)`` added to the
                keys when no ``rope`` is given.
            rope: Optional callable invoked as ``rope(t, input_pos=rope_pos)``
                on queries and keys laid out as ``(b, n, h, d)``. When given,
                ``pos_embed`` and ``pos_embed_context`` are ignored.
            rope_pos: Forwarded to ``rope`` as ``input_pos``.

        Returns:
            Tensor shaped like ``x`` after the output projection.
        """
        # Self-attention fallback, mirroring forward(); taken before the
        # optional detach so the key/value path keeps its gradient.
        if context is None:
            context = x
        if self.detach_query:
            x = x.detach()
        x = self.norm_attnx(x)
        context = self.norm_attnctx(context)

        # Split the joint projection into per-head keys and values.
        k, v = rearrange(
            self.kv(context), 'b n (kv h d) -> b h n d kv', h=self.num_heads, kv=2
        ).unbind(dim=-1)
        q = rearrange(self.q(x), 'b n (h d) -> b h n d', h=self.num_heads)

        if rope is not None:
            # rope is called on (b, n, h, d), hence the permutes around it.
            q = rope(q.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
            k = rope(k.permute(0, 2, 1, 3), input_pos=rope_pos).permute(0, 2, 1, 3)
        else:
            if pos_embed is not None:
                pos_embed = rearrange(
                    pos_embed, 'b n (h d) -> b h n d', h=self.num_heads
                )
                q = q + pos_embed
            if pos_embed_context is not None:
                pos_embed_context = rearrange(
                    pos_embed_context, 'b n (h d) -> b h n d', h=self.num_heads
                )
                k = k + pos_embed_context

        if self.cosine:
            q, k = map(partial(F.normalize, p=2, dim=-1), (q, k))

        # The functional attention applies dropout whenever dropout_p > 0, no
        # matter the module mode, so it has to be switched off for evaluation.
        dropout_p = self.dropout if self.training else 0.0
        x = F.scaled_dot_product_attention(
            q, k, v, dropout_p=dropout_p, attn_mask=attn_bias
        )
        x = rearrange(x, 'b h n d -> b n (h d)')
        x = self.out(x)
        return x

    def forward(
        self,
        x: torch.Tensor,
        context: torch.Tensor | None = None,
        pos_embed: torch.Tensor | None = None,
        pos_embed_context: torch.Tensor | None = None,
        attn_bias: torch.Tensor | None = None,
        rope: nn.Module | None = None,
        rope_pos: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Run attention and the MLP, each added to its input.

        Arguments have the same meaning as in :meth:`attn`.

        Returns:
            Tensor shaped like ``x``.
        """
        context = x if context is None else context
        attended = self.attn(
            x,
            rope=rope,
            rope_pos=rope_pos,
            attn_bias=attn_bias,
            context=context,
            pos_embed=pos_embed,
            pos_embed_context=pos_embed_context,
        )
        x = self.ls1_1(attended) + self.ls1_2(x)
        x = self.ls2(self.mlp(x)) + x
        return x
| |
|
| |
|
| | class AttentionSeq(nn.Module): |
| | def __init__( |
| | self, |
| | num_blocks: int, |
| | dim: int, |
| | num_heads: int = 4, |
| | expansion: int = 4, |
| | dropout: float = 0.0, |
| | cosine: bool = False, |
| | gated: bool = False, |
| | layer_scale: float = 1.0, |
| | context_dim: int | None = None, |
| | detach_query: bool = False, |
| | residual_ls: bool = False, |
| | ): |
| | super().__init__() |
| | self.layers = nn.ModuleList( |
| | [ |
| | AttentionBlock( |
| | dim=dim, |
| | num_heads=num_heads, |
| | expansion=expansion, |
| | dropout=dropout, |
| | cosine=cosine, |
| | gated=gated, |
| | layer_scale=layer_scale, |
| | context_dim=context_dim, |
| | detach_query=detach_query, |
| | residual_ls=residual_ls, |
| | ) |
| | for _ in range(num_blocks) |
| | ] |
| | ) |
| |
|
| | def forward( |
| | self, |
| | x: torch.Tensor, |
| | context: torch.Tensor | None = None, |
| | pos_embed: torch.Tensor | None = None, |
| | pos_embed_context: torch.Tensor | None = None, |
| | attn_bias: torch.Tensor | None = None, |
| | rope: nn.Module | None = None, |
| | rope_pos: torch.Tensor | None = None, |
| | ) -> torch.Tensor: |
| | for layer in self.layers: |
| | x = layer( |
| | x, |
| | context=context, |
| | pos_embed=pos_embed, |
| | pos_embed_context=pos_embed_context, |
| | attn_bias=attn_bias, |
| | rope=rope, |
| | rope_pos=rope_pos, |
| | ) |
| | return x |
| |
|
| |
|
class ResidualConvNet(nn.Module):
    """Two-convolution residual block with an optional learnable scale.

    The branch is ``act -> conv1 -> norm1 -> act -> conv2 -> norm2``; its
    output is multiplied by ``gamma`` and added to the input.

    Args:
        dim: Number of input and output channels.
        kernel_size: Kernel size of both convolutions.
        padding_mode: ``padding_mode`` of both convolutions.
        dilation: Dilation of both convolutions.
        layer_scale: Initial value of the per-channel scale ``gamma``. Values
            ``<= 0`` disable the learnable scale and use the constant ``1.0``.
        use_norm: Insert a ``GroupNorm`` after each convolution, with
            ``dim // 16`` groups (at least one).
    """

    def __init__(
        self,
        dim,
        kernel_size: int = 3,
        padding_mode: str = 'zeros',
        dilation: int = 1,
        layer_scale: float = 1.0,
        use_norm: bool = False,
    ):
        super().__init__()
        # This padding keeps the spatial size whenever
        # dilation * (kernel_size - 1) is even, which the residual add needs.
        padding = dilation * (kernel_size - 1) // 2
        self.conv1 = nn.Conv2d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=padding,
            dilation=dilation,
            padding_mode=padding_mode,
        )
        self.conv2 = nn.Conv2d(
            dim,
            dim,
            kernel_size=kernel_size,
            padding=padding,
            dilation=dilation,
            padding_mode=padding_mode,
        )
        self.activation = nn.LeakyReLU()
        self.gamma = (
            nn.Parameter(layer_scale * torch.ones(1, dim, 1, 1))
            if layer_scale > 0.0
            else 1.0
        )
        # dim // 16 is zero for dim < 16, which GroupNorm cannot handle, so
        # fall back to a single group in that case.
        num_groups = max(dim // 16, 1)
        self.norm1 = nn.GroupNorm(num_groups, dim) if use_norm else nn.Identity()
        self.norm2 = nn.GroupNorm(num_groups, dim) if use_norm else nn.Identity()

    def forward(self, x):
        """Return ``gamma * branch(x) + x``.

        Args:
            x: Input with ``dim`` channels on axis 1.

        Returns:
            Tensor with the same shape as ``x``.
        """
        out = self.activation(x)
        out = self.conv1(out)
        out = self.norm1(out)
        out = self.activation(out)
        out = self.conv2(out)
        out = self.norm2(out)
        return self.gamma * out + x
| |
|
| |
|
class ResidualUpsampler(nn.Module):
    """Residual conv blocks, a 1x1 projection and a 2x bilinear upsampling.

    Args:
        hidden_dim: Number of channels of the input and of the residual
            blocks.
        output_dim: Number of output channels; defaults to
            ``hidden_dim // 2``.
        num_layers: Number of ``ResidualConvNet`` blocks applied first.
        kernel_size: Kernel size of the residual blocks.
        layer_scale: ``layer_scale`` of the residual blocks.
        padding_mode: ``padding_mode`` of the residual blocks and of the
            projection.
        use_norm: ``use_norm`` of the residual blocks.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        hidden_dim,
        output_dim: int = None,
        num_layers: int = 2,
        kernel_size: int = 3,
        layer_scale: float = 1.0,
        padding_mode: str = 'zeros',
        use_norm: bool = False,
        **kwargs,
    ):
        super().__init__()
        if output_dim is None:
            output_dim = hidden_dim // 2

        residual_blocks = [
            ResidualConvNet(
                hidden_dim,
                kernel_size=kernel_size,
                layer_scale=layer_scale,
                padding_mode=padding_mode,
                use_norm=use_norm,
            )
            for _ in range(num_layers)
        ]
        self.convs = nn.ModuleList(residual_blocks)

        # Change the channel count first, then double the spatial size.
        projection = nn.Conv2d(
            hidden_dim,
            output_dim,
            kernel_size=1,
            padding=0,
            padding_mode=padding_mode,
        )
        upsample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False)
        self.up = nn.Sequential(projection, upsample)

    def forward(self, x: torch.Tensor):
        """Apply the residual blocks in order, then project and upsample."""
        for block in self.convs:
            x = block(x)
        return self.up(x)
| |
|