| import math |
from typing import Any, Dict, Optional
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from diffusers.models.attention import (GEGLU, GELU, AdaLayerNorm, |
| AdaLayerNormZero, ApproximateGELU) |
| from diffusers.models.attention_processor import Attention |
| from diffusers.models.lora import LoRACompatibleLinear |
| from diffusers.utils.torch_utils import maybe_allow_in_graph |
| from einops import pack, rearrange, repeat |
|
|
| from flashcosyvoice.modules.flow_components.upsample_encoder import \ |
| add_optional_chunk_mask |
|
|
|
|
| def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: |
| assert mask.dtype == torch.bool |
| assert dtype in [torch.float32, torch.bfloat16, torch.float16] |
    mask = mask.to(dtype)
    # Turn the boolean "keep" mask into an additive attention bias: kept positions
    # become 0.0, masked positions a large negative value so softmax zeroes them out.
    mask = (1.0 - mask) * -1.0e+10
| return mask |
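
# Illustrative usage of mask_to_bias (a sketch, not executed): the decoders below
# build a (B, T, T) boolean attention mask and convert it to an additive bias
# before handing it to BasicTransformerBlock:
#   keep = torch.ones(2, 10, 10, dtype=torch.bool)   # True = attend
#   bias = mask_to_bias(keep, torch.float32)         # 0.0 where True, -1e10 where False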
|
|
|
|
| class SnakeBeta(nn.Module): |
| """ |
| A modified Snake function which uses separate parameters for the magnitude of the periodic components |
| Shape: |
| - Input: (B, C, T) |
| - Output: (B, C, T), same shape as the input |
| Parameters: |
| - alpha - trainable parameter that controls frequency |
| - beta - trainable parameter that controls magnitude |
| References: |
        - This activation function is a modified version of the Snake activation proposed by Liu Ziyin, Tilman Hartwig, and Masahito Ueda:
| https://arxiv.org/abs/2006.08195 |
| Examples: |
        >>> a1 = SnakeBeta(256, 256)
| >>> x = torch.randn(256) |
| >>> x = a1(x) |
| |
| Args: |
| in_features: shape of the input |
| out_features: shape of the output |
| alpha: trainable parameter that controls frequency |
| alpha_trainable: whether alpha is trainable |
| alpha_logscale: whether to use log scale for alpha |
        alpha is initialized to 1 by default; higher values give higher frequency.
        beta is initialized to 1 by default; higher values give larger magnitude.
| alpha will be trained along with the rest of your model. |
| """ |
|
|
| def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True): |
| super().__init__() |
| self.in_features = out_features if isinstance(out_features, list) else [out_features] |
| self.proj = LoRACompatibleLinear(in_features, out_features) |
|
|
| |
| self.alpha_logscale = alpha_logscale |
| if self.alpha_logscale: |
| self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha) |
| self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha) |
| else: |
| self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha) |
| self.beta = nn.Parameter(torch.ones(self.in_features) * alpha) |
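        # With alpha_logscale=True, alpha and beta are stored in log space and start at 0,
        # so exp(0) = 1 at the first forward pass; with alpha_logscale=False they start
        # directly at `alpha` (1.0 by default). The `* alpha` factor on torch.zeros is a
        # no-op retained from the reference implementation.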
|
|
| self.alpha.requires_grad = alpha_trainable |
| self.beta.requires_grad = alpha_trainable |
|
|
| self.no_div_by_zero = 0.000000001 |
|
|
| def forward(self, x): |
| """ |
| Forward pass of the function. |
| Applies the function to the input elementwise. |
        SnakeBeta := x + 1/beta * sin^2(alpha * x)
| """ |
| x = self.proj(x) |
| if self.alpha_logscale: |
| alpha = torch.exp(self.alpha) |
| beta = torch.exp(self.beta) |
| else: |
| alpha = self.alpha |
| beta = self.beta |
|
|
| x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2) |
|
|
| return x |
|
|
|
|
| class FeedForward(nn.Module): |
| r""" |
| A feed-forward layer. |
| |
| Parameters: |
| dim (`int`): The number of channels in the input. |
| dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. |
| mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. |
| dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. |
| activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. |
| final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. |
| """ |
|
|
| def __init__( |
| self, |
| dim: int, |
| dim_out: Optional[int] = None, |
| mult: int = 4, |
| dropout: float = 0.0, |
| activation_fn: str = "geglu", |
| final_dropout: bool = False, |
| ): |
| super().__init__() |
| inner_dim = int(dim * mult) |
| dim_out = dim_out if dim_out is not None else dim |
|
|
        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim)
        elif activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh")
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim)
        elif activation_fn == "snakebeta":
            act_fn = SnakeBeta(dim, inner_dim)
        else:
            raise ValueError(f"Unsupported activation_fn: {activation_fn}")
|
|
        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
        # FF as used in Vision Transformer, MLP-Mixer and related networks
| if final_dropout: |
| self.net.append(nn.Dropout(dropout)) |
|
|
| def forward(self, hidden_states): |
| for module in self.net: |
| hidden_states = module(hidden_states) |
| return hidden_states |
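
# Minimal usage sketch for FeedForward (dims are illustrative, not executed):
#   ff = FeedForward(dim=256, activation_fn="snakebeta")
#   y = ff(torch.randn(2, 50, 256))   # (batch, time, channels) -> same shape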
|
|
|
|
| @maybe_allow_in_graph |
| class BasicTransformerBlock(nn.Module): |
| r""" |
| A basic Transformer block. |
| |
| Parameters: |
| dim (`int`): The number of channels in the input and output. |
| num_attention_heads (`int`): The number of heads to use for multi-head attention. |
| attention_head_dim (`int`): The number of channels in each head. |
| dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. |
| cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. |
| only_cross_attention (`bool`, *optional*): |
| Whether to use only cross-attention layers. In this case two cross attention layers are used. |
| double_self_attention (`bool`, *optional*): |
| Whether to use two self-attention layers. In this case no cross attention layers are used. |
| activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. |
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
| """ |
|
|
| def __init__( |
| self, |
| dim: int, |
| num_attention_heads: int, |
| attention_head_dim: int, |
| dropout=0.0, |
| cross_attention_dim: Optional[int] = None, |
| activation_fn: str = "geglu", |
| num_embeds_ada_norm: Optional[int] = None, |
| attention_bias: bool = False, |
| only_cross_attention: bool = False, |
| double_self_attention: bool = False, |
| upcast_attention: bool = False, |
| norm_elementwise_affine: bool = True, |
| norm_type: str = "layer_norm", |
| final_dropout: bool = False, |
| ): |
| super().__init__() |
| self.only_cross_attention = only_cross_attention |
|
|
| self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" |
| self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" |
|
|
| if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: |
| raise ValueError( |
| f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" |
| f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." |
| ) |
|
|
| |
| |
        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if self.use_ada_layer_norm:
| self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) |
| elif self.use_ada_layer_norm_zero: |
| self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) |
| else: |
| self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) |
| self.attn1 = Attention( |
| query_dim=dim, |
| heads=num_attention_heads, |
| dim_head=attention_head_dim, |
| dropout=dropout, |
| bias=attention_bias, |
| cross_attention_dim=cross_attention_dim if only_cross_attention else None, |
| upcast_attention=upcast_attention, |
| ) |
|
|
| |
        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
| |
| |
| |
| self.norm2 = ( |
| AdaLayerNorm(dim, num_embeds_ada_norm) |
| if self.use_ada_layer_norm |
| else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) |
| ) |
| self.attn2 = Attention( |
| query_dim=dim, |
| cross_attention_dim=cross_attention_dim if not double_self_attention else None, |
| heads=num_attention_heads, |
| dim_head=attention_head_dim, |
| dropout=dropout, |
| bias=attention_bias, |
| upcast_attention=upcast_attention, |
| |
| ) |
| else: |
| self.norm2 = None |
| self.attn2 = None |
|
|
| |
        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
| self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) |
|
|
| |
        # let chunk size default to None
        self._chunk_size = None
| self._chunk_dim = 0 |
|
|
| def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): |
| |
| self._chunk_size = chunk_size |
| self._chunk_dim = dim |
|
|
| def forward( |
| self, |
| hidden_states: torch.FloatTensor, |
| attention_mask: Optional[torch.FloatTensor] = None, |
| encoder_hidden_states: Optional[torch.FloatTensor] = None, |
| encoder_attention_mask: Optional[torch.FloatTensor] = None, |
| timestep: Optional[torch.LongTensor] = None, |
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
| class_labels: Optional[torch.LongTensor] = None, |
| ): |
| |
| |
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 1. Self-Attention
        if self.use_ada_layer_norm:
| norm_hidden_states = self.norm1(hidden_states, timestep) |
| elif self.use_ada_layer_norm_zero: |
| norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( |
| hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype |
| ) |
| else: |
| norm_hidden_states = self.norm1(hidden_states) |
|
|
| cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} |
|
|
| attn_output = self.attn1( |
| norm_hidden_states, |
| encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, |
| attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask, |
| **cross_attention_kwargs, |
| ) |
| if self.use_ada_layer_norm_zero: |
| attn_output = gate_msa.unsqueeze(1) * attn_output |
| hidden_states = attn_output + hidden_states |
|
|
| |
        # 2. Cross-Attention
        if self.attn2 is not None:
| norm_hidden_states = ( |
| self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) |
| ) |
|
|
| attn_output = self.attn2( |
| norm_hidden_states, |
| encoder_hidden_states=encoder_hidden_states, |
| attention_mask=encoder_attention_mask, |
| **cross_attention_kwargs, |
| ) |
| hidden_states = attn_output + hidden_states |
|
|
| |
        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)
|
|
| if self.use_ada_layer_norm_zero: |
| norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] |
|
|
| if self._chunk_size is not None: |
| |
            # "feed_forward_chunk_size" can be used to save memory
            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
| raise ValueError( |
| f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." |
| ) |
|
|
| num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size |
| ff_output = torch.cat( |
| [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)], |
| dim=self._chunk_dim, |
| ) |
| else: |
| ff_output = self.ff(norm_hidden_states) |
|
|
| if self.use_ada_layer_norm_zero: |
| ff_output = gate_mlp.unsqueeze(1) * ff_output |
|
|
| hidden_states = ff_output + hidden_states |
|
|
| return hidden_states |
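
# Usage sketch (illustrative dims, not executed): the decoders below drive this block
# with an additive attention bias produced by mask_to_bias. With the default
# norm_type="layer_norm", the `timestep` argument is accepted but only used by the
# ada_norm variants.
#   block = BasicTransformerBlock(dim=256, num_attention_heads=4, attention_head_dim=64)
#   x = torch.randn(2, 50, 256)
#   bias = mask_to_bias(torch.ones(2, 50, 50, dtype=torch.bool), x.dtype)
#   y = block(hidden_states=x, attention_mask=bias)   # (2, 50, 256)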
|
|
|
|
| class SinusoidalPosEmb(torch.nn.Module): |
| def __init__(self, dim): |
| super().__init__() |
| self.dim = dim |
| assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even" |
|
|
| def forward(self, x, scale=1000): |
| if x.ndim < 1: |
| x = x.unsqueeze(0) |
| device = x.device |
| half_dim = self.dim // 2 |
| emb = math.log(10000) / (half_dim - 1) |
| emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) |
| emb = scale * x.unsqueeze(1) * emb.unsqueeze(0) |
| emb = torch.cat((emb.sin(), emb.cos()), dim=-1) |
| return emb |
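
# SinusoidalPosEmb maps a (batch,) tensor of timesteps to a (batch, dim) embedding,
# concatenating sin and cos components at geometrically spaced frequencies (scaled by `scale`).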
|
|
|
|
| class Block1D(torch.nn.Module): |
| def __init__(self, dim, dim_out, groups=8): |
| super().__init__() |
| self.block = torch.nn.Sequential( |
| torch.nn.Conv1d(dim, dim_out, 3, padding=1), |
| torch.nn.GroupNorm(groups, dim_out), |
| nn.Mish(), |
| ) |
|
|
| def forward(self, x, mask): |
| output = self.block(x * mask) |
| return output * mask |
|
|
|
|
| class ResnetBlock1D(torch.nn.Module): |
| def __init__(self, dim, dim_out, time_emb_dim, groups=8): |
| super().__init__() |
| self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)) |
|
|
| self.block1 = Block1D(dim, dim_out, groups=groups) |
| self.block2 = Block1D(dim_out, dim_out, groups=groups) |
|
|
| self.res_conv = torch.nn.Conv1d(dim, dim_out, 1) |
|
|
| def forward(self, x, mask, time_emb): |
| h = self.block1(x, mask) |
| h += self.mlp(time_emb).unsqueeze(-1) |
| h = self.block2(h, mask) |
| output = h + self.res_conv(x * mask) |
| return output |
|
|
|
|
| class Downsample1D(nn.Module): |
| def __init__(self, dim): |
| super().__init__() |
| self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1) |
|
|
| def forward(self, x): |
| return self.conv(x) |
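
# Downsample1D halves the time axis (stride-2 conv, output length = ceil(T / 2)),
# which matches the `mask_down[:, :, ::2]` bookkeeping in the decoders below.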
|
|
|
|
| class TimestepEmbedding(nn.Module): |
| def __init__( |
| self, |
| in_channels: int, |
| time_embed_dim: int, |
| act_fn: str = "silu", |
        out_dim: Optional[int] = None,
| post_act_fn: Optional[str] = None, |
| cond_proj_dim=None, |
| ): |
| super().__init__() |
| assert act_fn == "silu", "act_fn must be silu" |
|
|
| self.linear_1 = nn.Linear(in_channels, time_embed_dim) |
|
|
| if cond_proj_dim is not None: |
| self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) |
| else: |
| self.cond_proj = None |
|
|
| self.act = nn.SiLU() |
|
|
| if out_dim is not None: |
| time_embed_dim_out = out_dim |
| else: |
| time_embed_dim_out = time_embed_dim |
| self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) |
|
|
| if post_act_fn is None: |
| self.post_act = None |
| else: |
| self.post_act = nn.SiLU() |
|
|
| def forward(self, sample, condition=None): |
| if condition is not None: |
| sample = sample + self.cond_proj(condition) |
| sample = self.linear_1(sample) |
|
|
| if self.act is not None: |
| sample = self.act(sample) |
|
|
| sample = self.linear_2(sample) |
|
|
| if self.post_act is not None: |
| sample = self.post_act(sample) |
| return sample |
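
# Usage sketch (dims are illustrative, not executed): the decoders pair this with
# SinusoidalPosEmb to build the conditioning vector consumed by ResnetBlock1D:
#   pos = SinusoidalPosEmb(320)
#   mlp = TimestepEmbedding(in_channels=320, time_embed_dim=1024)
#   t_emb = mlp(pos(torch.rand(4)))   # (4, 1024)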
|
|
|
|
| class Upsample1D(nn.Module): |
| """A 1D upsampling layer with an optional convolution. |
| |
| Parameters: |
| channels (`int`): |
| number of channels in the inputs and outputs. |
        use_conv (`bool`, default `False`):
            option to use a convolution.
        use_conv_transpose (`bool`, default `True`):
            option to use a transposed convolution.
| out_channels (`int`, optional): |
| number of output channels. Defaults to `channels`. |
| """ |
|
|
| def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"): |
| super().__init__() |
| self.channels = channels |
| self.out_channels = out_channels or channels |
| self.use_conv = use_conv |
| self.use_conv_transpose = use_conv_transpose |
| self.name = name |
|
|
| self.conv = None |
| if use_conv_transpose: |
| self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1) |
| elif use_conv: |
| self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) |
|
|
| def forward(self, inputs): |
| assert inputs.shape[1] == self.channels |
| if self.use_conv_transpose: |
| return self.conv(inputs) |
|
|
| outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") |
|
|
| if self.use_conv: |
| outputs = self.conv(outputs) |
|
|
| return outputs |
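
# With use_conv_transpose=True (the default), Upsample1D doubles the time axis exactly:
# ConvTranspose1d(kernel_size=4, stride=2, padding=1) maps length T to 2 * T.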
|
|
|
|
| class Transpose(torch.nn.Module): |
| def __init__(self, dim0: int, dim1: int): |
| super().__init__() |
| self.dim0 = dim0 |
| self.dim1 = dim1 |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| x = torch.transpose(x, self.dim0, self.dim1) |
| return x |
|
|
|
|
| class CausalConv1d(torch.nn.Conv1d): |
| def __init__( |
| self, |
| in_channels: int, |
| out_channels: int, |
| kernel_size: int, |
| stride: int = 1, |
| dilation: int = 1, |
| groups: int = 1, |
| bias: bool = True, |
| padding_mode: str = 'zeros', |
| device=None, |
| dtype=None |
| ) -> None: |
| super(CausalConv1d, self).__init__(in_channels, out_channels, |
| kernel_size, stride, |
| padding=0, dilation=dilation, |
| groups=groups, bias=bias, |
| padding_mode=padding_mode, |
| device=device, dtype=dtype) |
| assert stride == 1 |
| self.causal_padding = kernel_size - 1 |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| x = F.pad(x, (self.causal_padding, 0), value=0.0) |
| x = super(CausalConv1d, self).forward(x) |
| return x |
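
# CausalConv1d pads only on the left by (kernel_size - 1), so the output length equals
# the input length and output[t] never depends on inputs after t, which is the property
# the causal (streaming-friendly) decoder below relies on.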
|
|
|
|
| class CausalBlock1D(Block1D): |
| def __init__(self, dim: int, dim_out: int): |
| super(CausalBlock1D, self).__init__(dim, dim_out) |
| self.block = torch.nn.Sequential( |
| CausalConv1d(dim, dim_out, 3), |
| Transpose(1, 2), |
| nn.LayerNorm(dim_out), |
| Transpose(1, 2), |
| nn.Mish(), |
| ) |
|
|
    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
| output = self.block(x * mask) |
| return output * mask |
|
|
|
|
| class CausalResnetBlock1D(ResnetBlock1D): |
| def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8): |
| super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) |
| self.block1 = CausalBlock1D(dim, dim_out) |
| self.block2 = CausalBlock1D(dim_out, dim_out) |
|
|
|
|
| class ConditionalDecoder(nn.Module): |
| """ |
    This decoder requires an input with the same shape as the target. If your text content
    is shorter or longer than the output, please resample it before feeding it to the decoder.
| |
| Args: |
| in_channels: number of input channels |
| out_channels: number of output channels |
| channels: tuple of channel dimensions |
| dropout: dropout rate |
| attention_head_dim: dimension of attention heads |
| n_blocks: number of transformer blocks |
| num_mid_blocks: number of middle blocks |
| num_heads: number of attention heads |
| act_fn: activation function name |
| """ |
|
|
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| channels=(256, 256), |
| dropout=0.05, |
| attention_head_dim=64, |
| n_blocks=1, |
| num_mid_blocks=2, |
| num_heads=4, |
| act_fn="snake", |
| ): |
| super().__init__() |
| channels = tuple(channels) |
| self.in_channels = in_channels |
| self.out_channels = out_channels |
|
|
| self.time_embeddings = SinusoidalPosEmb(in_channels) |
| time_embed_dim = channels[0] * 4 |
| self.time_mlp = TimestepEmbedding( |
| in_channels=in_channels, |
| time_embed_dim=time_embed_dim, |
| act_fn="silu", |
| ) |
| self.down_blocks = nn.ModuleList([]) |
| self.mid_blocks = nn.ModuleList([]) |
| self.up_blocks = nn.ModuleList([]) |
|
|
| output_channel = in_channels |
| for i in range(len(channels)): |
| input_channel = output_channel |
| output_channel = channels[i] |
| is_last = i == len(channels) - 1 |
| resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| downsample = ( |
| Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) |
| ) |
| self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) |
|
|
        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            output_channel = channels[-1]
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
|
|
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
|
|
| self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) |
|
|
| channels = channels[::-1] + (channels[0],) |
| for i in range(len(channels) - 1): |
| input_channel = channels[i] * 2 |
| output_channel = channels[i + 1] |
| is_last = i == len(channels) - 2 |
| resnet = ResnetBlock1D( |
| dim=input_channel, |
| dim_out=output_channel, |
| time_emb_dim=time_embed_dim, |
| ) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| upsample = ( |
| Upsample1D(output_channel, use_conv_transpose=True) |
| if not is_last |
| else nn.Conv1d(output_channel, output_channel, 3, padding=1) |
| ) |
| self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) |
| self.final_block = Block1D(channels[-1], channels[-1]) |
| self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) |
| self.initialize_weights() |
|
|
| def initialize_weights(self): |
| for m in self.modules(): |
| if isinstance(m, nn.Conv1d): |
| nn.init.kaiming_normal_(m.weight, nonlinearity="relu") |
| if m.bias is not None: |
| nn.init.constant_(m.bias, 0) |
| elif isinstance(m, nn.GroupNorm): |
| nn.init.constant_(m.weight, 1) |
| nn.init.constant_(m.bias, 0) |
| elif isinstance(m, nn.Linear): |
| nn.init.kaiming_normal_(m.weight, nonlinearity="relu") |
| if m.bias is not None: |
| nn.init.constant_(m.bias, 0) |
|
|
| def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): |
| """Forward pass of the UNet1DConditional model. |
| |
| Args: |
| x (torch.Tensor): shape (batch_size, in_channels, time) |
| mask (_type_): shape (batch_size, 1, time) |
| t (_type_): shape (batch_size) |
| spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. |
| cond (_type_, optional): placeholder for future use. Defaults to None. |
| |
| Raises: |
| ValueError: _description_ |
| ValueError: _description_ |
| |
| Returns: |
| _type_: _description_ |
| """ |
|
|
| t = self.time_embeddings(t).to(t.dtype) |
| t = self.time_mlp(t) |
|
|
| x = pack([x, mu], "b * t")[0] |
|
|
| if spks is not None: |
| spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) |
| x = pack([x, spks], "b * t")[0] |
| if cond is not None: |
| x = pack([x, cond], "b * t")[0] |
|
|
| hiddens = [] |
| masks = [mask] |
| for resnet, transformer_blocks, downsample in self.down_blocks: |
| mask_down = masks[-1] |
| x = resnet(x, mask_down, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| hiddens.append(x) |
| x = downsample(x * mask_down) |
| masks.append(mask_down[:, :, ::2]) |
| masks = masks[:-1] |
| mask_mid = masks[-1] |
|
|
| for resnet, transformer_blocks in self.mid_blocks: |
| x = resnet(x, mask_mid, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
|
|
| for resnet, transformer_blocks, upsample in self.up_blocks: |
| mask_up = masks.pop() |
| skip = hiddens.pop() |
| x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] |
| x = resnet(x, mask_up, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| x = upsample(x * mask_up) |
| x = self.final_block(x, mask_up) |
| output = self.final_proj(x * mask_up) |
| return output * mask |
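
# Call sketch for ConditionalDecoder (shapes are illustrative, not executed).
# `x` and `mu` are packed along the channel axis, so in_channels must equal their
# combined channel count (plus spks/cond channels when those are provided):
#   dec = ConditionalDecoder(in_channels=160, out_channels=80, channels=(256, 256), act_fn="gelu")
#   x = torch.randn(2, 80, 100); mu = torch.randn(2, 80, 100)
#   mask = torch.ones(2, 1, 100)
#   y = dec(x, mask, mu, t=torch.rand(2))   # (2, 80, 100)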
|
|
|
|
| class CausalConditionalDecoder(ConditionalDecoder): |
| """ |
    This decoder requires an input with the same shape as the target. If your text content
    is shorter or longer than the output, please resample it before feeding it to the decoder.
| |
| Args: |
| in_channels: number of input channels |
| out_channels: number of output channels |
| channels: list of channel dimensions |
| dropout: dropout rate |
| attention_head_dim: dimension of attention heads |
| n_blocks: number of transformer blocks |
| num_mid_blocks: number of middle blocks |
| num_heads: number of attention heads |
| act_fn: activation function name |
| static_chunk_size: size of static chunks |
| num_decoding_left_chunks: number of left chunks for decoding |
| """ |
|
|
| def __init__( |
| self, |
| in_channels=320, |
| out_channels=80, |
| channels=[256], |
| dropout=0.0, |
| attention_head_dim=64, |
| n_blocks=4, |
| num_mid_blocks=12, |
| num_heads=8, |
| act_fn="gelu", |
| static_chunk_size=50, |
| num_decoding_left_chunks=-1, |
| ): |
| torch.nn.Module.__init__(self) |
| channels = tuple(channels) |
| self.in_channels = in_channels |
| self.out_channels = out_channels |
| self.time_embeddings = SinusoidalPosEmb(in_channels) |
| time_embed_dim = channels[0] * 4 |
| self.time_mlp = TimestepEmbedding( |
| in_channels=in_channels, |
| time_embed_dim=time_embed_dim, |
| act_fn="silu", |
| ) |
| self.static_chunk_size = static_chunk_size |
| self.num_decoding_left_chunks = num_decoding_left_chunks |
| self.down_blocks = nn.ModuleList([]) |
| self.mid_blocks = nn.ModuleList([]) |
| self.up_blocks = nn.ModuleList([]) |
|
|
| output_channel = in_channels |
| for i in range(len(channels)): |
| input_channel = output_channel |
| output_channel = channels[i] |
| is_last = i == len(channels) - 1 |
| resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| downsample = ( |
| Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) |
| ) |
| self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) |
|
|
        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            output_channel = channels[-1]
            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
|
|
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
|
|
| self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) |
|
|
| channels = channels[::-1] + (channels[0],) |
| for i in range(len(channels) - 1): |
| input_channel = channels[i] * 2 |
| output_channel = channels[i + 1] |
| is_last = i == len(channels) - 2 |
| resnet = CausalResnetBlock1D( |
| dim=input_channel, |
| dim_out=output_channel, |
| time_emb_dim=time_embed_dim, |
| ) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| upsample = ( |
| Upsample1D(output_channel, use_conv_transpose=True) |
| if not is_last |
| else CausalConv1d(output_channel, output_channel, 3) |
| ) |
| self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) |
| self.final_block = CausalBlock1D(channels[-1], channels[-1]) |
| self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) |
| self.initialize_weights() |
|
|
| def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): |
| """Forward pass of the UNet1DConditional model. |
| |
| Args: |
| x (torch.Tensor): shape (batch_size, in_channels, time) |
| mask (_type_): shape (batch_size, 1, time) |
| t (_type_): shape (batch_size) |
| spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. |
| cond (_type_, optional): placeholder for future use. Defaults to None. |
| |
| Raises: |
| ValueError: _description_ |
| ValueError: _description_ |
| |
| Returns: |
| _type_: _description_ |
| """ |
| t = self.time_embeddings(t).to(t.dtype) |
| t = self.time_mlp(t) |
|
|
| x = pack([x, mu], "b * t")[0] |
|
|
| if spks is not None: |
| spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) |
| x = pack([x, spks], "b * t")[0] |
| if cond is not None: |
| x = pack([x, cond], "b * t")[0] |
|
|
| hiddens = [] |
| masks = [mask] |
| for resnet, transformer_blocks, downsample in self.down_blocks: |
| mask_down = masks[-1] |
| x = resnet(x, mask_down, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| hiddens.append(x) |
| x = downsample(x * mask_down) |
| masks.append(mask_down[:, :, ::2]) |
| masks = masks[:-1] |
| mask_mid = masks[-1] |
|
|
| for resnet, transformer_blocks in self.mid_blocks: |
| x = resnet(x, mask_mid, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
|
|
| for resnet, transformer_blocks, upsample in self.up_blocks: |
| mask_up = masks.pop() |
| skip = hiddens.pop() |
| x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] |
| x = resnet(x, mask_up, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| x = upsample(x * mask_up) |
| x = self.final_block(x, mask_up) |
| output = self.final_proj(x * mask_up) |
| return output * mask |
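
# Streaming note: with streaming=True, CausalConditionalDecoder builds chunked attention
# masks of `static_chunk_size` frames via add_optional_chunk_mask, so each position attends
# to its own chunk and (with num_decoding_left_chunks=-1) all previous chunks; with
# streaming=False the full padding mask is used, matching ConditionalDecoder's behaviour.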
|
|