Spaces:
Sleeping
Sleeping
| import typing as tp | |
| import numpy as np | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from .conv import StreamingConv1d, TransposedLayerNorm | |
| from .streaming import StreamingContainer, StreamingAdd | |
| from .spectrogram import StreamingLogMelSpectrogram | |
| from ..utils.compile import torch_compile_lazy | |
| # DropPath copied from timm library | |
def drop_path(
    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
):
    """Stochastic depth: randomly zero entire samples of ``x`` along dim 0.

    A single Bernoulli keep/drop value is drawn for every index of the first
    dimension and broadcast over all remaining dimensions, so a sample is
    either kept whole or zeroed whole. (Adapted from the timm library, where
    this was previously named "DropConnect"; see
    https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956.)

    Args:
        x: Input tensor; the first dimension is treated as the sample axis.
        drop_prob: Probability of zeroing a sample.
        training: When False the input is returned untouched.
        scale_by_keep: When True (and the keep probability is positive),
            surviving samples are divided by the keep probability.

    Returns:
        ``x`` itself when ``drop_prob == 0.0`` or ``training`` is False,
        otherwise a new tensor ``x * mask``.
    """  # noqa: E501
    if not training or drop_prob == 0.0:
        return x

    survival = 1 - drop_prob
    # One mask entry per sample; the trailing singleton dims let the mask
    # broadcast over tensors of any rank, not just 2D ConvNet activations.
    trailing_ones = (1,) * (x.ndim - 1)
    mask_shape = (x.shape[0],) + trailing_ones
    mask = x.new_empty(mask_shape).bernoulli_(survival)
    if scale_by_keep and survival > 0.0:
        mask.div_(survival)
    return x * mask
class DropPath(nn.Module):
    """Module form of ``drop_path`` (stochastic depth applied per sample).

    Stores the drop probability and scaling flag, and forwards the module's
    own ``training`` state to ``drop_path`` on every call.
    """  # noqa: E501

    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
        super().__init__()
        self.scale_by_keep = scale_by_keep
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` to ``x`` using the stored settings."""
        return drop_path(
            x,
            drop_prob=self.drop_prob,
            training=self.training,
            scale_by_keep=self.scale_by_keep,
        )

    def extra_repr(self):
        """Show the drop probability, rounded to three decimals, in ``repr``."""
        shown = round(self.drop_prob, 3)
        return f"drop_prob={shown:0.3f}"
class LayerNorm(nn.Module):
    r"""LayerNorm over the channel axis for channels_last or channels_first input.

    ``channels_last`` (default) normalizes the final dimension via
    ``F.layer_norm``, e.g. inputs shaped ``(batch, ..., channels)``.
    ``channels_first`` normalizes dimension 1, e.g. inputs shaped
    ``(batch, channels, length)`` or ``(batch, channels, height, width)``;
    the learned scale and shift are broadcast over every dimension after the
    channel axis, so any number of trailing dimensions is supported.

    Args:
        normalized_shape (int): Number of channels being normalized.
        eps (float): Value added to the variance for numerical stability.
        data_format (str): ``"channels_last"`` or ``"channels_first"``.

    Raises:
        NotImplementedError: If ``data_format`` is not one of the two
            supported values.
    """  # noqa: E501

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        if data_format not in ("channels_last", "channels_first"):
            raise NotImplementedError(
                f"Unsupported data_format: {data_format!r}"
            )
        self.weight = nn.Parameter(torch.ones(normalized_shape))
        self.bias = nn.Parameter(torch.zeros(normalized_shape))
        self.eps = eps
        self.data_format = data_format
        self.normalized_shape = (normalized_shape,)

    def forward(self, x):
        """Normalize ``x`` along its channel axis and apply the affine terms."""
        if self.data_format == "channels_last":
            return F.layer_norm(
                x, self.normalized_shape, self.weight, self.bias, self.eps
            )
        if self.data_format == "channels_first":
            mean = x.mean(1, keepdim=True)
            centered = x - mean
            # Biased variance over the channel axis, as in F.layer_norm.
            var = centered.pow(2).mean(1, keepdim=True)
            x = centered / torch.sqrt(var + self.eps)
            # Reshape the per-channel parameters to (C, 1, ..., 1) so they
            # line up with dim 1 of x regardless of how many dims follow it.
            affine_shape = (-1,) + (1,) * (x.ndim - 2)
            weight = self.weight.reshape(affine_shape)
            bias = self.bias.reshape(affine_shape)
            return weight * x + bias
        # Only reachable if data_format was modified after construction.
        raise NotImplementedError(
            f"Unsupported data_format: {self.data_format!r}"
        )
| # ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py | |
class ConvNeXtBlock(StreamingContainer):
    r"""ConvNeXt block operating on 1D sequences laid out as (N, C, L).

    Pipeline: depthwise StreamingConv1d -> permute to (N, L, C) -> LayerNorm
    (channels_last) -> Linear -> GELU -> Linear -> optional per-channel layer
    scale -> permute back to (N, C, L) -> DropPath -> optional residual add.
    The pointwise (1x1) convolutions are implemented as Linear layers.

    Args:
        dim (int): Number of input (and output) channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale; a value
            that is not > 0 disables layer scale. Default: 1e-6.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
        kernel_size (int): Kernel size for depthwise conv. Default: 7.
        dilation (int): Dilation for depthwise conv. Default: 1.
        norm (str): Accepted for interface compatibility. NOTE(review): it is
            not forwarded anywhere; the depthwise conv is always built with
            ``norm="weight_norm"`` -- confirm this is intentional.
        norm_params (dict, optional): Passed to the depthwise conv as
            ``norm_kwargs``. ``None`` (default) means an empty dict.
        causal (bool): Forwarded to the depthwise StreamingConv1d.
        pad_mode (str): Forwarded to the depthwise StreamingConv1d.
    """  # noqa: E501

    def __init__(
        self,
        dim: int,
        drop_path: float = 0.0,
        layer_scale_init_value: float = 1e-6,
        mlp_ratio: float = 4.0,
        kernel_size: int = 7,
        dilation: int = 1,
        norm: str = "none",
        norm_params: tp.Optional[tp.Dict[str, tp.Any]] = None,
        causal: bool = False,
        pad_mode: str = "reflect",
    ):
        super().__init__()
        # A fresh dict per instance: a `{}` default in the signature would be
        # one object shared by every call that omits the argument.
        if norm_params is None:
            norm_params = {}
        hidden_dim = int(mlp_ratio * dim)

        # groups=dim makes this a depthwise convolution.
        self.dwconv = StreamingConv1d(
            dim,
            dim,
            kernel_size=kernel_size,
            dilation=dilation,
            norm="weight_norm",
            norm_kwargs=norm_params,
            causal=causal,
            pad_mode=pad_mode,
            groups=dim,
        )
        self.norm = LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, hidden_dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(hidden_dim, dim)
        # Learnable per-channel scale on the branch output (layer scale).
        self.gamma = (
            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
            if layer_scale_init_value > 0
            else None
        )
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.add = StreamingAdd()

    def forward(self, x, apply_residual: bool = True):
        """Run the block on ``x`` of shape (N, C, L).

        Args:
            x: Input tensor in (N, C, L) layout.
            apply_residual: When True, the block input is combined with the
                branch output through ``self.add`` (the streaming counterpart
                of ``input + x``); when False only the branch output is
                returned.

        Returns:
            Tensor in (N, C, L) layout.
        """
        residual = x
        x = self.dwconv(x)
        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
        x = self.norm(x)
        x = self.pwconv1(x)
        x = self.act(x)
        x = self.pwconv2(x)
        if self.gamma is not None:
            x = self.gamma * x
        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
        x = self.drop_path(x)
        if apply_residual:
            x = self.add(residual, x)
        return x
class VocosBackbone(StreamingContainer):
    """
    Vocos backbone module built with ConvNeXt blocks.

    An embedding StreamingConv1d maps ``input_channels`` to ``dim``, followed
    by a LayerNorm over channels, ``num_layers`` ConvNeXtBlock layers and a
    final LayerNorm over channels.

    Args:
        input_channels (int): Number of input features channels.
        dim (int): Hidden dimension of the model.
        mlp_ratio (float): Hidden-to-embedding ratio passed to each ConvNeXtBlock.
        kernel_size (int): Kernel size of the embedding conv and of each block.
        dilation (int): NOTE(review): accepted but not used -- the embedding
            conv is built with ``dilation=1`` and the blocks are not given a
            dilation. Confirm whether it should be forwarded.
        norm (str): Norm type forwarded to the embedding conv and the blocks.
        norm_params (dict, optional): Norm keyword arguments forwarded to the
            embedding conv and the blocks. ``None`` (default) means an empty dict.
        causal (bool): Forwarded to the embedding conv and the blocks.
        pad_mode (str): Forwarded to the embedding conv and the blocks.
        num_layers (int): Number of ConvNeXtBlock layers.
        layer_scale_init_value (float): Layer-scale init for each block; a
            falsy value (0 or None) falls back to ``1 / num_layers``.
    """

    def __init__(
        self,
        input_channels: int = 80,
        dim: int = 512,
        mlp_ratio: float = 3.0,
        kernel_size: int = 7,
        dilation: int = 1,
        norm: str = "none",
        norm_params: tp.Optional[tp.Dict[str, tp.Any]] = None,
        causal: bool = False,
        pad_mode: str = "reflect",
        num_layers: int = 8,
        layer_scale_init_value: float = 1e-6,
    ):
        super().__init__()
        # A fresh dict per instance: a `{}` default in the signature would be
        # one object shared by every call that omits the argument.
        if norm_params is None:
            norm_params = {}
        self.input_channels = input_channels
        self.embed = StreamingConv1d(
            input_channels,
            dim,
            kernel_size=kernel_size,
            dilation=1,
            norm=norm,
            norm_kwargs=norm_params,
            causal=causal,
            pad_mode=pad_mode,
        )
        self.norm = nn.LayerNorm(dim, eps=1e-6)
        layer_scale_init_value = layer_scale_init_value or 1 / num_layers
        self.convnext = nn.ModuleList(
            [
                ConvNeXtBlock(
                    dim=dim,
                    mlp_ratio=mlp_ratio,
                    layer_scale_init_value=layer_scale_init_value,
                    kernel_size=kernel_size,
                    norm=norm,
                    norm_params=norm_params,
                    causal=causal,
                    pad_mode=pad_mode,
                )
                for _ in range(num_layers)
            ]
        )
        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
        # Re-initialize every Conv1d / Linear submodule (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Truncated-normal init (std=0.02) for Conv1d/Linear weights, zero bias."""
        if isinstance(m, (nn.Conv1d, nn.Linear)):
            nn.init.trunc_normal_(m.weight, std=0.02)
            # Layers created with bias=False expose bias as None.
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Embed ``x``, run the ConvNeXt stack and apply the final norm.

        Args:
            x: Input features; presumably (batch, input_channels, time),
                since the LayerNorms are applied after swapping dims 1 and 2
                -- confirm against callers.

        Returns:
            Tensor with channels on dim 1 (``dim`` channels).
        """
        x = self.embed(x)
        # nn.LayerNorm normalizes the last dim, so move channels there and back.
        x = self.norm(x.transpose(1, 2))
        x = x.transpose(1, 2)
        for conv_block in self.convnext:
            x = conv_block(x)
        x = self.final_layer_norm(x.transpose(1, 2)).transpose(1, 2)
        return x