Spaces:

warisqr007
/

StreamingVocos_16khz

Sleeping

App Files Files Community

StreamingVocos_16khz / src /components /convnext.py

warisqr007

Add application file

eb9c81a 19 days ago

raw

history blame contribute delete

8.42 kB

	import typing as tp

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	from .conv import StreamingConv1d, TransposedLayerNorm
	from .streaming import StreamingContainer, StreamingAdd
	from .spectrogram import StreamingLogMelSpectrogram
	from ..utils.compile import torch_compile_lazy


	# DropPath copied from timm library
	def drop_path(
	    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
	):
	    """Stochastic depth: randomly zero whole samples of ``x`` along dim 0.

	    Intended for the main path of a residual block. One Bernoulli draw is
	    made per sample (first dimension) and broadcast over all remaining
	    dimensions, so each sample is either kept in full or dropped in full.

	    Args:
	        x: Input tensor; the first dimension is treated as the sample axis.
	        drop_prob: Probability of dropping a sample.
	        training: When False the input is returned untouched.
	        scale_by_keep: When True (and the keep probability is positive),
	            surviving samples are divided by the keep probability.

	    Returns:
	        ``x`` itself when nothing can be dropped, otherwise ``x`` multiplied
	        by the per-sample mask.
	    """
	    # Nothing to do outside training or when the drop rate is zero.
	    if not training or drop_prob == 0.0:
	        return x

	    survival = 1 - drop_prob
	    # One mask entry per sample, with singleton dims so it broadcasts over
	    # tensors of any rank (not just 2D ConvNet feature maps).
	    trailing_ones = (1,) * (x.ndim - 1)
	    mask_shape = (x.shape[0],) + trailing_ones
	    mask = x.new_empty(mask_shape).bernoulli_(survival)
	    if scale_by_keep and survival > 0.0:
	        mask.div_(survival)
	    return x * mask


	class DropPath(nn.Module):
	    """Module wrapper around ``drop_path`` (per-sample stochastic depth).

	    Stores the drop probability and the rescaling flag, and forwards the
	    module's own ``training`` state to ``drop_path`` on every call.
	    """

	    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
	        super().__init__()
	        self.scale_by_keep = scale_by_keep
	        self.drop_prob = drop_prob

	    def forward(self, x):
	        # `self.training` is toggled by nn.Module.train() / .eval().
	        return drop_path(
	            x,
	            drop_prob=self.drop_prob,
	            training=self.training,
	            scale_by_keep=self.scale_by_keep,
	        )

	    def extra_repr(self):
	        # Shown inside the module's repr, rounded to three decimals.
	        rounded = round(self.drop_prob, 3)
	        return "drop_prob={:0.3f}".format(rounded)


	class LayerNorm(nn.Module):
	    r"""Layer normalization over the channel axis, for two data layouts.

	    ``channels_last`` (default) normalizes the last dimension, e.g. inputs of
	    shape (batch, length, channels), by delegating to ``F.layer_norm``.
	    ``channels_first`` normalizes dimension 1, e.g. inputs of shape
	    (batch, channels, length) or (batch, channels, height, width); the
	    statistics are computed by hand and the affine parameters are broadcast
	    over however many trailing dimensions the input has.

	    Args:
	        normalized_shape (int): Number of channels being normalized.
	        eps (float): Value added to the variance for numerical stability.
	        data_format (str): ``"channels_last"`` or ``"channels_first"``.

	    Raises:
	        NotImplementedError: If ``data_format`` is not one of the two
	            supported layouts.
	    """

	    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
	        super().__init__()
	        # Validate up front so no parameters are created for a bad layout.
	        if data_format not in ("channels_last", "channels_first"):
	            raise NotImplementedError(
	                f"Unsupported data_format: {data_format!r}"
	            )
	        self.weight = nn.Parameter(torch.ones(normalized_shape))
	        self.bias = nn.Parameter(torch.zeros(normalized_shape))
	        self.eps = eps
	        self.data_format = data_format
	        # F.layer_norm expects a shape tuple, not a bare int.
	        self.normalized_shape = (normalized_shape,)

	    def forward(self, x):
	        if self.data_format == "channels_last":
	            return F.layer_norm(
	                x, self.normalized_shape, self.weight, self.bias, self.eps
	            )

	        # channels_first: biased mean/variance over dim 1 (the channel axis).
	        mean = x.mean(1, keepdim=True)
	        var = (x - mean).pow(2).mean(1, keepdim=True)
	        x = (x - mean) / torch.sqrt(var + self.eps)
	        # Reshape the per-channel affine terms to (C, 1, ..., 1) so they line
	        # up with dim 1 for any input rank; for (N, C, L) this is exactly
	        # the former `[:, None]` indexing.
	        affine_shape = (-1,) + (1,) * (x.ndim - 2)
	        return self.weight.view(affine_shape) * x + self.bias.view(affine_shape)


	# ConvNeXt Block copied from https://github.com/fishaudio/fish-diffusion/blob/main/fish_diffusion/modules/convnext.py
	class ConvNeXtBlock(StreamingContainer):
	    r"""1-D ConvNeXt block built around a streaming depthwise convolution.

	    Processing order, for an input laid out as (N, C, L):
	    depthwise StreamingConv1d -> permute to (N, L, C) -> LayerNorm
	    (channels_last) -> Linear -> GELU -> Linear -> optional layer scale ->
	    permute back to (N, C, L) -> DropPath -> optional residual add.
	    The pointwise (1x1) convolutions are implemented as Linear layers on
	    the channels-last layout.

	    Args:
	        dim (int): Number of input channels.
	        drop_path (float): Stochastic depth rate. Default: 0.0
	        layer_scale_init_value (float): Init value for Layer Scale; a value
	            that is not > 0 disables layer scale. Default: 1e-6.
	        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
	        kernel_size (int): Kernel size for depthwise conv. Default: 7.
	        dilation (int): Dilation for depthwise conv. Default: 1.
	        norm (str): Accepted but not used by this block: the depthwise conv
	            is always constructed with ``norm="weight_norm"``.
	            NOTE(review): confirm that ignoring this argument is intended.
	        norm_params (dict): Passed (as a copy) to the depthwise conv as
	            ``norm_kwargs``.
	        causal (bool): Forwarded to the depthwise StreamingConv1d.
	        pad_mode (str): Forwarded to the depthwise StreamingConv1d.
	    """

	    def __init__(
	        self,
	        dim: int,
	        drop_path: float = 0.0,
	        layer_scale_init_value: float = 1e-6,
	        mlp_ratio: float = 4.0,
	        kernel_size: int = 7,
	        dilation: int = 1,
	        norm: str = "none",
	        norm_params: tp.Dict[str, tp.Any] = {},
	        causal: bool = False,
	        pad_mode: str = "reflect",
	    ):
	        super().__init__()

	        # Give the conv its own dict: the default `{}` is a single object
	        # shared by every call, and a caller's dict should not be aliased.
	        conv_norm_kwargs = None if norm_params is None else dict(norm_params)

	        # Depthwise conv: groups == channels, one filter per channel.
	        self.dwconv = StreamingConv1d(
	            dim,
	            dim,
	            kernel_size=kernel_size,
	            dilation=dilation,
	            norm="weight_norm",
	            norm_kwargs=conv_norm_kwargs,
	            causal=causal,
	            pad_mode=pad_mode,
	            groups=dim,
	        )

	        hidden_dim = int(mlp_ratio * dim)
	        self.norm = LayerNorm(dim, eps=1e-6)
	        # pointwise/1x1 convs, implemented with linear layers
	        self.pwconv1 = nn.Linear(dim, hidden_dim)
	        self.act = nn.GELU()
	        self.pwconv2 = nn.Linear(hidden_dim, dim)
	        # Learnable per-channel layer scale; omitted when the init value
	        # is not positive.
	        self.gamma = (
	            nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True)
	            if layer_scale_init_value > 0
	            else None
	        )
	        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
	        self.add = StreamingAdd()

	    def forward(self, x, apply_residual: bool = True):
	        """Run the block on ``x`` laid out as (N, C, L).

	        Args:
	            x: Input tensor, channels on dim 1.
	            apply_residual: When True, the block input is added back to the
	                block output through ``self.add``.

	        Returns:
	            Tensor in the same (N, C, L) layout.
	        """
	        residual = x

	        x = self.dwconv(x)
	        x = x.permute(0, 2, 1)  # (N, C, L) -> (N, L, C)
	        x = self.norm(x)
	        x = self.pwconv1(x)
	        x = self.act(x)
	        x = self.pwconv2(x)

	        if self.gamma is not None:
	            x = self.gamma * x

	        x = x.permute(0, 2, 1)  # (N, L, C) -> (N, C, L)
	        x = self.drop_path(x)

	        if apply_residual:
	            # StreamingAdd stands in for a plain `residual + x`; presumably
	            # so the sum is handled correctly in streaming mode — confirm
	            # against the StreamingAdd implementation.
	            x = self.add(residual, x)

	        return x


	class VocosBackbone(StreamingContainer):
	    """
	    Vocos backbone module built with ConvNeXt blocks.

	    An embedding StreamingConv1d maps the input features to ``dim``
	    channels, followed by a LayerNorm, ``num_layers`` ConvNeXtBlock layers
	    and a final LayerNorm. Both LayerNorms act on the channel axis (the
	    tensor is transposed around them).

	    Args:
	        input_channels (int): Number of input features channels.
	        dim (int): Hidden dimension of the model.
	        mlp_ratio (float): Ratio of the ConvNeXtBlock hidden dim to ``dim``.
	        kernel_size (int): Kernel size of the embedding conv and of every
	            ConvNeXtBlock.
	        dilation (int): Accepted but currently unused: the embedding conv is
	            built with ``dilation=1`` and the value is not forwarded to the
	            blocks. NOTE(review): confirm whether this is intended.
	        norm (str): Norm type for the embedding conv; also forwarded to
	            every ConvNeXtBlock.
	        norm_params (dict): Norm keyword arguments for the embedding conv;
	            also forwarded to every ConvNeXtBlock. Copied on entry.
	        causal (bool): Forwarded to the embedding conv and to every block.
	        pad_mode (str): Forwarded to the embedding conv and to every block.
	        num_layers (int): Number of ConvNeXtBlock layers.
	        layer_scale_init_value (float): Layer-scale init for every block; a
	            falsy value (0 or None) is replaced by ``1 / num_layers``.
	    """

	    def __init__(
	        self,
	        input_channels: int = 80,
	        dim: int = 512,
	        mlp_ratio: float = 3.0,
	        kernel_size: int = 7,
	        dilation: int = 1,
	        norm: str = "none",
	        norm_params: tp.Dict[str, tp.Any] = {},
	        causal: bool = False,
	        pad_mode: str = "reflect",
	        num_layers: int = 8,
	        layer_scale_init_value: float = 1e-6
	    ):
	        super().__init__()
	        # Work on a private copy: the default `{}` is one object shared by
	        # every call, and the caller's dict should not be aliased by the
	        # sub-modules it is handed to.
	        norm_params = None if norm_params is None else dict(norm_params)

	        self.input_channels = input_channels
	        self.embed = StreamingConv1d(
	            input_channels,
	            dim,
	            kernel_size=kernel_size,
	            dilation=1,
	            norm=norm,
	            norm_kwargs=norm_params,
	            causal=causal,
	            pad_mode=pad_mode,
	        )
	        self.norm = nn.LayerNorm(dim, eps=1e-6)

	        # Fall back to 1/num_layers when no (or a zero) init value is given.
	        layer_scale_init_value = layer_scale_init_value or 1 / num_layers
	        self.convnext = nn.ModuleList(
	            [
	                ConvNeXtBlock(
	                    dim=dim,
	                    mlp_ratio=mlp_ratio,
	                    layer_scale_init_value=layer_scale_init_value,
	                    kernel_size=kernel_size,
	                    norm=norm,
	                    norm_params=norm_params,
	                    causal=causal,
	                    pad_mode=pad_mode,
	                )
	                for _ in range(num_layers)
	            ]
	        )
	        self.final_layer_norm = nn.LayerNorm(dim, eps=1e-6)
	        # nn.Module.apply visits every submodule (and self) recursively.
	        self.apply(self._init_weights)

	    def _init_weights(self, m):
	        """Truncated-normal (std 0.02) weights and zero bias for Conv1d/Linear."""
	        if isinstance(m, (nn.Conv1d, nn.Linear)):
	            nn.init.trunc_normal_(m.weight, std=0.02)
	            # Layers created with bias=False expose `bias` as None.
	            if m.bias is not None:
	                nn.init.constant_(m.bias, 0)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Embed ``x``, run all ConvNeXt blocks and apply the final norm.

	        The transposes swap dims 1 and 2 so that nn.LayerNorm, which
	        normalizes the last dimension, acts on the channel axis; the result
	        is returned with channels back on dim 1.
	        """
	        x = self.embed(x)
	        x = self.norm(x.transpose(1, 2))
	        x = x.transpose(1, 2)
	        for conv_block in self.convnext:
	            x = conv_block(x)
	        x = self.final_layer_norm(x.transpose(1, 2)).transpose(1, 2)
	        return x