| import math |
from typing import Any, Dict, Optional
|
|
| import torch |
| import torch.nn as nn |
| import torch.nn.functional as F |
| from diffusers.models.attention import (GEGLU, GELU, AdaLayerNorm, |
| AdaLayerNormZero, ApproximateGELU) |
| from diffusers.models.attention_processor import Attention |
| from diffusers.models.lora import LoRACompatibleLinear |
| from diffusers.utils.torch_utils import maybe_allow_in_graph |
| from einops import pack, rearrange, repeat |
|
|
| from flashcosyvoice.modules.flow_components.upsample_encoder import \ |
| add_optional_chunk_mask |
|
|
|
|
| def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor: |
| assert mask.dtype == torch.bool |
| assert dtype in [torch.float32, torch.bfloat16, torch.float16] |
    mask = mask.to(dtype)
    # Turn the boolean "keep" mask into an additive attention bias: kept positions
    # become 0.0, masked positions a large negative value so softmax zeroes them out.
    mask = (1.0 - mask) * -1.0e+10
| return mask |
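
# Illustrative usage of mask_to_bias (a sketch, not executed): the decoders below
# build a (B, T, T) boolean attention mask and convert it to an additive bias
# before handing it to BasicTransformerBlock:
#   keep = torch.ones(2, 10, 10, dtype=torch.bool)   # True = attend
#   bias = mask_to_bias(keep, torch.float32)         # 0.0 where True, -1e10 where False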
|
|
|
|
| class SnakeBeta(nn.Module): |
| """ |
| A modified Snake function which uses separate parameters for the magnitude of the periodic components |
| Shape: |
| - Input: (B, C, T) |
| - Output: (B, C, T), same shape as the input |
| Parameters: |
| - alpha - trainable parameter that controls frequency |
| - beta - trainable parameter that controls magnitude |
| References: |
        - This activation function is a modified version of the Snake activation proposed by Liu Ziyin, Tilman Hartwig, and Masahito Ueda:
| https://arxiv.org/abs/2006.08195 |
| Examples: |
        >>> a1 = SnakeBeta(256, 256)
| >>> x = torch.randn(256) |
| >>> x = a1(x) |
| |
| Args: |
| in_features: shape of the input |
| out_features: shape of the output |
| alpha: trainable parameter that controls frequency |
| alpha_trainable: whether alpha is trainable |
| alpha_logscale: whether to use log scale for alpha |
        alpha is initialized to 1 by default; higher values give higher frequency.
        beta is initialized to 1 by default; higher values give larger magnitude.
| alpha will be trained along with the rest of your model. |
| """ |
|
|
| def __init__(self, in_features, out_features, alpha=1.0, alpha_trainable=True, alpha_logscale=True): |
| super().__init__() |
| self.in_features = out_features if isinstance(out_features, list) else [out_features] |
| self.proj = LoRACompatibleLinear(in_features, out_features) |
|
|
| |
| self.alpha_logscale = alpha_logscale |
| if self.alpha_logscale: |
| self.alpha = nn.Parameter(torch.zeros(self.in_features) * alpha) |
| self.beta = nn.Parameter(torch.zeros(self.in_features) * alpha) |
| else: |
| self.alpha = nn.Parameter(torch.ones(self.in_features) * alpha) |
| self.beta = nn.Parameter(torch.ones(self.in_features) * alpha) |
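        # With alpha_logscale=True, alpha and beta are stored in log space and start at 0,
        # so exp(0) = 1 at the first forward pass; with alpha_logscale=False they start
        # directly at `alpha` (1.0 by default). The `* alpha` factor on torch.zeros is a
        # no-op retained from the reference implementation.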
|
|
| self.alpha.requires_grad = alpha_trainable |
| self.beta.requires_grad = alpha_trainable |
|
|
| self.no_div_by_zero = 0.000000001 |
|
|
| def forward(self, x): |
| """ |
| Forward pass of the function. |
| Applies the function to the input elementwise. |
        SnakeBeta := x + 1/beta * sin^2(alpha * x)
| """ |
| x = self.proj(x) |
| if self.alpha_logscale: |
| alpha = torch.exp(self.alpha) |
| beta = torch.exp(self.beta) |
| else: |
| alpha = self.alpha |
| beta = self.beta |
|
|
| x = x + (1.0 / (beta + self.no_div_by_zero)) * torch.pow(torch.sin(x * alpha), 2) |
|
|
| return x |
|
|
|
|
| class FeedForward(nn.Module): |
| r""" |
| A feed-forward layer. |
| |
| Parameters: |
| dim (`int`): The number of channels in the input. |
| dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. |
| mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. |
| dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. |
| activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. |
| final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. |
| """ |
|
|
| def __init__( |
| self, |
| dim: int, |
| dim_out: Optional[int] = None, |
| mult: int = 4, |
| dropout: float = 0.0, |
| activation_fn: str = "geglu", |
| final_dropout: bool = False, |
| ): |
| super().__init__() |
| inner_dim = int(dim * mult) |
| dim_out = dim_out if dim_out is not None else dim |
|
|
        if activation_fn == "gelu":
            act_fn = GELU(dim, inner_dim)
        elif activation_fn == "gelu-approximate":
            act_fn = GELU(dim, inner_dim, approximate="tanh")
        elif activation_fn == "geglu":
            act_fn = GEGLU(dim, inner_dim)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim)
        elif activation_fn == "snakebeta":
            act_fn = SnakeBeta(dim, inner_dim)
        else:
            raise ValueError(f"Unsupported activation_fn: {activation_fn}")
|
|
        self.net = nn.ModuleList([])
        # project in
        self.net.append(act_fn)
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
        self.net.append(LoRACompatibleLinear(inner_dim, dim_out))
        # FF as used in Vision Transformer, MLP-Mixer and related networks
| if final_dropout: |
| self.net.append(nn.Dropout(dropout)) |
|
|
| def forward(self, hidden_states): |
| for module in self.net: |
| hidden_states = module(hidden_states) |
| return hidden_states |
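
# Minimal usage sketch for FeedForward (dims are illustrative, not executed):
#   ff = FeedForward(dim=256, activation_fn="snakebeta")
#   y = ff(torch.randn(2, 50, 256))   # (batch, time, channels) -> same shape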
|
|
|
|
| @maybe_allow_in_graph |
| class BasicTransformerBlock(nn.Module): |
| r""" |
| A basic Transformer block. |
| |
| Parameters: |
| dim (`int`): The number of channels in the input and output. |
| num_attention_heads (`int`): The number of heads to use for multi-head attention. |
| attention_head_dim (`int`): The number of channels in each head. |
| dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. |
| cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention. |
| only_cross_attention (`bool`, *optional*): |
| Whether to use only cross-attention layers. In this case two cross attention layers are used. |
| double_self_attention (`bool`, *optional*): |
| Whether to use two self-attention layers. In this case no cross attention layers are used. |
| activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. |
        num_embeds_ada_norm (`int`, *optional*):
            The number of diffusion steps used during training. See `Transformer2DModel`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Configure if the attentions should contain a bias parameter.
| """ |
|
|
| def __init__( |
| self, |
| dim: int, |
| num_attention_heads: int, |
| attention_head_dim: int, |
| dropout=0.0, |
| cross_attention_dim: Optional[int] = None, |
| activation_fn: str = "geglu", |
| num_embeds_ada_norm: Optional[int] = None, |
| attention_bias: bool = False, |
| only_cross_attention: bool = False, |
| double_self_attention: bool = False, |
| upcast_attention: bool = False, |
| norm_elementwise_affine: bool = True, |
| norm_type: str = "layer_norm", |
| final_dropout: bool = False, |
| ): |
| super().__init__() |
| self.only_cross_attention = only_cross_attention |
|
|
| self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero" |
| self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm" |
|
|
| if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: |
| raise ValueError( |
| f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to" |
| f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}." |
| ) |
|
|
| |
| |
        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if self.use_ada_layer_norm:
| self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) |
| elif self.use_ada_layer_norm_zero: |
| self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm) |
| else: |
| self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) |
| self.attn1 = Attention( |
| query_dim=dim, |
| heads=num_attention_heads, |
| dim_head=attention_head_dim, |
| dropout=dropout, |
| bias=attention_bias, |
| cross_attention_dim=cross_attention_dim if only_cross_attention else None, |
| upcast_attention=upcast_attention, |
| ) |
|
|
| |
        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
| |
| |
| |
| self.norm2 = ( |
| AdaLayerNorm(dim, num_embeds_ada_norm) |
| if self.use_ada_layer_norm |
| else nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine) |
| ) |
| self.attn2 = Attention( |
| query_dim=dim, |
| cross_attention_dim=cross_attention_dim if not double_self_attention else None, |
| heads=num_attention_heads, |
| dim_head=attention_head_dim, |
| dropout=dropout, |
| bias=attention_bias, |
| upcast_attention=upcast_attention, |
| |
| ) |
| else: |
| self.norm2 = None |
| self.attn2 = None |
|
|
| |
        # 3. Feed-forward
        self.norm3 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine)
| self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout) |
|
|
| |
        # let chunk size default to None
        self._chunk_size = None
| self._chunk_dim = 0 |
|
|
| def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int): |
| |
| self._chunk_size = chunk_size |
| self._chunk_dim = dim |
|
|
| def forward( |
| self, |
| hidden_states: torch.FloatTensor, |
| attention_mask: Optional[torch.FloatTensor] = None, |
| encoder_hidden_states: Optional[torch.FloatTensor] = None, |
| encoder_attention_mask: Optional[torch.FloatTensor] = None, |
| timestep: Optional[torch.LongTensor] = None, |
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
| class_labels: Optional[torch.LongTensor] = None, |
| ): |
| |
| |
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 1. Self-Attention
        if self.use_ada_layer_norm:
| norm_hidden_states = self.norm1(hidden_states, timestep) |
| elif self.use_ada_layer_norm_zero: |
| norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1( |
| hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype |
| ) |
| else: |
| norm_hidden_states = self.norm1(hidden_states) |
|
|
| cross_attention_kwargs = cross_attention_kwargs if cross_attention_kwargs is not None else {} |
|
|
| attn_output = self.attn1( |
| norm_hidden_states, |
| encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None, |
| attention_mask=encoder_attention_mask if self.only_cross_attention else attention_mask, |
| **cross_attention_kwargs, |
| ) |
| if self.use_ada_layer_norm_zero: |
| attn_output = gate_msa.unsqueeze(1) * attn_output |
| hidden_states = attn_output + hidden_states |
|
|
| |
        # 2. Cross-Attention
        if self.attn2 is not None:
| norm_hidden_states = ( |
| self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states) |
| ) |
|
|
| attn_output = self.attn2( |
| norm_hidden_states, |
| encoder_hidden_states=encoder_hidden_states, |
| attention_mask=encoder_attention_mask, |
| **cross_attention_kwargs, |
| ) |
| hidden_states = attn_output + hidden_states |
|
|
| |
        # 3. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)
|
|
| if self.use_ada_layer_norm_zero: |
| norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] |
|
|
| if self._chunk_size is not None: |
| |
            # "feed_forward_chunk_size" can be used to save memory
            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
| raise ValueError( |
| f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`." |
| ) |
|
|
| num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size |
| ff_output = torch.cat( |
| [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)], |
| dim=self._chunk_dim, |
| ) |
| else: |
| ff_output = self.ff(norm_hidden_states) |
|
|
| if self.use_ada_layer_norm_zero: |
| ff_output = gate_mlp.unsqueeze(1) * ff_output |
|
|
| hidden_states = ff_output + hidden_states |
|
|
| return hidden_states |
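
# Usage sketch (illustrative dims, not executed): the decoders below drive this block
# with an additive attention bias produced by mask_to_bias. With the default
# norm_type="layer_norm", the `timestep` argument is accepted but only used by the
# ada_norm variants.
#   block = BasicTransformerBlock(dim=256, num_attention_heads=4, attention_head_dim=64)
#   x = torch.randn(2, 50, 256)
#   bias = mask_to_bias(torch.ones(2, 50, 50, dtype=torch.bool), x.dtype)
#   y = block(hidden_states=x, attention_mask=bias)   # (2, 50, 256)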
|
|
|
|
| class SinusoidalPosEmb(torch.nn.Module): |
| def __init__(self, dim): |
| super().__init__() |
| self.dim = dim |
| assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even" |
|
|
| def forward(self, x, scale=1000): |
| if x.ndim < 1: |
| x = x.unsqueeze(0) |
| device = x.device |
| half_dim = self.dim // 2 |
| emb = math.log(10000) / (half_dim - 1) |
| emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb) |
| emb = scale * x.unsqueeze(1) * emb.unsqueeze(0) |
| emb = torch.cat((emb.sin(), emb.cos()), dim=-1) |
| return emb |
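
# SinusoidalPosEmb maps a (batch,) tensor of timesteps to a (batch, dim) embedding,
# concatenating sin and cos components at geometrically spaced frequencies (scaled by `scale`).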
|
|
|
|
| class Block1D(torch.nn.Module): |
| def __init__(self, dim, dim_out, groups=8): |
| super().__init__() |
| self.block = torch.nn.Sequential( |
| torch.nn.Conv1d(dim, dim_out, 3, padding=1), |
| torch.nn.GroupNorm(groups, dim_out), |
| nn.Mish(), |
| ) |
|
|
| def forward(self, x, mask): |
| output = self.block(x * mask) |
| return output * mask |
|
|
|
|
| class ResnetBlock1D(torch.nn.Module): |
| def __init__(self, dim, dim_out, time_emb_dim, groups=8): |
| super().__init__() |
| self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out)) |
|
|
| self.block1 = Block1D(dim, dim_out, groups=groups) |
| self.block2 = Block1D(dim_out, dim_out, groups=groups) |
|
|
| self.res_conv = torch.nn.Conv1d(dim, dim_out, 1) |
|
|
| def forward(self, x, mask, time_emb): |
| h = self.block1(x, mask) |
| h += self.mlp(time_emb).unsqueeze(-1) |
| h = self.block2(h, mask) |
| output = h + self.res_conv(x * mask) |
| return output |
|
|
|
|
| class Downsample1D(nn.Module): |
| def __init__(self, dim): |
| super().__init__() |
| self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1) |
|
|
| def forward(self, x): |
| return self.conv(x) |
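
# Downsample1D halves the time axis (stride-2 conv, output length = ceil(T / 2)),
# which matches the `mask_down[:, :, ::2]` bookkeeping in the decoders below.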
|
|
|
|
| class TimestepEmbedding(nn.Module): |
| def __init__( |
| self, |
| in_channels: int, |
| time_embed_dim: int, |
| act_fn: str = "silu", |
        out_dim: Optional[int] = None,
| post_act_fn: Optional[str] = None, |
| cond_proj_dim=None, |
| ): |
| super().__init__() |
| assert act_fn == "silu", "act_fn must be silu" |
|
|
| self.linear_1 = nn.Linear(in_channels, time_embed_dim) |
|
|
| if cond_proj_dim is not None: |
| self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) |
| else: |
| self.cond_proj = None |
|
|
| self.act = nn.SiLU() |
|
|
| if out_dim is not None: |
| time_embed_dim_out = out_dim |
| else: |
| time_embed_dim_out = time_embed_dim |
| self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out) |
|
|
| if post_act_fn is None: |
| self.post_act = None |
| else: |
| self.post_act = nn.SiLU() |
|
|
| def forward(self, sample, condition=None): |
| if condition is not None: |
| sample = sample + self.cond_proj(condition) |
| sample = self.linear_1(sample) |
|
|
| if self.act is not None: |
| sample = self.act(sample) |
|
|
| sample = self.linear_2(sample) |
|
|
| if self.post_act is not None: |
| sample = self.post_act(sample) |
| return sample |
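
# Usage sketch (dims are illustrative, not executed): the decoders pair this with
# SinusoidalPosEmb to build the conditioning vector consumed by ResnetBlock1D:
#   pos = SinusoidalPosEmb(320)
#   mlp = TimestepEmbedding(in_channels=320, time_embed_dim=1024)
#   t_emb = mlp(pos(torch.rand(4)))   # (4, 1024)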
|
|
|
|
| class Upsample1D(nn.Module): |
| """A 1D upsampling layer with an optional convolution. |
| |
| Parameters: |
| channels (`int`): |
| number of channels in the inputs and outputs. |
        use_conv (`bool`, default `False`):
            option to use a convolution.
        use_conv_transpose (`bool`, default `True`):
            option to use a transposed convolution.
| out_channels (`int`, optional): |
| number of output channels. Defaults to `channels`. |
| """ |
|
|
| def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"): |
| super().__init__() |
| self.channels = channels |
| self.out_channels = out_channels or channels |
| self.use_conv = use_conv |
| self.use_conv_transpose = use_conv_transpose |
| self.name = name |
|
|
| self.conv = None |
| if use_conv_transpose: |
| self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1) |
| elif use_conv: |
| self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1) |
|
|
| def forward(self, inputs): |
| assert inputs.shape[1] == self.channels |
| if self.use_conv_transpose: |
| return self.conv(inputs) |
|
|
| outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest") |
|
|
| if self.use_conv: |
| outputs = self.conv(outputs) |
|
|
| return outputs |
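
# With use_conv_transpose=True (the default), Upsample1D doubles the time axis exactly:
# ConvTranspose1d(kernel_size=4, stride=2, padding=1) maps length T to 2 * T.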
|
|
|
|
| class Transpose(torch.nn.Module): |
| def __init__(self, dim0: int, dim1: int): |
| super().__init__() |
| self.dim0 = dim0 |
| self.dim1 = dim1 |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| x = torch.transpose(x, self.dim0, self.dim1) |
| return x |
|
|
|
|
| class CausalConv1d(torch.nn.Conv1d): |
| def __init__( |
| self, |
| in_channels: int, |
| out_channels: int, |
| kernel_size: int, |
| stride: int = 1, |
| dilation: int = 1, |
| groups: int = 1, |
| bias: bool = True, |
| padding_mode: str = 'zeros', |
| device=None, |
| dtype=None |
| ) -> None: |
| super(CausalConv1d, self).__init__(in_channels, out_channels, |
| kernel_size, stride, |
| padding=0, dilation=dilation, |
| groups=groups, bias=bias, |
| padding_mode=padding_mode, |
| device=device, dtype=dtype) |
| assert stride == 1 |
| self.causal_padding = kernel_size - 1 |
|
|
| def forward(self, x: torch.Tensor) -> torch.Tensor: |
| x = F.pad(x, (self.causal_padding, 0), value=0.0) |
| x = super(CausalConv1d, self).forward(x) |
| return x |
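
# CausalConv1d pads only on the left by (kernel_size - 1), so the output length equals
# the input length and output[t] never depends on inputs after t, which is the property
# the causal (streaming-friendly) decoder below relies on.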
|
|
|
|
| class CausalBlock1D(Block1D): |
| def __init__(self, dim: int, dim_out: int): |
| super(CausalBlock1D, self).__init__(dim, dim_out) |
| self.block = torch.nn.Sequential( |
| CausalConv1d(dim, dim_out, 3), |
| Transpose(1, 2), |
| nn.LayerNorm(dim_out), |
| Transpose(1, 2), |
| nn.Mish(), |
| ) |
|
|
    def forward(self, x: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
| output = self.block(x * mask) |
| return output * mask |
|
|
|
|
| class CausalResnetBlock1D(ResnetBlock1D): |
| def __init__(self, dim: int, dim_out: int, time_emb_dim: int, groups: int = 8): |
| super(CausalResnetBlock1D, self).__init__(dim, dim_out, time_emb_dim, groups) |
| self.block1 = CausalBlock1D(dim, dim_out) |
| self.block2 = CausalBlock1D(dim_out, dim_out) |
|
|
|
|
| class ConditionalDecoder(nn.Module): |
| """ |
    This decoder requires an input with the same shape as the target. If your text content
    is shorter or longer than the output, please resample it before feeding it to the decoder.
| |
| Args: |
| in_channels: number of input channels |
| out_channels: number of output channels |
| channels: tuple of channel dimensions |
| dropout: dropout rate |
| attention_head_dim: dimension of attention heads |
| n_blocks: number of transformer blocks |
| num_mid_blocks: number of middle blocks |
| num_heads: number of attention heads |
| act_fn: activation function name |
| """ |
|
|
| def __init__( |
| self, |
| in_channels, |
| out_channels, |
| channels=(256, 256), |
| dropout=0.05, |
| attention_head_dim=64, |
| n_blocks=1, |
| num_mid_blocks=2, |
| num_heads=4, |
| act_fn="snake", |
| ): |
| super().__init__() |
| channels = tuple(channels) |
| self.in_channels = in_channels |
| self.out_channels = out_channels |
|
|
| self.time_embeddings = SinusoidalPosEmb(in_channels) |
| time_embed_dim = channels[0] * 4 |
| self.time_mlp = TimestepEmbedding( |
| in_channels=in_channels, |
| time_embed_dim=time_embed_dim, |
| act_fn="silu", |
| ) |
| self.down_blocks = nn.ModuleList([]) |
| self.mid_blocks = nn.ModuleList([]) |
| self.up_blocks = nn.ModuleList([]) |
|
|
| output_channel = in_channels |
| for i in range(len(channels)): |
| input_channel = output_channel |
| output_channel = channels[i] |
| is_last = i == len(channels) - 1 |
| resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| downsample = ( |
| Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1) |
| ) |
| self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) |
|
|
        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            output_channel = channels[-1]
            resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
|
|
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
|
|
| self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) |
|
|
| channels = channels[::-1] + (channels[0],) |
| for i in range(len(channels) - 1): |
| input_channel = channels[i] * 2 |
| output_channel = channels[i + 1] |
| is_last = i == len(channels) - 2 |
| resnet = ResnetBlock1D( |
| dim=input_channel, |
| dim_out=output_channel, |
| time_emb_dim=time_embed_dim, |
| ) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| upsample = ( |
| Upsample1D(output_channel, use_conv_transpose=True) |
| if not is_last |
| else nn.Conv1d(output_channel, output_channel, 3, padding=1) |
| ) |
| self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) |
| self.final_block = Block1D(channels[-1], channels[-1]) |
| self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) |
| self.initialize_weights() |
|
|
| def initialize_weights(self): |
| for m in self.modules(): |
| if isinstance(m, nn.Conv1d): |
| nn.init.kaiming_normal_(m.weight, nonlinearity="relu") |
| if m.bias is not None: |
| nn.init.constant_(m.bias, 0) |
| elif isinstance(m, nn.GroupNorm): |
| nn.init.constant_(m.weight, 1) |
| nn.init.constant_(m.bias, 0) |
| elif isinstance(m, nn.Linear): |
| nn.init.kaiming_normal_(m.weight, nonlinearity="relu") |
| if m.bias is not None: |
| nn.init.constant_(m.bias, 0) |
|
|
| def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): |
| """Forward pass of the UNet1DConditional model. |
| |
| Args: |
| x (torch.Tensor): shape (batch_size, in_channels, time) |
| mask (_type_): shape (batch_size, 1, time) |
| t (_type_): shape (batch_size) |
| spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. |
| cond (_type_, optional): placeholder for future use. Defaults to None. |
| |
| Raises: |
| ValueError: _description_ |
| ValueError: _description_ |
| |
| Returns: |
| _type_: _description_ |
| """ |
|
|
| t = self.time_embeddings(t).to(t.dtype) |
| t = self.time_mlp(t) |
|
|
| x = pack([x, mu], "b * t")[0] |
|
|
| if spks is not None: |
| spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) |
| x = pack([x, spks], "b * t")[0] |
| if cond is not None: |
| x = pack([x, cond], "b * t")[0] |
|
|
| hiddens = [] |
| masks = [mask] |
| for resnet, transformer_blocks, downsample in self.down_blocks: |
| mask_down = masks[-1] |
| x = resnet(x, mask_down, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| hiddens.append(x) |
| x = downsample(x * mask_down) |
| masks.append(mask_down[:, :, ::2]) |
| masks = masks[:-1] |
| mask_mid = masks[-1] |
|
|
| for resnet, transformer_blocks in self.mid_blocks: |
| x = resnet(x, mask_mid, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
|
|
| for resnet, transformer_blocks, upsample in self.up_blocks: |
| mask_up = masks.pop() |
| skip = hiddens.pop() |
| x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] |
| x = resnet(x, mask_up, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| x = upsample(x * mask_up) |
| x = self.final_block(x, mask_up) |
| output = self.final_proj(x * mask_up) |
| return output * mask |
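
# Call sketch for ConditionalDecoder (shapes are illustrative, not executed).
# `x` and `mu` are packed along the channel axis, so in_channels must equal their
# combined channel count (plus spks/cond channels when those are provided):
#   dec = ConditionalDecoder(in_channels=160, out_channels=80, channels=(256, 256), act_fn="gelu")
#   x = torch.randn(2, 80, 100); mu = torch.randn(2, 80, 100)
#   mask = torch.ones(2, 1, 100)
#   y = dec(x, mask, mu, t=torch.rand(2))   # (2, 80, 100)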
|
|
|
|
| class CausalConditionalDecoder(ConditionalDecoder): |
| """ |
    This decoder requires an input with the same shape as the target. If your text content
    is shorter or longer than the output, please resample it before feeding it to the decoder.
| |
| Args: |
| in_channels: number of input channels |
| out_channels: number of output channels |
| channels: list of channel dimensions |
| dropout: dropout rate |
| attention_head_dim: dimension of attention heads |
| n_blocks: number of transformer blocks |
| num_mid_blocks: number of middle blocks |
| num_heads: number of attention heads |
| act_fn: activation function name |
| static_chunk_size: size of static chunks |
| num_decoding_left_chunks: number of left chunks for decoding |
| """ |
|
|
| def __init__( |
| self, |
| in_channels=320, |
| out_channels=80, |
| channels=[256], |
| dropout=0.0, |
| attention_head_dim=64, |
| n_blocks=4, |
| num_mid_blocks=12, |
| num_heads=8, |
| act_fn="gelu", |
| static_chunk_size=50, |
| num_decoding_left_chunks=-1, |
| ): |
| torch.nn.Module.__init__(self) |
| channels = tuple(channels) |
| self.in_channels = in_channels |
| self.out_channels = out_channels |
| self.time_embeddings = SinusoidalPosEmb(in_channels) |
| time_embed_dim = channels[0] * 4 |
| self.time_mlp = TimestepEmbedding( |
| in_channels=in_channels, |
| time_embed_dim=time_embed_dim, |
| act_fn="silu", |
| ) |
| self.static_chunk_size = static_chunk_size |
| self.num_decoding_left_chunks = num_decoding_left_chunks |
| self.down_blocks = nn.ModuleList([]) |
| self.mid_blocks = nn.ModuleList([]) |
| self.up_blocks = nn.ModuleList([]) |
|
|
| output_channel = in_channels |
| for i in range(len(channels)): |
| input_channel = output_channel |
| output_channel = channels[i] |
| is_last = i == len(channels) - 1 |
| resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| downsample = ( |
| Downsample1D(output_channel) if not is_last else CausalConv1d(output_channel, output_channel, 3) |
| ) |
| self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample])) |
|
|
        for _ in range(num_mid_blocks):
            input_channel = channels[-1]
            output_channel = channels[-1]
            resnet = CausalResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
|
|
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
|
|
| self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks])) |
|
|
| channels = channels[::-1] + (channels[0],) |
| for i in range(len(channels) - 1): |
| input_channel = channels[i] * 2 |
| output_channel = channels[i + 1] |
| is_last = i == len(channels) - 2 |
| resnet = CausalResnetBlock1D( |
| dim=input_channel, |
| dim_out=output_channel, |
| time_emb_dim=time_embed_dim, |
| ) |
| transformer_blocks = nn.ModuleList( |
| [ |
| BasicTransformerBlock( |
| dim=output_channel, |
| num_attention_heads=num_heads, |
| attention_head_dim=attention_head_dim, |
| dropout=dropout, |
| activation_fn=act_fn, |
| ) |
| for _ in range(n_blocks) |
| ] |
| ) |
| upsample = ( |
| Upsample1D(output_channel, use_conv_transpose=True) |
| if not is_last |
| else CausalConv1d(output_channel, output_channel, 3) |
| ) |
| self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample])) |
| self.final_block = CausalBlock1D(channels[-1], channels[-1]) |
| self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1) |
| self.initialize_weights() |
|
|
| def forward(self, x, mask, mu, t, spks=None, cond=None, streaming=False): |
| """Forward pass of the UNet1DConditional model. |
| |
| Args: |
| x (torch.Tensor): shape (batch_size, in_channels, time) |
| mask (_type_): shape (batch_size, 1, time) |
| t (_type_): shape (batch_size) |
| spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None. |
| cond (_type_, optional): placeholder for future use. Defaults to None. |
| |
| Raises: |
| ValueError: _description_ |
| ValueError: _description_ |
| |
| Returns: |
| _type_: _description_ |
| """ |
| t = self.time_embeddings(t).to(t.dtype) |
| t = self.time_mlp(t) |
|
|
| x = pack([x, mu], "b * t")[0] |
|
|
| if spks is not None: |
| spks = repeat(spks, "b c -> b c t", t=x.shape[-1]) |
| x = pack([x, spks], "b * t")[0] |
| if cond is not None: |
| x = pack([x, cond], "b * t")[0] |
|
|
| hiddens = [] |
| masks = [mask] |
| for resnet, transformer_blocks, downsample in self.down_blocks: |
| mask_down = masks[-1] |
| x = resnet(x, mask_down, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_down.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| hiddens.append(x) |
| x = downsample(x * mask_down) |
| masks.append(mask_down[:, :, ::2]) |
| masks = masks[:-1] |
| mask_mid = masks[-1] |
|
|
| for resnet, transformer_blocks in self.mid_blocks: |
| x = resnet(x, mask_mid, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_mid.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
|
|
| for resnet, transformer_blocks, upsample in self.up_blocks: |
| mask_up = masks.pop() |
| skip = hiddens.pop() |
| x = pack([x[:, :, :skip.shape[-1]], skip], "b * t")[0] |
| x = resnet(x, mask_up, t) |
| x = rearrange(x, "b c t -> b t c").contiguous() |
| if streaming is True: |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, self.static_chunk_size, -1) |
| else: |
| attn_mask = add_optional_chunk_mask(x, mask_up.bool(), False, False, 0, 0, -1).repeat(1, x.size(1), 1) |
| attn_mask = mask_to_bias(attn_mask, x.dtype) |
| for transformer_block in transformer_blocks: |
| x = transformer_block( |
| hidden_states=x, |
| attention_mask=attn_mask, |
| timestep=t, |
| ) |
| x = rearrange(x, "b t c -> b c t").contiguous() |
| x = upsample(x * mask_up) |
| x = self.final_block(x, mask_up) |
| output = self.final_proj(x * mask_up) |
| return output * mask |
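
# Streaming note: with streaming=True, CausalConditionalDecoder builds chunked attention
# masks of `static_chunk_size` frames via add_optional_chunk_mask, so each position attends
# to its own chunk and (with num_decoding_left_chunks=-1) all previous chunks; with
# streaming=False the full padding mask is used, matching ConditionalDecoder's behaviour.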
|
|