Chatterbox-Multilingual-TTS

Running

App Files Files Community

Chatterbox-Multilingual-TTS / src /chatterbox /models /s3gen /matcha /decoder.py

Zihan428

Clean multilingual TTS repo

f79db70 6 months ago

raw

history blame contribute delete

14.4 kB

	import math
	from typing import Optional

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from conformer import ConformerBlock
	from diffusers.models.activations import get_activation
	from einops import pack, rearrange, repeat

	from .transformer import BasicTransformerBlock


	class SinusoidalPosEmb(torch.nn.Module):
	def __init__(self, dim):
	super().__init__()
	self.dim = dim
	assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"

	def forward(self, x, scale=1000):
	if x.ndim < 1:
	x = x.unsqueeze(0)
	device = x.device
	half_dim = self.dim // 2
	emb = math.log(10000) / (half_dim - 1)
	emb = torch.exp(torch.arange(half_dim, device=device).float() * -emb)
	emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
	emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
	return emb


	class Block1D(torch.nn.Module):
	def __init__(self, dim, dim_out, groups=8):
	super().__init__()
	self.block = torch.nn.Sequential(
	torch.nn.Conv1d(dim, dim_out, 3, padding=1),
	torch.nn.GroupNorm(groups, dim_out),
	nn.Mish(),
	)

	def forward(self, x, mask):
	output = self.block(x * mask)
	return output * mask


	class ResnetBlock1D(torch.nn.Module):
	def __init__(self, dim, dim_out, time_emb_dim, groups=8):
	super().__init__()
	self.mlp = torch.nn.Sequential(nn.Mish(), torch.nn.Linear(time_emb_dim, dim_out))

	self.block1 = Block1D(dim, dim_out, groups=groups)
	self.block2 = Block1D(dim_out, dim_out, groups=groups)

	self.res_conv = torch.nn.Conv1d(dim, dim_out, 1)

	def forward(self, x, mask, time_emb):
	h = self.block1(x, mask)
	h += self.mlp(time_emb).unsqueeze(-1)
	h = self.block2(h, mask)
	output = h + self.res_conv(x * mask)
	return output


	class Downsample1D(nn.Module):
	def __init__(self, dim):
	super().__init__()
	self.conv = torch.nn.Conv1d(dim, dim, 3, 2, 1)

	def forward(self, x):
	return self.conv(x)


	class TimestepEmbedding(nn.Module):
	def __init__(
	self,
	in_channels: int,
	time_embed_dim: int,
	act_fn: str = "silu",
	out_dim: int = None,
	post_act_fn: Optional[str] = None,
	cond_proj_dim=None,
	):
	super().__init__()

	self.linear_1 = nn.Linear(in_channels, time_embed_dim)

	if cond_proj_dim is not None:
	self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False)
	else:
	self.cond_proj = None

	self.act = get_activation(act_fn)

	if out_dim is not None:
	time_embed_dim_out = out_dim
	else:
	time_embed_dim_out = time_embed_dim
	self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out)

	if post_act_fn is None:
	self.post_act = None
	else:
	self.post_act = get_activation(post_act_fn)

	def forward(self, sample, condition=None):
	if condition is not None:
	sample = sample + self.cond_proj(condition)
	sample = self.linear_1(sample)

	if self.act is not None:
	sample = self.act(sample)

	sample = self.linear_2(sample)

	if self.post_act is not None:
	sample = self.post_act(sample)
	return sample


	class Upsample1D(nn.Module):
	"""A 1D upsampling layer with an optional convolution.

	Parameters:
	channels (`int`):
	number of channels in the inputs and outputs.
	use_conv (`bool`, default `False`):
	option to use a convolution.
	use_conv_transpose (`bool`, default `False`):
	option to use a convolution transpose.
	out_channels (`int`, optional):
	number of output channels. Defaults to `channels`.
	"""

	def __init__(self, channels, use_conv=False, use_conv_transpose=True, out_channels=None, name="conv"):
	super().__init__()
	self.channels = channels
	self.out_channels = out_channels or channels
	self.use_conv = use_conv
	self.use_conv_transpose = use_conv_transpose
	self.name = name

	self.conv = None
	if use_conv_transpose:
	self.conv = nn.ConvTranspose1d(channels, self.out_channels, 4, 2, 1)
	elif use_conv:
	self.conv = nn.Conv1d(self.channels, self.out_channels, 3, padding=1)

	def forward(self, inputs):
	assert inputs.shape[1] == self.channels
	if self.use_conv_transpose:
	return self.conv(inputs)

	outputs = F.interpolate(inputs, scale_factor=2.0, mode="nearest")

	if self.use_conv:
	outputs = self.conv(outputs)

	return outputs


	class ConformerWrapper(ConformerBlock):
	def __init__( # pylint: disable=useless-super-delegation
	self,
	*,
	dim,
	dim_head=64,
	heads=8,
	ff_mult=4,
	conv_expansion_factor=2,
	conv_kernel_size=31,
	attn_dropout=0,
	ff_dropout=0,
	conv_dropout=0,
	conv_causal=False,
	):
	super().__init__(
	dim=dim,
	dim_head=dim_head,
	heads=heads,
	ff_mult=ff_mult,
	conv_expansion_factor=conv_expansion_factor,
	conv_kernel_size=conv_kernel_size,
	attn_dropout=attn_dropout,
	ff_dropout=ff_dropout,
	conv_dropout=conv_dropout,
	conv_causal=conv_causal,
	)

	def forward(
	self,
	hidden_states,
	attention_mask,
	encoder_hidden_states=None,
	encoder_attention_mask=None,
	timestep=None,
	):
	return super().forward(x=hidden_states, mask=attention_mask.bool())


	class Decoder(nn.Module):
	def __init__(
	self,
	in_channels,
	out_channels,
	channels=(256, 256),
	dropout=0.05,
	attention_head_dim=64,
	n_blocks=1,
	num_mid_blocks=2,
	num_heads=4,
	act_fn="snake",
	down_block_type="transformer",
	mid_block_type="transformer",
	up_block_type="transformer",
	):
	super().__init__()
	channels = tuple(channels)
	self.in_channels = in_channels
	self.out_channels = out_channels

	self.time_embeddings = SinusoidalPosEmb(in_channels)
	time_embed_dim = channels[0] * 4
	self.time_mlp = TimestepEmbedding(
	in_channels=in_channels,
	time_embed_dim=time_embed_dim,
	act_fn="silu",
	)

	self.down_blocks = nn.ModuleList([])
	self.mid_blocks = nn.ModuleList([])
	self.up_blocks = nn.ModuleList([])

	output_channel = in_channels
	for i in range(len(channels)): # pylint: disable=consider-using-enumerate
	input_channel = output_channel
	output_channel = channels[i]
	is_last = i == len(channels) - 1
	resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)
	transformer_blocks = nn.ModuleList(
	[
	self.get_block(
	down_block_type,
	output_channel,
	attention_head_dim,
	num_heads,
	dropout,
	act_fn,
	)
	for _ in range(n_blocks)
	]
	)
	downsample = (
	Downsample1D(output_channel) if not is_last else nn.Conv1d(output_channel, output_channel, 3, padding=1)
	)

	self.down_blocks.append(nn.ModuleList([resnet, transformer_blocks, downsample]))

	for i in range(num_mid_blocks):
	input_channel = channels[-1]
	out_channels = channels[-1]

	resnet = ResnetBlock1D(dim=input_channel, dim_out=output_channel, time_emb_dim=time_embed_dim)

	transformer_blocks = nn.ModuleList(
	[
	self.get_block(
	mid_block_type,
	output_channel,
	attention_head_dim,
	num_heads,
	dropout,
	act_fn,
	)
	for _ in range(n_blocks)
	]
	)

	self.mid_blocks.append(nn.ModuleList([resnet, transformer_blocks]))

	channels = channels[::-1] + (channels[0],)
	for i in range(len(channels) - 1):
	input_channel = channels[i]
	output_channel = channels[i + 1]
	is_last = i == len(channels) - 2

	resnet = ResnetBlock1D(
	dim=2 * input_channel,
	dim_out=output_channel,
	time_emb_dim=time_embed_dim,
	)
	transformer_blocks = nn.ModuleList(
	[
	self.get_block(
	up_block_type,
	output_channel,
	attention_head_dim,
	num_heads,
	dropout,
	act_fn,
	)
	for _ in range(n_blocks)
	]
	)
	upsample = (
	Upsample1D(output_channel, use_conv_transpose=True)
	if not is_last
	else nn.Conv1d(output_channel, output_channel, 3, padding=1)
	)

	self.up_blocks.append(nn.ModuleList([resnet, transformer_blocks, upsample]))

	self.final_block = Block1D(channels[-1], channels[-1])
	self.final_proj = nn.Conv1d(channels[-1], self.out_channels, 1)

	self.initialize_weights()
	# nn.init.normal_(self.final_proj.weight)

	@staticmethod
	def get_block(block_type, dim, attention_head_dim, num_heads, dropout, act_fn):
	if block_type == "conformer":
	block = ConformerWrapper(
	dim=dim,
	dim_head=attention_head_dim,
	heads=num_heads,
	ff_mult=1,
	conv_expansion_factor=2,
	ff_dropout=dropout,
	attn_dropout=dropout,
	conv_dropout=dropout,
	conv_kernel_size=31,
	)
	elif block_type == "transformer":
	block = BasicTransformerBlock(
	dim=dim,
	num_attention_heads=num_heads,
	attention_head_dim=attention_head_dim,
	dropout=dropout,
	activation_fn=act_fn,
	)
	else:
	raise ValueError(f"Unknown block type {block_type}")

	return block

	def initialize_weights(self):
	for m in self.modules():
	if isinstance(m, nn.Conv1d):
	nn.init.kaiming_normal_(m.weight, nonlinearity="relu")

	if m.bias is not None:
	nn.init.constant_(m.bias, 0)

	elif isinstance(m, nn.GroupNorm):
	nn.init.constant_(m.weight, 1)
	nn.init.constant_(m.bias, 0)

	elif isinstance(m, nn.Linear):
	nn.init.kaiming_normal_(m.weight, nonlinearity="relu")

	if m.bias is not None:
	nn.init.constant_(m.bias, 0)

	def forward(self, x, mask, mu, t, spks=None, cond=None):
	"""Forward pass of the UNet1DConditional model.

	Args:
	x (torch.Tensor): shape (batch_size, in_channels, time)
	mask (_type_): shape (batch_size, 1, time)
	t (_type_): shape (batch_size)
	spks (_type_, optional): shape: (batch_size, condition_channels). Defaults to None.
	cond (_type_, optional): placeholder for future use. Defaults to None.

	Raises:
	ValueError: _description_
	ValueError: _description_

	Returns:
	_type_: _description_
	"""

	t = self.time_embeddings(t)
	t = self.time_mlp(t)

	x = pack([x, mu], "b * t")[0]

	if spks is not None:
	spks = repeat(spks, "b c -> b c t", t=x.shape[-1])
	x = pack([x, spks], "b * t")[0]

	hiddens = []
	masks = [mask]
	for resnet, transformer_blocks, downsample in self.down_blocks:
	mask_down = masks[-1]
	x = resnet(x, mask_down, t)
	x = rearrange(x, "b c t -> b t c")
	mask_down = rearrange(mask_down, "b 1 t -> b t")
	for transformer_block in transformer_blocks:
	x = transformer_block(
	hidden_states=x,
	attention_mask=mask_down,
	timestep=t,
	)
	x = rearrange(x, "b t c -> b c t")
	mask_down = rearrange(mask_down, "b t -> b 1 t")
	hiddens.append(x) # Save hidden states for skip connections
	x = downsample(x * mask_down)
	masks.append(mask_down[:, :, ::2])

	masks = masks[:-1]
	mask_mid = masks[-1]

	for resnet, transformer_blocks in self.mid_blocks:
	x = resnet(x, mask_mid, t)
	x = rearrange(x, "b c t -> b t c")
	mask_mid = rearrange(mask_mid, "b 1 t -> b t")
	for transformer_block in transformer_blocks:
	x = transformer_block(
	hidden_states=x,
	attention_mask=mask_mid,
	timestep=t,
	)
	x = rearrange(x, "b t c -> b c t")
	mask_mid = rearrange(mask_mid, "b t -> b 1 t")

	for resnet, transformer_blocks, upsample in self.up_blocks:
	mask_up = masks.pop()
	x = resnet(pack([x, hiddens.pop()], "b * t")[0], mask_up, t)
	x = rearrange(x, "b c t -> b t c")
	mask_up = rearrange(mask_up, "b 1 t -> b t")
	for transformer_block in transformer_blocks:
	x = transformer_block(
	hidden_states=x,
	attention_mask=mask_up,
	timestep=t,
	)
	x = rearrange(x, "b t c -> b c t")
	mask_up = rearrange(mask_up, "b t -> b 1 t")
	x = upsample(x * mask_up)

	x = self.final_block(x, mask_up)
	output = self.final_proj(x * mask_up)

	return output * mask