import math

import torch
import torch.nn as nn

from models.diffusion_transformer import DiTConVBlock

class DitWrapper(nn.Module):
    """Wrap a DiTConVBlock with a FiLM layer so the time embedding conditions the block."""

    def __init__(self, hidden_channels, filter_channels, num_heads, kernel_size=3,
                 p_dropout=0.1, gin_channels=0, time_channels=0):
        super().__init__()
        self.time_fusion = FiLMLayer(hidden_channels, time_channels)
        self.block = DiTConVBlock(hidden_channels, filter_channels, num_heads,
                                  kernel_size, p_dropout, gin_channels)

    def forward(self, x, c, t, x_mask):
        x = self.time_fusion(x, t) * x_mask
        x = self.block(x, c, x_mask)
        return x

class FiLMLayer(nn.Module):
    """
    Feature-wise Linear Modulation (FiLM) layer
    Reference: https://arxiv.org/abs/1709.07871
    """

    def __init__(self, in_channels, cond_channels):
        super().__init__()
        self.in_channels = in_channels
        self.film = nn.Conv1d(cond_channels, in_channels * 2, 1)

    def forward(self, x, c):
        # c: (batch, cond_channels) -> (batch, 2 * in_channels, 1);
        # gamma and beta broadcast over the time axis of x
        gamma, beta = torch.chunk(self.film(c.unsqueeze(2)), chunks=2, dim=1)
        return gamma * x + beta
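
# A minimal shape sketch for FiLMLayer (illustrative only; the channel and
# frame sizes below are arbitrary assumptions, not values from this repo):
#
#   film = FiLMLayer(in_channels=192, cond_channels=192)
#   x = torch.randn(2, 192, 100)  # (batch, in_channels, frames)
#   t = torch.randn(2, 192)       # (batch, cond_channels), e.g. a time embedding
#   y = film(x, t)                # gamma * x + beta over frames -> (2, 192, 100)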


class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.dim = dim
        assert self.dim % 2 == 0, "SinusoidalPosEmb requires dim to be even"

    def forward(self, x, scale=1000):
        if x.ndim < 1:
            x = x.unsqueeze(0)  # promote a 0-dim scalar timestep to a batch of one
        half_dim = self.dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=x.device).float() * -emb)
        emb = scale * x.unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb
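
# Usage sketch (illustrative; dim=192 is an arbitrary assumption): flow/diffusion
# times in [0, 1] are multiplied by scale=1000 before the sin/cos basis, so small
# t values still spread across the frequency bands.
#
#   pos_emb = SinusoidalPosEmb(192)
#   t = torch.rand(4)   # (batch,); a 0-dim scalar would be promoted to shape (1,)
#   e = pos_emb(t)      # -> (4, 192)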


class TimestepEmbedding(nn.Module):
    def __init__(self, in_channels, out_channels, filter_channels):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_channels, filter_channels),
            nn.SiLU(inplace=True),
            nn.Linear(filter_channels, out_channels)
        )

    def forward(self, x):
        return self.layer(x)


# reference: https://github.com/shivammehta25/Matcha-TTS/blob/main/matcha/models/components/decoder.py
class Decoder(nn.Module):
    def __init__(self, noise_channels, cond_channels, hidden_channels, out_channels,
                 filter_channels, dropout=0.1, n_layers=1, n_heads=4, kernel_size=3,
                 gin_channels=0, use_lsc=True):
        super().__init__()
        self.noise_channels = noise_channels
        self.cond_channels = cond_channels
        self.hidden_channels = hidden_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.use_lsc = use_lsc  # whether to use U-Net-like long skip connections

        self.time_embeddings = SinusoidalPosEmb(hidden_channels)
        self.time_mlp = TimestepEmbedding(hidden_channels, hidden_channels, filter_channels)
        self.in_proj = nn.Conv1d(hidden_channels + noise_channels, hidden_channels, 1)  # concat noise and encoder output as input
        self.blocks = nn.ModuleList([
            DitWrapper(hidden_channels, filter_channels, n_heads, kernel_size,
                       dropout, gin_channels, hidden_channels)
            for _ in range(n_layers)
        ])
        self.final_proj = nn.Conv1d(hidden_channels, out_channels, 1)

        # prenet for encoder output
        self.cond_proj = nn.Sequential(
            nn.Conv1d(cond_channels, filter_channels, kernel_size, padding=kernel_size // 2),
            nn.SiLU(inplace=True),
            nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2),  # adds about 3M params
            nn.SiLU(inplace=True),
            nn.Conv1d(filter_channels, hidden_channels, kernel_size, padding=kernel_size // 2)
        )

        if use_lsc:
            assert n_layers % 2 == 0, "long skip connections require an even number of layers"
            self.n_lsc_layers = n_layers // 2
            self.lsc_layers = nn.ModuleList([
                nn.Conv1d(hidden_channels + hidden_channels, hidden_channels,
                          kernel_size, padding=kernel_size // 2)
                for _ in range(self.n_lsc_layers)
            ])

        self.initialize_weights()

    def initialize_weights(self):
        # zero-init the adaLN modulation so each block starts close to identity
        for block in self.blocks:
            nn.init.constant_(block.block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.block.adaLN_modulation[-1].bias, 0)
    def forward(self, t, x, mask, mu, c):
        """Forward pass of the DiT decoder.

        Args:
            t (torch.Tensor): timestep, shape (batch_size)
            x (torch.Tensor): noise, shape (batch_size, noise_channels, time)
            mask (torch.Tensor): shape (batch_size, 1, time)
            mu (torch.Tensor): output of encoder, shape (batch_size, cond_channels, time)
            c (torch.Tensor): global condition (e.g. speaker embedding), shape (batch_size, gin_channels)

        Returns:
            torch.Tensor: decoded output, shape (batch_size, out_channels, time)
        """
        t = self.time_mlp(self.time_embeddings(t))
        mu = self.cond_proj(mu)
        x = torch.cat((x, mu), dim=1)
        x = self.in_proj(x)

        lsc_outputs = [] if self.use_lsc else None
        for idx, block in enumerate(self.blocks):
            # long skip connections, see https://arxiv.org/pdf/2209.12152 for details:
            # the first half of the blocks stash their inputs; the second half
            # concatenates them back and fuses the result with a Conv1d
            if self.use_lsc:
                if idx < self.n_lsc_layers:
                    lsc_outputs.append(x)
                else:
                    x = torch.cat((x, lsc_outputs.pop()), dim=1)
                    x = self.lsc_layers[idx - self.n_lsc_layers](x)
            x = block(x, c, t, mask)

        output = self.final_proj(x * mask)
        return output * mask
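

# Smoke-test sketch (hedged): the hyperparameters below are arbitrary assumptions,
# not values from this repo's configs, and running this requires the repo's
# models.diffusion_transformer.DiTConVBlock to be importable.
if __name__ == "__main__":
    decoder = Decoder(noise_channels=80, cond_channels=192, hidden_channels=256,
                      out_channels=80, filter_channels=1024, n_layers=4,
                      n_heads=4, gin_channels=256)
    t = torch.rand(2)              # one timestep per batch item
    x = torch.randn(2, 80, 100)    # noise, (batch, noise_channels, time)
    mask = torch.ones(2, 1, 100)   # all frames valid
    mu = torch.randn(2, 192, 100)  # encoder output, (batch, cond_channels, time)
    c = torch.randn(2, 256)        # global condition, (batch, gin_channels)
    out = decoder(t, x, mask, mu, c)
    print(out.shape)               # expected: torch.Size([2, 80, 100])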