import torch
from torch import nn
from torch.nn import functional as F

from . import attentions

class Mish(nn.Module):
    """Mish activation: x * tanh(softplus(x)) (https://arxiv.org/abs/1908.08681)."""

    def forward(self, x):
        return x * torch.tanh(F.softplus(x))
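
# Note: newer versions of PyTorch (>= 1.9) ship nn.Mish built in; the local
# definition above keeps this module self-contained and compatible with
# older PyTorch releases.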

class Conv1dGLU(nn.Module):
    """
    Conv1d + GLU (Gated Linear Unit) with a residual connection.
    For GLU, see https://arxiv.org/abs/1612.08083.
    """
    def __init__(self, in_channels, out_channels, kernel_size, dropout):
        super().__init__()
        self.out_channels = out_channels
        self.conv1 = nn.Conv1d(
            in_channels,
            2 * out_channels,
            kernel_size=kernel_size,
            # "same" padding for odd kernel sizes; the original hardcoded
            # padding=2, which is only correct for kernel_size=5
            padding=kernel_size // 2,
        )
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        residual = x
        x = self.conv1(x)
        # split the doubled channels into content (x1) and gate (x2) halves
        x1, x2 = torch.split(x, split_size_or_sections=self.out_channels, dim=1)
        x = x1 * torch.sigmoid(x2)
        x = residual + self.dropout(x)
        return x
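
# A minimal shape check for Conv1dGLU (illustrative only; the batch size,
# channel count, and sequence length below are arbitrary assumptions):
#
#     m = Conv1dGLU(in_channels=128, out_channels=128, kernel_size=5, dropout=0.1)
#     y = m(torch.randn(2, 128, 100))   # -> torch.Size([2, 128, 100])
#
# The residual connection requires in_channels == out_channels, since
# `residual + self.dropout(x)` adds the input and the gated output elementwise.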

class StyleEncoder(torch.nn.Module):
    def __init__(self, in_dim=513, hidden_dim=128, out_dim=256):
        super().__init__()

        self.in_dim = in_dim          # e.g. 513 = n_fft // 2 + 1 spectrogram bins
        self.hidden_dim = hidden_dim
        self.out_dim = out_dim
        self.kernel_size = 5
        self.n_head = 2
        self.dropout = 0.1
        # point-wise spectral transform: two 1x1 convolutions with Mish
        self.spectral = nn.Sequential(
            nn.Conv1d(self.in_dim, self.hidden_dim, 1),
            Mish(),
            nn.Dropout(self.dropout),
            nn.Conv1d(self.hidden_dim, self.hidden_dim, 1),
            Mish(),
            nn.Dropout(self.dropout),
        )
        self.temporal = nn.Sequential(
            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
            Conv1dGLU(self.hidden_dim, self.hidden_dim, self.kernel_size, self.dropout),
        )
        self.slf_attn = attentions.MultiHeadAttention(
            self.hidden_dim,
            self.hidden_dim,
            self.n_head,
            p_dropout=self.dropout,
            proximal_bias=False,
            proximal_init=True,
        )
        self.atten_drop = nn.Dropout(self.dropout)
        self.fc = nn.Conv1d(self.hidden_dim, self.out_dim, 1)
    def forward(self, x, mask=None):
        # x: [B, in_dim, T]; mask: [B, 1, T] with 1.0 at valid frames.
        # The original multiplied by `mask` unconditionally, which crashes
        # when mask is None; fall back to an all-ones mask instead.
        if mask is None:
            mask = torch.ones(x.size(0), 1, x.size(2), device=x.device, dtype=x.dtype)

        # spectral
        x = self.spectral(x) * mask
        # temporal
        x = self.temporal(x) * mask

        # self-attention with a residual connection
        attn_mask = mask.unsqueeze(2) * mask.unsqueeze(-1)  # [B, 1, T, T]
        y = self.slf_attn(x, x, attn_mask=attn_mask)
        x = x + self.atten_drop(y)

        # fc
        x = self.fc(x)

        # temporal average pooling -> [B, out_dim]
        w = self.temporal_avg_pool(x, mask=mask)

        return w
    def temporal_avg_pool(self, x, mask=None):
        if mask is None:
            out = torch.mean(x, dim=2)
        else:
            len_ = mask.sum(dim=2)       # [B, 1]: number of valid frames
            # zero out padded frames before summing; the original summed
            # unmasked frames, which lets padding leak into the average
            x = (x * mask).sum(dim=2)
            out = torch.div(x, len_)
        return out
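
# A minimal smoke test (a sketch, not part of the original module). It assumes
# `attentions.MultiHeadAttention` follows the VITS-style interface used above,
# i.e. forward(x, c, attn_mask=...) over channels-first [B, C, T] tensors.
# Because of the relative import, run it as a module inside its package
# (python -m <package>.<module>), not as a standalone script.
if __name__ == "__main__":
    encoder = StyleEncoder(in_dim=513, hidden_dim=128, out_dim=256)
    spec = torch.randn(2, 513, 100)   # [B, in_dim, T] linear spectrogram
    mask = torch.ones(2, 1, 100)      # [B, 1, T], all frames valid
    style = encoder(spec, mask=mask)
    print(style.shape)                # expected: torch.Size([2, 256])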