Spaces:

koajoel
/

PolyFormer

Running

PolyFormer / fairseq /examples /textless_nlp /gslm /unit2speech /tacotron2 /layers.py

jiang

init commit

650c5f6 over 2 years ago

3.86 kB

	import torch
	from librosa.filters import mel as librosa_mel_fn
	from .audio_processing import dynamic_range_compression
	from .audio_processing import dynamic_range_decompression
	from .stft import STFT
	from .utils import get_mask_from_lengths


	class LinearNorm(torch.nn.Module):
	    """Linear layer whose weight is re-initialised with Xavier-uniform.

	    Args:
	        in_dim: input feature size handed to ``torch.nn.Linear``.
	        out_dim: output feature size handed to ``torch.nn.Linear``.
	        bias: whether the wrapped linear layer has a bias term.
	        w_init_gain: nonlinearity name given to
	            ``torch.nn.init.calculate_gain`` to scale the initialisation.
	    """

	    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
	        super().__init__()
	        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

	        # Overwrite the default weight init; the bias (if any) is untouched.
	        gain = torch.nn.init.calculate_gain(w_init_gain)
	        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

	    def forward(self, x):
	        """Apply the wrapped linear layer to ``x`` and return the result."""
	        projected = self.linear_layer(x)
	        return projected


	class ConvNorm(torch.nn.Module):
	    """1-D convolution whose weight is re-initialised with Xavier-uniform.

	    Args:
	        in_channels: input channel count for ``torch.nn.Conv1d``.
	        out_channels: output channel count for ``torch.nn.Conv1d``.
	        kernel_size: convolution kernel width.
	        stride: convolution stride.
	        padding: explicit padding; when ``None`` it is derived as
	            ``dilation * (kernel_size - 1) / 2``, which requires an odd
	            ``kernel_size``.
	        dilation: convolution dilation.
	        bias: whether the convolution has a bias term.
	        w_init_gain: nonlinearity name given to
	            ``torch.nn.init.calculate_gain`` to scale the initialisation.
	    """

	    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
	                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
	        super().__init__()

	        if padding is None:
	            # The derived padding is only symmetric for odd kernel widths.
	            assert kernel_size % 2 == 1
	            padding = int(dilation * (kernel_size - 1) / 2)

	        conv_kwargs = dict(
	            kernel_size=kernel_size,
	            stride=stride,
	            padding=padding,
	            dilation=dilation,
	            bias=bias,
	        )
	        self.conv = torch.nn.Conv1d(in_channels, out_channels, **conv_kwargs)

	        # Overwrite the default weight init; the bias (if any) is untouched.
	        gain = torch.nn.init.calculate_gain(w_init_gain)
	        torch.nn.init.xavier_uniform_(self.conv.weight, gain=gain)

	    def forward(self, signal):
	        """Run ``signal`` through the wrapped convolution."""
	        return self.conv(signal)


	class GlobalAvgPool(torch.nn.Module):
	    """Parameter-free mean pooling over the time axis (dim=1)."""

	    def __init__(self):
	        super().__init__()

	    def forward(self, x, lengths=None):
	        """Average ``x`` across time steps (dim=1), optionally using lengths.

	        Args:
	            x: torch.Tensor of shape (N, T, ...)
	            lengths: None or torch.Tensor of shape (N,)

	        Returns:
	            ``x`` with dim 1 reduced away. Without ``lengths`` this is a
	            plain mean; with ``lengths`` it is the mask-weighted sum
	            divided by the mask sum along dim 1.
	        """
	        if lengths is None:
	            return x.mean(dim=1, keepdim=False)

	        # Mask comes from the project helper (presumably non-zero on valid
	        # steps -- confirm against get_mask_from_lengths); cast it to x's
	        # tensor type and device so it can multiply x directly.
	        valid = get_mask_from_lengths(lengths).type(x.type()).to(x.device)

	        # Append singleton dims so the mask broadcasts over x's trailing dims.
	        trailing_ones = [1] * (x.ndimension() - 2)
	        valid = valid.reshape(*list(valid.size()), *trailing_ones)

	        total = (x * valid).sum(dim=1, keepdim=False)
	        count = valid.sum(dim=1, keepdim=False)
	        return total / count


	class TacotronSTFT(torch.nn.Module):
	    """STFT front end that projects magnitudes onto a mel filterbank.

	    Args:
	        filter_length: passed to ``STFT`` and used as the mel filterbank's
	            ``n_fft``.
	        hop_length: passed to ``STFT``.
	        win_length: passed to ``STFT``.
	        n_mel_channels: number of mel bands in the filterbank.
	        sampling_rate: sampling rate used to build the filterbank.
	        mel_fmin: lowest filterbank frequency.
	        mel_fmax: highest filterbank frequency.
	    """

	    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
	                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
	                 mel_fmax=8000.0):
	        super(TacotronSTFT, self).__init__()
	        self.n_mel_channels = n_mel_channels
	        self.sampling_rate = sampling_rate
	        self.stft_fn = STFT(filter_length, hop_length, win_length)
	        # Pass by keyword: librosa >= 0.10 makes these parameters
	        # keyword-only, so the positional form raises TypeError there.
	        # The keyword names are also accepted by older releases.
	        mel_basis = librosa_mel_fn(
	            sr=sampling_rate,
	            n_fft=filter_length,
	            n_mels=n_mel_channels,
	            fmin=mel_fmin,
	            fmax=mel_fmax,
	        )
	        mel_basis = torch.from_numpy(mel_basis).float()
	        # Buffer (not parameter): follows .to()/.cuda() and state_dict.
	        self.register_buffer('mel_basis', mel_basis)

	    def spectral_normalize(self, magnitudes):
	        """Return ``dynamic_range_compression(magnitudes)``."""
	        output = dynamic_range_compression(magnitudes)
	        return output

	    def spectral_de_normalize(self, magnitudes):
	        """Return ``dynamic_range_decompression(magnitudes)``."""
	        output = dynamic_range_decompression(magnitudes)
	        return output

	    def mel_spectrogram(self, y):
	        """Computes mel-spectrograms from a batch of waves
	        PARAMS
	        ------
	        y: torch.FloatTensor with shape (B, T) in range [-1, 1]

	        RETURNS
	        -------
	        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)

	        RAISES
	        ------
	        AssertionError: if any sample of ``y`` lies outside [-1, 1]
	        """
	        # NOTE: these checks are plain asserts, so they are skipped when
	        # Python runs with -O.
	        assert torch.min(y.detach()) >= -1
	        assert torch.max(y.detach()) <= 1

	        magnitudes, _phases = self.stft_fn.transform(y)
	        # Detach so no gradient flows back through the STFT.
	        magnitudes = magnitudes.detach()
	        mel_output = torch.matmul(self.mel_basis, magnitudes)
	        mel_output = self.spectral_normalize(mel_output)
	        return mel_output