Spaces:

Rongjiehuang
/

ProDiff

Runtime error

App Files Files Community

ProDiff / modules /FastDiff /module /modules.py

Rongjiehuang

init

64e7f2f over 3 years ago

raw

history blame

14.4 kB

	import math
	import torch
	import numpy as np
	import torch.nn as nn
	import torch.nn.functional as F

	from torch.nn import Conv1d

	LRELU_SLOPE = 0.1



	def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
	    """Build a sinusoidal position-encoding table.

	    Entry ``[pos, j]`` starts out as ``pos / 10000 ** (2 * (j // 2) / d_hid)``.
	    Even columns are then mapped through ``sin`` and odd columns through
	    ``cos``.

	    Args:
	        n_position: number of positions, i.e. rows of the table.
	        d_hid: width of each encoding, i.e. columns of the table.
	        padding_idx: optional row index; when given, that row is set to zero.

	    Returns:
	        A ``torch.FloatTensor`` of shape ``(n_position, d_hid)``.
	    """
	    # One divisor per column; columns 2i and 2i+1 share the same divisor.
	    divisors = [np.power(10000, 2 * (col // 2) / d_hid) for col in range(d_hid)]

	    # Raw angles, computed element by element in float64.
	    table = np.array([[row / div for div in divisors]
	                      for row in range(n_position)])

	    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns (dim 2i)
	    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns (dim 2i+1)

	    if padding_idx is not None:
	        # The padding position gets an all-zero encoding.
	        table[padding_idx] = 0.

	    return torch.FloatTensor(table)


	def overlap_and_add(signal, frame_step):
	    """Reconstruct a signal from a framed representation by overlap-add.

	    Adds potentially overlapping frames of a signal with shape
	    `[..., frames, frame_length]`, offsetting subsequent frames by
	    `frame_step`. The resulting tensor has shape `[..., output_size]` where

	        output_size = (frames - 1) * frame_step + frame_length

	    Args:
	        signal: A `[..., frames, frame_length]` Tensor; rank must be at least 2.
	            Non-contiguous tensors are accepted.
	        frame_step: An integer denoting overlap offsets. Expected to be less
	            than or equal to `frame_length`.

	    Returns:
	        A Tensor with shape `[..., output_size]` containing the overlap-added
	        frames of `signal`'s inner-most two dimensions, with the same dtype
	        and device as `signal`.

	    Based on https://github.com/tensorflow/tensorflow/blob/r1.12/tensorflow/contrib/signal/python/ops/reconstruction_ops.py
	    """
	    outer_dimensions = signal.size()[:-2]
	    frames, frame_length = signal.size()[-2:]

	    # Work in sub-frames of gcd(frame_length, frame_step) samples so that
	    # every frame and every hop is a whole number of sub-frames.
	    subframe_length = math.gcd(frame_length, frame_step)
	    subframe_step = frame_step // subframe_length
	    subframes_per_frame = frame_length // subframe_length
	    output_size = frame_step * (frames - 1) + frame_length
	    output_subframes = output_size // subframe_length

	    # reshape (rather than view) so non-contiguous inputs are handled too.
	    subframe_signal = signal.reshape(*outer_dimensions, -1, subframe_length)

	    # Destination sub-frame index for each input sub-frame. The indices are
	    # created as int64 directly on the signal's device; they are never cast
	    # to the signal's (possibly low-precision float) dtype.
	    frame = torch.arange(output_subframes, dtype=torch.long, device=signal.device)
	    frame = frame.unfold(0, subframes_per_frame, subframe_step).reshape(-1)

	    result = signal.new_zeros(*outer_dimensions, output_subframes, subframe_length)
	    # Accumulate overlapping sub-frames into their output slots.
	    result.index_add_(-2, frame, subframe_signal)
	    return result.view(*outer_dimensions, -1)


	class LastLayer(nn.Module):
	    """Output head: activation, then padding, then a 1-D convolution.

	    The activation and padding layers are looked up by class name in
	    ``torch.nn``. The padding layer is built with ``(kernel_size - 1) // 2``
	    as its first argument.

	    Args:
	        in_channels: input channels of the convolution.
	        out_channels: output channels of the convolution.
	        nonlinear_activation: name of a ``torch.nn`` activation class.
	        nonlinear_activation_params: keyword arguments for that activation.
	        pad: name of a ``torch.nn`` padding class.
	        kernel_size: kernel size of the convolution.
	        pad_params: extra keyword arguments for the padding class.
	        bias: whether the convolution has a bias term.
	    """

	    def __init__(self, in_channels, out_channels,
	                 nonlinear_activation, nonlinear_activation_params,
	                 pad, kernel_size, pad_params, bias):
	        super().__init__()
	        self.activation = getattr(torch.nn, nonlinear_activation)(
	            **nonlinear_activation_params
	        )
	        pad_amount = (kernel_size - 1) // 2
	        self.pad = getattr(torch.nn, pad)(pad_amount, **pad_params)
	        self.conv = torch.nn.Conv1d(
	            in_channels, out_channels, kernel_size, bias=bias
	        )

	    def forward(self, x):
	        """Apply activation, padding and convolution to ``x``, in that order."""
	        return self.conv(self.pad(self.activation(x)))


	class WeightConv1d(Conv1d):
	    """Conv1d module with customized initialization.

	    Accepts exactly the same arguments as ``torch.nn.Conv1d``. The weight is
	    initialised with Kaiming-normal (ReLU gain) and the bias, when present,
	    with zeros.
	    """

	    def __init__(self, *args, **kwargs):
	        """Initialize Conv1d module, forwarding all arguments to ``Conv1d``."""
	        # Zero-argument super() resolves to Conv1d.__init__; naming Conv1d
	        # explicitly would skip it and hit the base-class initializer, which
	        # takes a different argument list.
	        super().__init__(*args, **kwargs)

	    def reset_parameters(self):
	        """Reset parameters (invoked by the parent constructor)."""
	        torch.nn.init.kaiming_normal_(self.weight, nonlinearity="relu")
	        if self.bias is not None:
	            torch.nn.init.constant_(self.bias, 0.0)


	class Conv1d1x1(Conv1d):
	    """Pointwise (kernel size 1) ``Conv1d`` with no padding and no dilation."""

	    def __init__(self, in_channels, out_channels, bias):
	        """Create the 1x1 convolution.

	        Args:
	            in_channels: number of input channels.
	            out_channels: number of output channels.
	            bias: whether to include a bias term.
	        """
	        super().__init__(
	            in_channels,
	            out_channels,
	            kernel_size=1,
	            padding=0,
	            dilation=1,
	            bias=bias,
	        )

	class DiffusionDBlock(nn.Module):
	    """Downsampling residual block built from three dilated convolutions.

	    The input is shortened along its last axis by ``factor`` using
	    ``F.interpolate`` with its default mode, passed through three
	    kernel-size-3 convolutions with dilations 1, 2 and 4 (each preceded by a
	    leaky ReLU with slope 0.2), and summed with a 1x1-projected, equally
	    shortened copy of the input.

	    Args:
	        input_size: number of input channels.
	        hidden_size: number of output channels.
	        factor: integer divisor applied to the length of the last axis.
	    """

	    def __init__(self, input_size, hidden_size, factor):
	        super().__init__()
	        self.factor = factor
	        self.residual_dense = Conv1d(input_size, hidden_size, 1)
	        # (input channels, dilation) per stage; padding equals the dilation
	        # so a kernel of size 3 keeps the sequence length unchanged.
	        stages = ((input_size, 1), (hidden_size, 2), (hidden_size, 4))
	        self.conv = nn.ModuleList([
	            Conv1d(channels_in, hidden_size, 3, dilation=rate, padding=rate)
	            for channels_in, rate in stages
	        ])

	    def forward(self, x):
	        """Return the downsampled block output for ``x``."""
	        target_len = x.shape[-1] // self.factor

	        # Skip path: project the channels first, then shorten.
	        skip = F.interpolate(self.residual_dense(x), size=target_len)

	        # Main path: shorten first, then run the dilated stack.
	        out = F.interpolate(x, size=target_len)
	        for conv in self.conv:
	            out = conv(F.leaky_relu(out, 0.2))

	        return out + skip


	class TimeAware_LVCBlock(torch.nn.Module):
	    """Time-aware location-variable convolution (LVC) block.

	    The block upsamples its input with a transposed convolution, then runs
	    ``conv_layers`` stages. Each stage adds ``audio_down``, applies a dilated
	    convolution (dilation ``3 ** i``), applies a location-variable convolution
	    whose kernels are predicted from the conditioning sequence plus a
	    projected noise embedding, and finishes with a sigmoid * tanh gated
	    residual update.
	    """

	    def __init__(self,
	                 in_channels,
	                 cond_channels,
	                 upsample_ratio,
	                 conv_layers=4,
	                 conv_kernel_size=3,
	                 cond_hop_length=256,
	                 kpnet_hidden_channels=64,
	                 kpnet_conv_size=3,
	                 kpnet_dropout=0.0,
	                 noise_scale_embed_dim_out=512
	                 ):
	        """Build the upsampler, kernel predictor, noise projection and convs.

	        Args:
	            in_channels: channels of the input sequence (kept by the block).
	            cond_channels: channels of the conditioning sequence.
	            upsample_ratio: stride of the transposed-convolution upsampler.
	            conv_layers: number of dilated-conv / LVC stages.
	            conv_kernel_size: kernel size of the dilated convs and of the
	                predicted LVC kernels.
	            cond_hop_length: hop size passed to the LVC in ``forward``.
	            kpnet_hidden_channels: hidden width of the kernel predictor.
	            kpnet_conv_size: kernel size inside the kernel predictor.
	            kpnet_dropout: dropout probability inside the kernel predictor.
	            noise_scale_embed_dim_out: width of the incoming noise embedding.
	        """
	        super().__init__()

	        self.cond_hop_length = cond_hop_length
	        self.conv_layers = conv_layers
	        self.conv_kernel_size = conv_kernel_size
	        self.convs = torch.nn.ModuleList()

	        # Padding / output_padding depend on whether the ratio is odd.
	        ratio_is_odd = upsample_ratio % 2
	        self.upsample = torch.nn.ConvTranspose1d(
	            in_channels, in_channels,
	            kernel_size=upsample_ratio * 2,
	            stride=upsample_ratio,
	            padding=upsample_ratio // 2 + ratio_is_odd,
	            output_padding=ratio_is_odd,
	        )

	        # Predicts 2 * in_channels output channels per stage; forward() uses
	        # one half as the sigmoid gate and the other as the tanh filter.
	        self.kernel_predictor = KernelPredictor(
	            cond_channels=cond_channels,
	            conv_in_channels=in_channels,
	            conv_out_channels=2 * in_channels,
	            conv_layers=conv_layers,
	            conv_kernel_size=conv_kernel_size,
	            kpnet_hidden_channels=kpnet_hidden_channels,
	            kpnet_conv_size=kpnet_conv_size,
	            kpnet_dropout=kpnet_dropout
	        )

	        # Layer-specific projection of the noise-scale embedding onto the
	        # conditioning channels.
	        self.fc_t = torch.nn.Linear(noise_scale_embed_dim_out, cond_channels)

	        for layer_idx in range(conv_layers):
	            dilation = 3 ** layer_idx
	            same_padding = dilation * int((conv_kernel_size - 1) / 2)
	            self.convs.append(
	                torch.nn.Conv1d(in_channels, in_channels,
	                                kernel_size=conv_kernel_size,
	                                padding=same_padding,
	                                dilation=dilation)
	            )

	    def forward(self, data):
	        """Run the time-aware location-variable convolution block.

	        Args:
	            data: a 4-tuple ``(x, audio_down, c, noise_embedding)`` where
	                ``x`` is the input sequence ``(batch, in_channels, in_length)``,
	                ``c`` is the conditioning sequence
	                ``(batch, cond_channels, cond_length)``,
	                ``noise_embedding`` is fed to ``fc_t`` (last dimension
	                ``noise_scale_embed_dim_out``), and ``audio_down`` is added
	                in place to the upsampled signal at every stage
	                (presumably shaped like the upsampled ``x`` - confirm
	                against the caller).

	        Returns:
	            Tensor: the output sequence; channel count is ``in_channels`` and
	            the length is that of the upsampled input.
	        """
	        x, audio_down, c, noise_embedding = data
	        _, in_channels, _ = x.shape

	        # Project the noise embedding and append a trailing length-1 axis so
	        # it broadcasts over the time axis of the conditioning.
	        noise = self.fc_t(noise_embedding).unsqueeze(-1)
	        kernels, bias = self.kernel_predictor(c + noise)

	        x = self.upsample(F.leaky_relu(x, 0.2))

	        for idx in range(self.conv_layers):
	            x += audio_down
	            y = F.leaky_relu(x, 0.2)
	            y = F.leaky_relu(self.convs[idx](y), 0.2)

	            stage_kernel = kernels[:, idx, :, :, :, :]
	            stage_bias = bias[:, idx, :, :]
	            y = self.location_variable_convolution(
	                y, stage_kernel, stage_bias, 1, self.cond_hop_length
	            )

	            # Gated residual update: first half of the channels gates the
	            # second half.
	            gate = torch.sigmoid(y[:, :in_channels, :])
	            filt = torch.tanh(y[:, in_channels:, :])
	            x = x + gate * filt
	        return x

	    def location_variable_convolution(self, x, kernel, bias, dilation, hop_size):
	        """Convolve ``x`` with a different kernel for every hop-sized segment.

	        The original author reports roughly 414 microseconds per call on an
	        NVIDIA V100.

	        Args:
	            x (Tensor): the input sequence (batch, in_channels, in_length).
	            kernel (Tensor): the local convolution kernels
	                (batch, in_channels, out_channels, kernel_size, kernel_length).
	            bias (Tensor): the local bias (batch, out_channels, kernel_length).
	            dilation (int): the dilation of the convolution.
	            hop_size (int): the hop size of the conditioning sequence.

	        Returns:
	            Tensor: the result of the local convolution
	            (batch, out_channels, in_length).

	        Raises:
	            AssertionError: if ``in_length != kernel_length * hop_size``.
	        """
	        _, _, in_length = x.shape
	        batch, _, out_channels, kernel_size, kernel_length = kernel.shape

	        assert in_length == (kernel_length * hop_size), "length of (x, kernel) is not matched"

	        padding = dilation * int((kernel_size - 1) / 2)
	        # (batch, in_channels, in_length + 2*padding)
	        windows = F.pad(x, (padding, padding), 'constant', 0)
	        # (batch, in_channels, kernel_length, hop_size + 2*padding)
	        windows = windows.unfold(2, hop_size + 2 * padding, hop_size)

	        if hop_size < dilation:
	            windows = F.pad(windows, (0, dilation), 'constant', 0)
	        # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
	        windows = windows.unfold(3, dilation, dilation)
	        windows = windows[:, :, :, :, :hop_size]
	        # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
	        windows = windows.transpose(3, 4)
	        # (batch, in_channels, kernel_length, dilation, _, kernel_size)
	        windows = windows.unfold(4, kernel_size, 1)

	        # Contract input channels (i) and kernel taps (k) per segment (l).
	        out = torch.einsum('bildsk,biokl->bolsd', windows, kernel)
	        out = out + bias.unsqueeze(-1).unsqueeze(-1)
	        return out.contiguous().view(batch, out_channels, -1)



	class KernelPredictor(torch.nn.Module):
	    """Kernel predictor for the time-aware location-variable convolutions.

	    Maps a conditioning sequence to per-position convolution kernels and
	    biases for ``conv_layers`` location-variable convolution layers.
	    """

	    def __init__(self,
	                 cond_channels,
	                 conv_in_channels,
	                 conv_out_channels,
	                 conv_layers,
	                 conv_kernel_size=3,
	                 kpnet_hidden_channels=64,
	                 kpnet_conv_size=3,
	                 kpnet_dropout=0.0,
	                 kpnet_nonlinear_activation="LeakyReLU",
	                 kpnet_nonlinear_activation_params=None
	                 ):
	        """Build the predictor network.

	        Args:
	            cond_channels (int): number of channels of the conditioning sequence.
	            conv_in_channels (int): input channels of the predicted kernels.
	            conv_out_channels (int): output channels of the predicted kernels.
	            conv_layers (int): number of layers to predict kernels/biases for.
	            conv_kernel_size (int): kernel size of the predicted kernels.
	            kpnet_hidden_channels (int): hidden width of the predictor.
	            kpnet_conv_size (int): kernel size of the predictor's residual and
	                output convolutions (the input convolution always uses 5).
	            kpnet_dropout (float): dropout probability in the residual stack.
	            kpnet_nonlinear_activation (str): name of a ``torch.nn``
	                activation class.
	            kpnet_nonlinear_activation_params (dict | None): keyword arguments
	                for that activation; ``None`` means
	                ``{"negative_slope": 0.1}``.
	        """
	        super().__init__()

	        # None sentinel instead of a dict default shared across all calls.
	        if kpnet_nonlinear_activation_params is None:
	            kpnet_nonlinear_activation_params = {"negative_slope": 0.1}

	        self.conv_in_channels = conv_in_channels
	        self.conv_out_channels = conv_out_channels
	        self.conv_kernel_size = conv_kernel_size
	        self.conv_layers = conv_layers

	        # Channel counts of the flattened kernel / bias predictions.
	        l_w = conv_in_channels * conv_out_channels * conv_kernel_size * conv_layers
	        l_b = conv_out_channels * conv_layers

	        padding = (kpnet_conv_size - 1) // 2

	        def make_activation():
	            # A fresh activation module for every use.
	            return getattr(torch.nn, kpnet_nonlinear_activation)(
	                **kpnet_nonlinear_activation_params)

	        def make_hidden_conv():
	            # Hidden-to-hidden convolution that preserves sequence length.
	            return torch.nn.Conv1d(kpnet_hidden_channels, kpnet_hidden_channels,
	                                   kpnet_conv_size, padding=padding, bias=True)

	        self.input_conv = torch.nn.Sequential(
	            torch.nn.Conv1d(cond_channels, kpnet_hidden_channels, 5, padding=(5 - 1) // 2, bias=True),
	            make_activation(),
	        )

	        # Three identical groups of (dropout, conv, act, conv, act). Layer
	        # order and Sequential indices match the hand-written 15-layer
	        # stack, so existing state_dict keys remain valid.
	        residual_layers = []
	        for _ in range(3):
	            residual_layers += [
	                torch.nn.Dropout(kpnet_dropout),
	                make_hidden_conv(),
	                make_activation(),
	                make_hidden_conv(),
	                make_activation(),
	            ]
	        self.residual_conv = torch.nn.Sequential(*residual_layers)

	        self.kernel_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_w, kpnet_conv_size,
	                                           padding=padding, bias=True)
	        self.bias_conv = torch.nn.Conv1d(kpnet_hidden_channels, l_b, kpnet_conv_size,
	                                         padding=padding, bias=True)

	    def forward(self, c):
	        """Predict kernels and biases from the conditioning sequence.

	        Args:
	            c (Tensor): the conditioning sequence
	                (batch, cond_channels, cond_length).

	        Returns:
	            tuple: ``(kernels, bias)`` where ``kernels`` has shape
	            (batch, conv_layers, conv_in_channels, conv_out_channels,
	            conv_kernel_size, cond_length) and ``bias`` has shape
	            (batch, conv_layers, conv_out_channels, cond_length).
	        """
	        batch, cond_channels, cond_length = c.shape

	        c = self.input_conv(c)
	        c = c + self.residual_conv(c)  # residual connection around the stack
	        k = self.kernel_conv(c)
	        b = self.bias_conv(c)

	        # Split the flat channel axis into per-layer kernel / bias tensors.
	        kernels = k.contiguous().view(batch,
	                                      self.conv_layers,
	                                      self.conv_in_channels,
	                                      self.conv_out_channels,
	                                      self.conv_kernel_size,
	                                      cond_length)
	        bias = b.contiguous().view(batch,
	                                   self.conv_layers,
	                                   self.conv_out_channels,
	                                   cond_length)
	        return kernels, bias