| | |
| | |
| | |
| | |
| |
|
| | import torch |
| | import torch.nn as nn |
| | import torch.nn.functional as F |
| | from torch.nn.utils import spectral_norm |
| | from modules.generic.conv import Conv1d |
| |
|
| |
|
class ConvEncoder(nn.Module):
    """Encoder made of an input convolution and a chain of dilated conv blocks.

    The input is first projected to ``hidden_channels`` by a kernel-size-3
    convolution, then passed through ``num_dilation_layer`` blocks whose
    dilation doubles at every step (1, 2, 4, ...). Each block also receives
    the conditioning tensors ``z`` and ``s``.

    Args:
        in_channels: Channel count expected by the input convolution.
        z_channels: Size of the latent conditioning vector ``z``.
        spk_channels: Size of the speaker conditioning vector ``s``.
        num_dilation_layer: Number of dilated blocks to stack.
        hidden_channels: Width used by the input convolution and by every
            dilated block. Defaults to 512, the previously hard-coded value.
    """

    def __init__(self, in_channels, z_channels, spk_channels,
                 num_dilation_layer=10, hidden_channels=512):
        super().__init__()

        self.in_channels = in_channels
        self.z_channels = z_channels
        self.spk_channels = spk_channels
        self.hidden_channels = hidden_channels

        self.pre_process = Conv1d(in_channels, hidden_channels, kernel_size=3)

        # Dilation grows exponentially with depth: 2**0, 2**1, ...
        self.dilated_conv_layers = nn.ModuleList([
            DilatedConvBlock(
                hidden_channels, hidden_channels, z_channels, spk_channels, 2 ** i
            )
            for i in range(num_dilation_layer)
        ])

    def forward(self, inputs, z, s):
        """Encode ``inputs`` conditioned on ``z`` and ``s``.

        Args:
            inputs: Tensor whose dims 1 and 2 are swapped before the first
                convolution (presumably (batch, time, in_channels) — confirm
                against the project ``Conv1d``).
            z: Conditioning tensor forwarded to every dilated block.
            s: Conditioning tensor forwarded to every dilated block.

        Returns:
            A single tensor: the output of the last block with dims 1 and 2
            swapped back.
        """
        hidden = self.pre_process(inputs.transpose(1, 2))
        for block in self.dilated_conv_layers:
            hidden = block(hidden, z, s)

        return hidden.transpose(1, 2)
| |
|
| |
|
class DilatedConvBlock(nn.Module):
    """One dilated convolution followed by conditional batch norm and ReLU.

    Args:
        in_channels: Input channels of the convolution.
        out_channels: Output channels of the convolution; also the feature
            count normalised by the batch-norm layer.
        z_channels: Size of the ``z`` conditioning vector.
        s_channels: Size of the ``s`` conditioning vector.
        dilation: Dilation of the kernel-size-3 convolution.
    """

    def __init__(self, in_channels, out_channels, z_channels, s_channels, dilation):
        super().__init__()

        # Keep the configuration around for introspection.
        self.in_channels, self.out_channels = in_channels, out_channels
        self.z_channels, self.s_channels = z_channels, s_channels

        self.conv1d = Conv1d(
            in_channels,
            out_channels,
            kernel_size=3,
            dilation=dilation,
        )
        # BatchNorm1dLayer takes the speaker size before the latent size.
        self.batch_layer = BatchNorm1dLayer(
            out_channels, s_channels=s_channels, z_channels=z_channels
        )

    def forward(self, inputs, z, s):
        """Apply convolution, conditional normalisation, then ReLU."""
        convolved = self.conv1d(inputs)
        normalised = self.batch_layer(convolved, z, s)
        activated = F.relu(normalised)
        return activated
| |
|
| |
|
class BatchNorm1dLayer(nn.Module):
    """Batch normalisation whose scale and shift come from conditioning inputs.

    The input is normalised by a non-affine ``nn.BatchNorm1d``. A per-feature
    scale is then predicted from ``z`` and a per-feature shift from ``s``,
    each through a spectrally normalised linear layer.

    Args:
        num_features: Number of features normalised (dim 1 of the input).
        s_channels: Size of the ``s`` vector that produces the shift.
        z_channels: Size of the ``z`` vector that produces the scale.
    """

    def __init__(self, num_features, s_channels=128, z_channels=128):
        super().__init__()

        self.num_features = num_features
        self.s_channels = s_channels
        self.z_channels = z_channels

        # Attribute name (including its spelling) is part of the state_dict keys.
        self.batch_nrom = nn.BatchNorm1d(num_features, affine=False)

        # Scale first, then shift: creation order fixes both the parameter
        # order and the sequence of random draws during initialisation.
        self.scale_layer = self._conditioning_linear(z_channels, num_features)
        self.shift_layer = self._conditioning_linear(s_channels, num_features)

    @staticmethod
    def _conditioning_linear(cond_channels, num_features):
        """Build a spectrally normalised linear layer with N(1, 0.02) weights and zero bias."""
        projection = spectral_norm(nn.Linear(cond_channels, num_features))
        # NOTE(review): relies on spectral_norm still exposing ``weight`` as a
        # view of ``weight_orig`` at construction time — confirm when upgrading torch.
        projection.weight.data.normal_(1, 0.02)
        projection.bias.data.zero_()
        return projection

    def forward(self, inputs, z, s):
        """Normalise ``inputs`` and apply the scale from ``z`` and the shift from ``s``."""
        normalised = self.batch_nrom(inputs)

        # Both projections are reshaped to (-1, num_features, 1) so they
        # broadcast over the last dimension of the normalised input.
        target_shape = (-1, self.num_features, 1)
        gain = self.scale_layer(z).view(*target_shape)
        offset = self.shift_layer(s).view(*target_shape)

        return gain * normalised + offset
| |
|
| |
|
if __name__ == "__main__":
    # Smoke test: run one random batch through the encoder and print the
    # shape of the result.
    model = ConvEncoder(256, 64, 64)
    # forward() swaps dims 1 and 2 before the first convolution, so the
    # feature axis (256 == in_channels) goes last here.
    # NOTE(review): assumes the project Conv1d is channels-first like
    # nn.Conv1d — confirm.
    encoder_inputs = torch.randn(2, 10, 256)
    z = torch.randn(2, 64)
    speaker = torch.randn(1, 64)
    # forward() returns a single tensor. Unpacking it into two names would
    # silently split it along the batch dimension when the batch size is 2.
    outputs = model(encoder_inputs, z, speaker)
    print(outputs.shape)
| |
|