TTSIE / Vocoder_Layers.py
masterofaudio2077's picture
Upload 13 files
c39b616 verified
Raw
History Blame Contribute Delete
2.22 kB
from Imports import *
from Configuration import *
def ResBlock_v1(x, filters, kernel_size, dilation_rates=(1, 3, 5)):
    """
    Residual block in the HiFi-GAN style: one residual cycle per dilation rate.

    Each cycle applies, in order:
      1. LeakyReLU(0.1) followed by a Conv1D dilated by the current rate,
      2. LeakyReLU(0.1) followed by a Conv1D with dilation 1,
      3. an addition of the cycle's input back onto the result.

    Args:
        x: Input Keras tensor. NOTE(review): the residual addition presumably
            needs `x` to already carry `filters` channels -- confirm at the
            call sites.
        filters: Number of output filters for every Conv1D in the block.
        kernel_size: Kernel width shared by every Conv1D in the block.
        dilation_rates: Dilation used by the first convolution of each cycle;
            the number of entries sets the number of cycles.

    Returns:
        The tensor produced after the last cycle (`x` itself if
        `dilation_rates` is empty).
    """
    out = x
    for rate in dilation_rates:
        # Dilated convolution (widens the receptive field by `rate`).
        branch = layers.LeakyReLU(0.1)(out)
        branch = layers.Conv1D(
            filters, kernel_size, dilation_rate=rate, padding='same'
        )(branch)

        # Plain (dilation 1) convolution.
        branch = layers.LeakyReLU(0.1)(branch)
        branch = layers.Conv1D(
            filters, kernel_size, dilation_rate=1, padding='same'
        )(branch)

        # Skip connection: convolved branch plus this cycle's input.
        out = branch + out
    return out
def MRF_Module(x, filters):
    """
    Multi-Receptive Field Fusion.

    Feeds the same input through three ResBlock_v1 branches that differ only
    in kernel size (3, 7 and 11; all use dilation rates (1, 3, 5)) and
    returns the element-wise mean of the three branch outputs.

    Args:
        x: Input Keras tensor shared by all branches.
        filters: Filter count forwarded to every branch.

    Returns:
        The averaged branch output.
    """
    # Branches are built in ascending kernel-size order.
    small, medium, large = [
        ResBlock_v1(x, filters, kernel_size=width, dilation_rates=(1, 3, 5))
        for width in (3, 7, 11)
    ]
    # Sum the parallel paths, then divide by the number of paths.
    return (small + medium + large) / 3
def build_generator(
    input_shape=(None, 80),
    *,
    upsample_rates=(8, 8, 2, 2),
    upsample_kernels=(16, 16, 4, 4),
    channels=(256, 128, 64, 32),
    initial_channels=512,
):
    """
    Build the generator as a Keras functional model.

    Pipeline: an initial Conv1D (kernel 7), then one stage per upsampling
    rate (LeakyReLU(0.1) -> Conv1DTranspose -> MRF_Module), then a final
    LeakyReLU(0.1) and a single-filter Conv1D (kernel 7) with tanh activation.

    The defaults reproduce the previously hard-coded configuration, which the
    original comments describe as the standard HiFi-GAN ratios [8, 8, 2, 2]
    and channels [256, 128, 64, 32].

    Args:
        input_shape: Per-example input shape passed to `Input`; the default
            `(None, 80)` is a variable-length sequence of 80-dim mel frames.
        upsample_rates: Stride of each Conv1DTranspose stage.
        upsample_kernels: Kernel size of each Conv1DTranspose stage.
        channels: Output filters of each stage (also used for its MRF module).
        initial_channels: Filters of the initial convolution.

    Returns:
        A `keras.models.Model` named "HiFiGAN_Generator_V1" mapping the mel
        input to the single-channel tanh output.

    Raises:
        ValueError: If the three per-stage settings differ in length.
    """
    # Materialise once so any iterable works and lengths can be compared.
    upsample_rates = tuple(upsample_rates)
    upsample_kernels = tuple(upsample_kernels)
    channels = tuple(channels)
    if not (len(upsample_rates) == len(upsample_kernels) == len(channels)):
        # zip() below would otherwise silently drop the unmatched stages.
        raise ValueError(
            "upsample_rates, upsample_kernels and channels must have the "
            f"same length, got {len(upsample_rates)}, "
            f"{len(upsample_kernels)} and {len(channels)}"
        )

    mel_input = keras.layers.Input(shape=input_shape)  # [B, T_mel, 80] by default

    # Initial convolution
    x = layers.Conv1D(initial_channels, kernel_size=7, padding='same')(mel_input)

    for rate, kernel, width in zip(upsample_rates, upsample_kernels, channels):
        x = layers.LeakyReLU(0.1)(x)
        # Upsample the time axis by `rate`
        x = layers.Conv1DTranspose(
            width,
            kernel_size=kernel,
            strides=rate,
            padding='same',
        )(x)
        # Apply Multi-Receptive Field Fusion (parallel residual blocks)
        x = MRF_Module(x, width)

    # Final output layer
    x = layers.LeakyReLU(0.1)(x)
    x = layers.Conv1D(1, kernel_size=7, padding='same', activation='tanh')(x)
    return keras.models.Model(mel_input, x, name="HiFiGAN_Generator_V1")