# ==================================================================================================
# DEEPFAKE AUDIO - vocoder/hparams.py (Hyperparameter Configuration)
# ==================================================================================================
#
# π DESCRIPTION
# This module defines the architectural and training hyperparameters for the
# WaveRNN vocoder. It ensures consistent audio settings between the
# synthesizer and vocoder, managing bit-depth, sampling rates, and neural
# layer dimensions.
#
# π€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# π€π» CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# π PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# π LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from synthesizer.hparams import hparams as _synth

# --- Audio signal settings (mirrored from the synthesizer so both stages agree) ---
sample_rate = _synth.sample_rate
n_fft = _synth.n_fft
num_mels = _synth.num_mels
hop_length = _synth.hop_size
win_length = _synth.win_size
fmin = _synth.fmin
min_level_db = _synth.min_level_db
ref_level_db = _synth.ref_level_db
mel_max_abs_value = _synth.max_abs_value
preemphasis = _synth.preemphasis
apply_preemphasis = _synth.preemphasize

# --- PCM quantization ---
bits = 9                          # output bit depth (9-bit is typical with mu-law companding)
mu_law = True                     # companded quantization keeps dynamic range at low bit depths

# --- WaveRNN architecture ---
voc_mode = 'RAW'                  # 'RAW' -> softmax over quantized samples, 'MOL' -> mixture of logistics
voc_upsample_factors = (5, 5, 8)  # transposed-conv upsampling stages; product must equal hop_length
voc_rnn_dims = 512                # GRU hidden size
voc_fc_dims = 512                 # fully connected layer width
voc_compute_dims = 128            # embedding size for categorical conditioning inputs
voc_res_out_dims = 128            # residual block output channels
voc_res_blocks = 10               # number of residual blocks

# --- Training ---
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5         # audio samples generated per checkpoint for listening tests
voc_pad = 2                       # input padding covering the resnet's receptive field
voc_seq_len = hop_length * 5      # optimization window; must be a multiple of hop_length

# --- Batched inference ---
voc_gen_batched = True            # generate in parallel chunks for real-time speed
voc_target = 8000                 # samples per chunk during batched generation
voc_overlap = 400                 # crossfaded overlap between chunks to hide boundary artifacts