Spaces:
Running
Running
| # ================================================================================================== | |
| # DEEPFAKE AUDIO - vocoder/hparams.py (Hyperparameter Configuration) | |
| # ================================================================================================== | |
| # | |
| # DESCRIPTION | |
| # This module defines the architectural and training hyperparameters for the | |
| # WaveRNN vocoder. It ensures consistent audio settings between the | |
| # synthesizer and vocoder, managing bit-depth, sampling rates, and neural | |
| # layer dimensions. | |
| # | |
| # AUTHORS | |
| # - Amey Thakur (https://github.com/Amey-Thakur) | |
| # - Mega Satish (https://github.com/msatmod) | |
| # | |
| # CREDITS | |
| # Original Real-Time Voice Cloning methodology by CorentinJ | |
| # Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning | |
| # | |
| # PROJECT LINKS | |
| # Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO | |
| # Video Demo: https://youtu.be/i3wnBcbHDbs | |
| # Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb | |
| # | |
| # LICENSE | |
| # Released under the MIT License | |
| # Release Date: 2021-02-06 | |
| # ================================================================================================== | |
| from synthesizer.hparams import hparams as _syn_hp | |
# --------------------------------------------------------------------------
# Audio front-end
# --------------------------------------------------------------------------
# Everything in this group is read from the synthesizer's hparams object
# instead of being redefined here, so the two modules cannot drift apart.
# Where the vocoder-side name differs from the synthesizer attribute, the
# source attribute is noted on the right.
sample_rate = _syn_hp.sample_rate
n_fft = _syn_hp.n_fft
num_mels = _syn_hp.num_mels
hop_length = _syn_hp.hop_size               # synthesizer attribute: hop_size
win_length = _syn_hp.win_size               # synthesizer attribute: win_size
fmin = _syn_hp.fmin
min_level_db = _syn_hp.min_level_db
ref_level_db = _syn_hp.ref_level_db
mel_max_abs_value = _syn_hp.max_abs_value   # synthesizer attribute: max_abs_value
preemphasis = _syn_hp.preemphasis
apply_preemphasis = _syn_hp.preemphasize    # synthesizer attribute: preemphasize

# --------------------------------------------------------------------------
# Waveform quantization
# --------------------------------------------------------------------------
bits = 9        # bit depth of the quantized output signal
mu_law = True   # turn on mu-law (non-linear) quantization
                # NOTE(review): presumably only meaningful when voc_mode is
                # 'RAW' -- confirm against the vocoder audio/model code.

# --------------------------------------------------------------------------
# WaveRNN model structure
# --------------------------------------------------------------------------
# Output mode selector; the two documented values are 'RAW' (softmax output)
# and 'MOL' (mixture of logistics).
voc_mode = 'RAW'

# Per-stage upsampling factors. Their product (5 * 5 * 8 = 200) has to match
# hop_length; nothing in this module enforces that, so keep them in sync with
# the synthesizer's hop_size by hand.
voc_upsample_factors = (5, 5, 8)

voc_rnn_dims = 512       # recurrent (GRU) hidden size
voc_fc_dims = 512        # fully connected layer size
voc_compute_dims = 128   # NOTE(review): presumably the internal channel width
                         # of the residual network -- confirm in the model code
voc_res_out_dims = 128   # output channels of the residual network
voc_res_blocks = 10      # number of residual blocks

# --------------------------------------------------------------------------
# Training defaults
# --------------------------------------------------------------------------
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5   # how many samples to generate at each checkpoint

# NOTE(review): presumably the number of extra context frames padded around
# the mel input for the residual network -- confirm against the vocoder
# dataset/model code.
voc_pad = 2

# Training window length; defined as a whole number of hops so that it stays
# a multiple of hop_length.
voc_seq_len = 5 * hop_length

# --------------------------------------------------------------------------
# Inference defaults
# --------------------------------------------------------------------------
voc_gen_batched = True   # use batched (parallel) generation
voc_target = 8000        # chunk size for batched generation
                         # (presumably in samples -- TODO confirm)
voc_overlap = 400        # overlap crossfaded between neighbouring chunks
                         # (presumably in samples -- TODO confirm)