# ==================================================================================================
# DEEPFAKE AUDIO - vocoder/hparams.py (Hyperparameter Configuration)
# ==================================================================================================
#
# π DESCRIPTION
# This module defines the architectural and training hyperparameters for the
# WaveRNN vocoder. It ensures consistent audio settings between the
# synthesizer and vocoder, managing bit-depth, sampling rates, and neural
# layer dimensions.
#
# π€ AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# π€π» CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# π PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# π LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================
from synthesizer.hparams import hparams as _synth

# --- Audio signal settings (mirrored from the synthesizer so both stages agree) ---
sample_rate = _synth.sample_rate
n_fft = _synth.n_fft
num_mels = _synth.num_mels
hop_length = _synth.hop_size
win_length = _synth.win_size
fmin = _synth.fmin
min_level_db = _synth.min_level_db
ref_level_db = _synth.ref_level_db
mel_max_abs_value = _synth.max_abs_value
preemphasis = _synth.preemphasis
apply_preemphasis = _synth.preemphasize

# --- PCM quantization ---
bits = 9                          # output bit depth (9-bit is typical with mu-law companding)
mu_law = True                     # companded quantization keeps dynamic range at low bit depths

# --- WaveRNN architecture ---
voc_mode = 'RAW'                  # 'RAW' -> softmax over quantized samples, 'MOL' -> mixture of logistics
voc_upsample_factors = (5, 5, 8)  # transposed-conv upsampling stages; product must equal hop_length
voc_rnn_dims = 512                # GRU hidden size
voc_fc_dims = 512                 # fully connected layer width
voc_compute_dims = 128            # embedding size for categorical conditioning inputs
voc_res_out_dims = 128            # residual block output channels
voc_res_blocks = 10               # number of residual blocks

# --- Training ---
voc_batch_size = 100
voc_lr = 1e-4
voc_gen_at_checkpoint = 5         # audio samples generated per checkpoint for listening tests
voc_pad = 2                       # input padding covering the resnet's receptive field
voc_seq_len = hop_length * 5      # optimization window; must be a multiple of hop_length

# --- Batched inference ---
voc_gen_batched = True            # generate in parallel chunks for real-time speed
voc_target = 8000                 # samples per chunk during batched generation
voc_overlap = 400                 # crossfaded overlap between chunks to hide boundary artifacts