Spaces:

ameythakur
/

Deepfake-Audio

Running

App Files Files Community

Deepfake-Audio / Source Code /vocoder /hparams.py

ameythakur

Deepfake-Audio

1d8403e verified about 2 months ago

raw

history blame contribute delete

3.22 kB

	# ==================================================================================================
	# DEEPFAKE AUDIO - vocoder/hparams.py (Hyperparameter Configuration)
	# ==================================================================================================
	#
	# 📝 DESCRIPTION
	# This module defines the architectural and training hyperparameters for the
	# WaveRNN vocoder. It ensures consistent audio settings between the
	# synthesizer and vocoder, managing bit-depth, sampling rates, and neural
	# layer dimensions.
	#
	# 👤 AUTHORS
	# - Amey Thakur (https://github.com/Amey-Thakur)
	# - Mega Satish (https://github.com/msatmod)
	#
	# 🤝🏻 CREDITS
	# Original Real-Time Voice Cloning methodology by CorentinJ
	# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
	#
	# 🔗 PROJECT LINKS
	# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
	# Video Demo: https://youtu.be/i3wnBcbHDbs
	# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
	#
	# 📜 LICENSE
	# Released under the MIT License
	# Release Date: 2021-02-06
	# ==================================================================================================

	from synthesizer.hparams import hparams as _syn_hp

	# Signal conditioning: every value here is read straight from the synthesizer's
	# hparams object so the two stages cannot drift apart. Four names are renamed
	# on the way in: hop_size -> hop_length, win_size -> win_length,
	# max_abs_value -> mel_max_abs_value, preemphasize -> apply_preemphasis.

	# Sampling rate and STFT framing
	sample_rate = _syn_hp.sample_rate
	n_fft = _syn_hp.n_fft
	win_length = _syn_hp.win_size
	hop_length = _syn_hp.hop_size

	# Mel filterbank
	num_mels = _syn_hp.num_mels
	fmin = _syn_hp.fmin

	# Decibel reference/floor and the bound applied to mel magnitudes
	ref_level_db = _syn_hp.ref_level_db
	min_level_db = _syn_hp.min_level_db
	mel_max_abs_value = _syn_hp.max_abs_value

	# Pre-emphasis settings (enable switch first, then the filter value)
	apply_preemphasis = _syn_hp.preemphasize
	preemphasis = _syn_hp.preemphasis

	# Quantization of the waveform samples handled by the vocoder.
	# mu_law: use non-linear (mu-law) quantization, which preserves dynamic range.
	mu_law = True
	# bits: bit depth of the quantized signal; 9 bits gives 2**9 = 512 levels.
	bits = 9

	# Neural architecture: structural parameters of the WaveRNN vocoder.

	# Synthesis mode: 'RAW' (softmax output) or 'MOL' (logistic mixture output).
	voc_mode = 'RAW'

	# Upsampling factors. Their product (5 * 5 * 8 = 200) must equal hop_length.
	voc_upsample_factors = (5, 5, 8)

	# Hidden size of the GRU, and width of the fully connected layers.
	voc_rnn_dims = 512
	voc_fc_dims = 512

	# Residual sub-network: depth, then channel sizes.
	voc_res_blocks = 10
	# Presumably the internal channel width of the residual network, as in the
	# upstream WaveRNN implementation - TODO confirm against the model definition.
	voc_compute_dims = 128
	# Output channels of the residual block stack.
	voc_res_out_dims = 128

	# Training protocol: optimisation and sampling defaults.

	# Learning rate and batch size.
	voc_lr = 1e-4
	voc_batch_size = 100

	# Number of samples generated at each checkpoint for qualitative assessment.
	voc_gen_at_checkpoint = 5

	# Padding that widens what the ResNet can see beyond the training window;
	# presumably counted in mel frames on each side - TODO confirm in the
	# dataset/model code.
	voc_pad = 2

	# Length of the training window; kept a whole multiple of hop_length.
	voc_seq_len = 5 * hop_length

	# Inference strategy: how waveforms are generated at synthesis time.

	# Generate in parallel batches (much faster than one long sequential pass).
	voc_gen_batched = True
	# Chunk size used when splitting the output for batched inference.
	voc_target = 8000
	# Overlap between neighbouring chunks, crossfaded to avoid boundary artifacts.
	voc_overlap = 400