Voice-Clone-Multilingual

Running

App Files Files Community

Voice-Clone-Multilingual / TTS /vocoder /configs /wavernn_config.py

Shadhil

voice-clone with single audio sample input

9b2107c over 2 years ago

raw

history blame contribute delete

4.55 kB

	from dataclasses import dataclass, field

	from TTS.vocoder.configs.shared_configs import BaseVocoderConfig
	from TTS.vocoder.models.wavernn import WavernnArgs


	@dataclass
	class WavernnConfig(BaseVocoderConfig):
	    """Configuration values for the WaveRNN vocoder.

	    Every field has a default, so the config can be created without arguments.

	    Example:

	        >>> from TTS.vocoder.configs import WavernnConfig
	        >>> config = WavernnConfig()

	    Args:
	        model (str):
	            Model name used for selecting the right model at initialization. Defaults to `wavernn`.
	        model_args (WavernnArgs):
	            Model specific parameters. A fresh `WavernnArgs()` is created for each config instance when no
	            value is given.
	        target_loss (str):
	            Defaults to `loss`. NOTE(review): presumably the name of the loss value the trainer monitors;
	            confirm against the trainer code.
	        batched (bool):
	            enable / disable the batched inference. It speeds up the inference by splitting the input into
	            segments and processing the segments in a batch. Then it merges the outputs with a certain overlap
	            and smoothing. If you set it False, without CUDA, it is too slow to be practical. Defaults to True.
	        target_samples (int):
	            Size of the segments in batched mode. Defaults to 11000.
	        overlap_samples (int):
	            Size of the overlap between consecutive segments. Defaults to 550.
	        epochs (int):
	            Number of training epochs. Defaults to 10000.
	        batch_size (int):
	            Batch size used at training. Larger values use more memory. Defaults to 256.
	        seq_len (int):
	            Audio segment length used at training. Larger values use more memory. Defaults to 1280.
	        use_noise_augment (bool):
	            enable / disable random noise added to the input waveform. The noise is added after computing the
	            features. Defaults to False.
	        use_cache (bool):
	            enable / disable in memory caching of the computed features. It can cause OOM error if the system
	            RAM is not large enough. Defaults to True.
	        mixed_precision (bool):
	            enable / disable mixed precision training. Defaults to True.
	        eval_split_size (int):
	            Number of samples used for evaluation. Defaults to 50.
	        num_epochs_before_test (int):
	            Number of epochs waited to run the next evaluation. Since inference takes some time, it is better
	            to wait some number of epochs not to waste training time. Defaults to 10.
	        grad_clip (float):
	            Gradient clipping threshold. If <= 0.0, no clipping is applied. Defaults to 4.0.
	        lr (float):
	            Initial learning rate. Defaults to 1e-4.
	        lr_scheduler (str):
	            One of the learning rate schedulers from `torch.optim.lr_scheduler.*`. Defaults to `MultiStepLR`.
	        lr_scheduler_params (dict):
	            kwargs for the scheduler. Defaults to `{"gamma": 0.5, "milestones": [200000, 400000, 600000]}`

	    Note:
	        Earlier documentation of this class also described the options below. None of them is declared as
	        a field in this class body; presumably they are now provided through `model_args` or the base
	        config. TODO confirm.

	        mode (str):
	            Output mode of the WaveRNN vocoder. `mold` for Mixture of Logistic Distribution, `gauss` for a
	            single Gaussian Distribution and `bits` for quantized bits as the model's output.
	        mulaw (bool):
	            enable / disable the use of Mulaw quantization for training. Only applicable if `mode == 'bits'`.
	            Defaults to `True`.
	        generator_model (str):
	            One of the generators from `TTS.vocoder.models.*`. Every other non-GAN vocoder model is
	            considered as a generator too. Defaults to `WaveRNN`.
	        wavernn_model_params (dict):
	            kwargs for the WaveRNN model. Defaults to
	            `{
	                "rnn_dims": 512,
	                "fc_dims": 512,
	                "compute_dims": 128,
	                "res_out_dims": 128,
	                "num_res_blocks": 10,
	                "use_aux_net": True,
	                "use_upsample_net": True,
	                "upsample_factors": [4, 8, 8]
	            }`
	    """

	    model: str = "wavernn"

	    # ---- model specific parameters ----
	    model_args: WavernnArgs = field(default_factory=WavernnArgs)
	    target_loss: str = "loss"

	    # ---- inference ----
	    batched: bool = True
	    target_samples: int = 11000
	    overlap_samples: int = 550

	    # ---- training (overrides) ----
	    epochs: int = 10000
	    batch_size: int = 256
	    seq_len: int = 1280
	    use_noise_augment: bool = False
	    use_cache: bool = True
	    mixed_precision: bool = True
	    eval_split_size: int = 50
	    # Number of epochs to wait until the next test run (synthesizing a full audio clip).
	    num_epochs_before_test: int = 10

	    # ---- optimizer (overrides) ----
	    grad_clip: float = 4.0
	    # Initial learning rate.
	    lr: float = 1e-4
	    # One of the schedulers from https://pytorch.org/docs/stable/optim.html
	    lr_scheduler: str = "MultiStepLR"
	    # The factory gives every config instance its own dict (and milestone list).
	    lr_scheduler_params: dict = field(
	        default_factory=lambda: {
	            "gamma": 0.5,
	            "milestones": [200000, 400000, 600000],
	        }
	    )