{
  "_target_": "modules.v2.vc_wrapper.VoiceConversionWrapper",
  "sr": 22050,
  "hop_size": 256,
  "mel_fn": {
    "_target_": "modules.audio.mel_spectrogram",
    "_partial_": true,
    "n_fft": 1024,
    "win_size": 1024,
    "hop_size": 256,
    "num_mels": 80,
    "sampling_rate": 22050,
    "fmin": 0,
    "fmax": null,
    "center": false
  },
  "cfm": {
    "_target_": "modules.v2.cfm.CFM",
    "estimator": {
      "_target_": "modules.v2.dit_wrapper.DiT",
      "time_as_token": true,
      "style_as_token": true,
      "uvit_skip_connection": false,
      "block_size": 8192,
      "depth": 13,
      "num_heads": 8,
      "hidden_dim": 512,
      "in_channels": 80,
      "content_dim": 512,
      "style_encoder_dim": 192,
      "class_dropout_prob": 0.1,
      "dropout_rate": 0.0,
      "attn_dropout_rate": 0.0
    }
  },
  "cfm_length_regulator": {
    "_target_": "modules.v2.length_regulator.InterpolateRegulator",
    "channels": 512,
    "is_discrete": true,
    "codebook_size": 2048,
    "sampling_ratios": [
      1,
      1,
      1,
      1
    ],
    "f0_condition": false
  },
  "ar": {
    "_target_": "modules.v2.ar.NaiveWrapper",
    "model": {
      "_target_": "modules.v2.ar.NaiveTransformer",
      "config": {
        "_target_": "modules.v2.ar.NaiveModelArgs",
        "dropout": 0.0,
        "rope_base": 10000.0,
        "dim": 768,
        "head_dim": 64,
        "n_local_heads": 2,
        "intermediate_size": 2304,
        "n_head": 12,
        "n_layer": 12,
        "vocab_size": 2049
      }
    }
  },
  "ar_length_regulator": {
    "_target_": "modules.v2.length_regulator.InterpolateRegulator",
    "channels": 768,
    "is_discrete": true,
    "codebook_size": 32,
    "sampling_ratios": [],
    "f0_condition": false
  },
  "style_encoder": {
    "_target_": "modules.campplus.DTDNN.CAMPPlus",
    "feat_dim": 80,
    "embedding_size": 192
  },
  "content_extractor_narrow": {
    "_target_": "modules.astral_quantization.default_model.AstralQuantizer",
    "tokenizer_name": "openai/whisper-small",
    "ssl_model_name": "facebook/hubert-large-ll60k",
    "ssl_output_layer": 18,
    "skip_ssl": true,
    "encoder": {
      "_target_": "modules.astral_quantization.convnext.ConvNeXtV2Stage",
      "dim": 512,
      "num_blocks": 12,
      "intermediate_dim": 1536,
      "dilation": 1,
      "input_dim": 1024
    },
    "quantizer": {
      "_target_": "modules.astral_quantization.bsq.BinarySphericalQuantize",
      "codebook_size": 32,
      "dim": 512,
      "entropy_loss_weight": 0.1,
      "diversity_gamma": 1.0,
      "spherical": true,
      "enable_entropy_loss": true,
      "soft_entropy_loss": true
    }
  },
  "content_extractor_wide": {
    "_target_": "modules.astral_quantization.default_model.AstralQuantizer",
    "tokenizer_name": "openai/whisper-small",
    "ssl_model_name": "facebook/hubert-large-ll60k",
    "ssl_output_layer": 18,
    "encoder": {
      "_target_": "modules.astral_quantization.convnext.ConvNeXtV2Stage",
      "dim": 512,
      "num_blocks": 12,
      "intermediate_dim": 1536,
      "dilation": 1,
      "input_dim": 1024
    },
    "quantizer": {
      "_target_": "modules.astral_quantization.bsq.BinarySphericalQuantize",
      "codebook_size": 2048,
      "dim": 512,
      "entropy_loss_weight": 0.1,
      "diversity_gamma": 1.0,
      "spherical": true,
      "enable_entropy_loss": true,
      "soft_entropy_loss": true
    }
  },
  "vocoder": {
    "_target_": "modules.bigvgan.bigvgan.BigVGAN.from_pretrained",
    "pretrained_model_name_or_path": "nvidia/bigvgan_v2_22khz_80band_256x",
    "use_cuda_kernel": false
  }
}