Upload folder using huggingface_hub
Browse files
russian_train_1/vc_wrapper.yaml
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
|
| 2 |
+
sr: 22050
|
| 3 |
+
hop_size: 256
|
| 4 |
+
mel_fn:
|
| 5 |
+
_target_: modules.audio.mel_spectrogram
|
| 6 |
+
_partial_: true
|
| 7 |
+
n_fft: 1024
|
| 8 |
+
win_size: 1024
|
| 9 |
+
hop_size: 256
|
| 10 |
+
num_mels: 80
|
| 11 |
+
sampling_rate: 22050
|
| 12 |
+
fmin: 0
|
| 13 |
+
fmax: null
|
| 14 |
+
center: False
|
| 15 |
+
cfm:
|
| 16 |
+
_target_: modules.v2.cfm.CFM
|
| 17 |
+
estimator:
|
| 18 |
+
_target_: modules.v2.dit_wrapper.DiT
|
| 19 |
+
time_as_token: true
|
| 20 |
+
style_as_token: true
|
| 21 |
+
uvit_skip_connection: false
|
| 22 |
+
block_size: 8192
|
| 23 |
+
depth: 13
|
| 24 |
+
num_heads: 8
|
| 25 |
+
hidden_dim: 512
|
| 26 |
+
in_channels: 80
|
| 27 |
+
content_dim: 512
|
| 28 |
+
style_encoder_dim: 192
|
| 29 |
+
class_dropout_prob: 0.1
|
| 30 |
+
dropout_rate: 0.0
|
| 31 |
+
attn_dropout_rate: 0.0
|
| 32 |
+
cfm_length_regulator:
|
| 33 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 34 |
+
channels: 512
|
| 35 |
+
is_discrete: true
|
| 36 |
+
codebook_size: 2048
|
| 37 |
+
sampling_ratios: [ 1, 1, 1, 1 ]
|
| 38 |
+
f0_condition: false
|
| 39 |
+
ar:
|
| 40 |
+
_target_: modules.v2.ar.NaiveWrapper
|
| 41 |
+
model:
|
| 42 |
+
_target_: modules.v2.ar.NaiveTransformer
|
| 43 |
+
config:
|
| 44 |
+
_target_: modules.v2.ar.NaiveModelArgs
|
| 45 |
+
dropout: 0.0
|
| 46 |
+
rope_base: 10000.0
|
| 47 |
+
dim: 768
|
| 48 |
+
head_dim: 64
|
| 49 |
+
n_local_heads: 2
|
| 50 |
+
intermediate_size: 2304
|
| 51 |
+
n_head: 12
|
| 52 |
+
n_layer: 12
|
| 53 |
+
vocab_size: 2049 # 2048 + 1 for eos
|
| 54 |
+
ar_length_regulator:
|
| 55 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 56 |
+
channels: 768
|
| 57 |
+
is_discrete: true
|
| 58 |
+
codebook_size: 32
|
| 59 |
+
sampling_ratios: [ ]
|
| 60 |
+
f0_condition: false
|
| 61 |
+
style_encoder:
|
| 62 |
+
_target_: modules.campplus.DTDNN.CAMPPlus
|
| 63 |
+
feat_dim: 80
|
| 64 |
+
embedding_size: 192
|
| 65 |
+
content_extractor_narrow:
|
| 66 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 67 |
+
tokenizer_name: "openai/whisper-small"
|
| 68 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 69 |
+
ssl_output_layer: 18
|
| 70 |
+
skip_ssl: true
|
| 71 |
+
encoder: &bottleneck_encoder
|
| 72 |
+
_target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
|
| 73 |
+
dim: 512
|
| 74 |
+
num_blocks: 12
|
| 75 |
+
intermediate_dim: 1536
|
| 76 |
+
dilation: 1
|
| 77 |
+
input_dim: 1024
|
| 78 |
+
quantizer:
|
| 79 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 80 |
+
codebook_size: 32 # codebook size, must be a power of 2
|
| 81 |
+
dim: 512
|
| 82 |
+
entropy_loss_weight: 0.1
|
| 83 |
+
diversity_gamma: 1.0
|
| 84 |
+
spherical: True
|
| 85 |
+
enable_entropy_loss: True
|
| 86 |
+
soft_entropy_loss: True
|
| 87 |
+
content_extractor_wide:
|
| 88 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 89 |
+
tokenizer_name: "openai/whisper-small"
|
| 90 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 91 |
+
ssl_output_layer: 18
|
| 92 |
+
encoder: *bottleneck_encoder
|
| 93 |
+
quantizer:
|
| 94 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 95 |
+
codebook_size: 2048 # codebook size, must be a power of 2
|
| 96 |
+
dim: 512
|
| 97 |
+
entropy_loss_weight: 0.1
|
| 98 |
+
diversity_gamma: 1.0
|
| 99 |
+
spherical: True
|
| 100 |
+
enable_entropy_loss: True
|
| 101 |
+
soft_entropy_loss: True
|
| 102 |
+
vocoder:
|
| 103 |
+
_target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
|
| 104 |
+
pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
|
| 105 |
+
use_cuda_kernel: false
|
russian_train_2/train.log
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Epoch 0, Iteration 0, Loss: 7.4123, Loss AR: 6.8261, Loss CFM: 0.5862, Grad Norm: 5.8548, LR: 0.000000
|
| 2 |
+
Epoch 0, Iteration 10, Loss: 7.1086, Loss AR: 6.5280, Loss CFM: 0.5806, Grad Norm: 7.3128, LR: 0.000020
|
| 3 |
+
Epoch 0, Iteration 20, Loss: 6.7630, Loss AR: 6.1732, Loss CFM: 0.5898, Grad Norm: 5.3300, LR: 0.000020
|
| 4 |
+
Epoch 0, Iteration 30, Loss: 6.6540, Loss AR: 6.0648, Loss CFM: 0.5893, Grad Norm: 7.9873, LR: 0.000020
|
| 5 |
+
Epoch 0, Iteration 40, Loss: 6.4128, Loss AR: 5.9118, Loss CFM: 0.5010, Grad Norm: 6.9926, LR: 0.000020
|
| 6 |
+
Epoch 0, Iteration 50, Loss: 6.3736, Loss AR: 5.8265, Loss CFM: 0.5471, Grad Norm: 6.1079, LR: 0.000020
|
| 7 |
+
Epoch 0, Iteration 60, Loss: 6.3835, Loss AR: 5.7620, Loss CFM: 0.6215, Grad Norm: 5.3433, LR: 0.000020
|
| 8 |
+
Epoch 0, Iteration 70, Loss: 6.3277, Loss AR: 5.7967, Loss CFM: 0.5310, Grad Norm: 5.4081, LR: 0.000020
|
| 9 |
+
Epoch 0, Iteration 80, Loss: 6.1539, Loss AR: 5.5468, Loss CFM: 0.6071, Grad Norm: 6.1189, LR: 0.000020
|
| 10 |
+
Epoch 0, Iteration 90, Loss: 6.1051, Loss AR: 5.5220, Loss CFM: 0.5830, Grad Norm: 5.2862, LR: 0.000020
|
| 11 |
+
Epoch 0, Iteration 100, Loss: 6.0231, Loss AR: 5.3957, Loss CFM: 0.6274, Grad Norm: 5.5014, LR: 0.000020
|
| 12 |
+
Epoch 0, Iteration 110, Loss: 5.9073, Loss AR: 5.3184, Loss CFM: 0.5889, Grad Norm: 6.4655, LR: 0.000020
|
| 13 |
+
Epoch 0, Iteration 120, Loss: 5.7812, Loss AR: 5.3116, Loss CFM: 0.4697, Grad Norm: 4.8613, LR: 0.000020
|
| 14 |
+
Epoch 0, Iteration 130, Loss: 5.7230, Loss AR: 5.2267, Loss CFM: 0.4963, Grad Norm: 4.9150, LR: 0.000020
|
| 15 |
+
Epoch 0, Iteration 140, Loss: 5.9952, Loss AR: 5.3835, Loss CFM: 0.6117, Grad Norm: 4.8557, LR: 0.000020
|
| 16 |
+
Epoch 0, Iteration 150, Loss: 5.7789, Loss AR: 5.1951, Loss CFM: 0.5839, Grad Norm: 5.9602, LR: 0.000020
|
| 17 |
+
Epoch 0, Iteration 160, Loss: 5.8195, Loss AR: 5.2459, Loss CFM: 0.5736, Grad Norm: 6.8558, LR: 0.000020
|
| 18 |
+
Epoch 0, Iteration 170, Loss: 5.6152, Loss AR: 5.0701, Loss CFM: 0.5452, Grad Norm: 4.2240, LR: 0.000020
|
| 19 |
+
Epoch 0, Iteration 180, Loss: 5.8292, Loss AR: 5.3408, Loss CFM: 0.4884, Grad Norm: 4.1121, LR: 0.000020
|
| 20 |
+
Epoch 0, Iteration 190, Loss: 6.0036, Loss AR: 5.3866, Loss CFM: 0.6170, Grad Norm: 6.9337, LR: 0.000020
|
| 21 |
+
Epoch 0, Iteration 200, Loss: 5.6125, Loss AR: 5.1298, Loss CFM: 0.4827, Grad Norm: 6.0064, LR: 0.000020
|
| 22 |
+
Epoch 0, Iteration 210, Loss: 5.8327, Loss AR: 5.2733, Loss CFM: 0.5593, Grad Norm: 5.4030, LR: 0.000020
|
| 23 |
+
Epoch 0, Iteration 220, Loss: 5.5699, Loss AR: 5.0621, Loss CFM: 0.5078, Grad Norm: 5.9616, LR: 0.000020
|
| 24 |
+
Epoch 0, Iteration 230, Loss: 5.8119, Loss AR: 5.1636, Loss CFM: 0.6483, Grad Norm: 8.4473, LR: 0.000020
|
| 25 |
+
Epoch 0, Iteration 240, Loss: 5.7326, Loss AR: 5.1655, Loss CFM: 0.5671, Grad Norm: 7.5353, LR: 0.000020
|
| 26 |
+
Epoch 0, Iteration 250, Loss: 5.7567, Loss AR: 5.2142, Loss CFM: 0.5425, Grad Norm: 5.6969, LR: 0.000020
|
| 27 |
+
Epoch 0, Iteration 260, Loss: 5.4423, Loss AR: 4.9438, Loss CFM: 0.4985, Grad Norm: 5.0690, LR: 0.000020
|
| 28 |
+
Epoch 0, Iteration 270, Loss: 5.6098, Loss AR: 5.0514, Loss CFM: 0.5585, Grad Norm: 5.7998, LR: 0.000020
|
| 29 |
+
Epoch 0, Iteration 280, Loss: 5.6351, Loss AR: 5.0776, Loss CFM: 0.5575, Grad Norm: 5.4499, LR: 0.000020
|
| 30 |
+
Epoch 0, Iteration 290, Loss: 5.6632, Loss AR: 5.0598, Loss CFM: 0.6034, Grad Norm: 5.3381, LR: 0.000020
|
| 31 |
+
Epoch 0, Iteration 300, Loss: 5.5276, Loss AR: 5.0196, Loss CFM: 0.5081, Grad Norm: 6.4791, LR: 0.000020
|
russian_train_2/vc_wrapper.yaml
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
|
| 2 |
+
sr: 22050
|
| 3 |
+
hop_size: 256
|
| 4 |
+
mel_fn:
|
| 5 |
+
_target_: modules.audio.mel_spectrogram
|
| 6 |
+
_partial_: true
|
| 7 |
+
n_fft: 1024
|
| 8 |
+
win_size: 1024
|
| 9 |
+
hop_size: 256
|
| 10 |
+
num_mels: 80
|
| 11 |
+
sampling_rate: 22050
|
| 12 |
+
fmin: 0
|
| 13 |
+
fmax: null
|
| 14 |
+
center: False
|
| 15 |
+
cfm:
|
| 16 |
+
_target_: modules.v2.cfm.CFM
|
| 17 |
+
estimator:
|
| 18 |
+
_target_: modules.v2.dit_wrapper.DiT
|
| 19 |
+
time_as_token: true
|
| 20 |
+
style_as_token: true
|
| 21 |
+
uvit_skip_connection: false
|
| 22 |
+
block_size: 8192
|
| 23 |
+
depth: 13
|
| 24 |
+
num_heads: 8
|
| 25 |
+
hidden_dim: 512
|
| 26 |
+
in_channels: 80
|
| 27 |
+
content_dim: 512
|
| 28 |
+
style_encoder_dim: 192
|
| 29 |
+
class_dropout_prob: 0.1
|
| 30 |
+
dropout_rate: 0.0
|
| 31 |
+
attn_dropout_rate: 0.0
|
| 32 |
+
cfm_length_regulator:
|
| 33 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 34 |
+
channels: 512
|
| 35 |
+
is_discrete: true
|
| 36 |
+
codebook_size: 2048
|
| 37 |
+
sampling_ratios: [ 1, 1, 1, 1 ]
|
| 38 |
+
f0_condition: false
|
| 39 |
+
ar:
|
| 40 |
+
_target_: modules.v2.ar.NaiveWrapper
|
| 41 |
+
model:
|
| 42 |
+
_target_: modules.v2.ar.NaiveTransformer
|
| 43 |
+
config:
|
| 44 |
+
_target_: modules.v2.ar.NaiveModelArgs
|
| 45 |
+
dropout: 0.0
|
| 46 |
+
rope_base: 10000.0
|
| 47 |
+
dim: 768
|
| 48 |
+
head_dim: 64
|
| 49 |
+
n_local_heads: 2
|
| 50 |
+
intermediate_size: 2304
|
| 51 |
+
n_head: 12
|
| 52 |
+
n_layer: 12
|
| 53 |
+
vocab_size: 2049 # 2048 + 1 for eos
|
| 54 |
+
ar_length_regulator:
|
| 55 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 56 |
+
channels: 768
|
| 57 |
+
is_discrete: true
|
| 58 |
+
codebook_size: 32
|
| 59 |
+
sampling_ratios: [ ]
|
| 60 |
+
f0_condition: false
|
| 61 |
+
style_encoder:
|
| 62 |
+
_target_: modules.campplus.DTDNN.CAMPPlus
|
| 63 |
+
feat_dim: 80
|
| 64 |
+
embedding_size: 192
|
| 65 |
+
content_extractor_narrow:
|
| 66 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 67 |
+
tokenizer_name: "openai/whisper-small"
|
| 68 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 69 |
+
ssl_output_layer: 18
|
| 70 |
+
skip_ssl: true
|
| 71 |
+
encoder: &bottleneck_encoder
|
| 72 |
+
_target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
|
| 73 |
+
dim: 512
|
| 74 |
+
num_blocks: 12
|
| 75 |
+
intermediate_dim: 1536
|
| 76 |
+
dilation: 1
|
| 77 |
+
input_dim: 1024
|
| 78 |
+
quantizer:
|
| 79 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 80 |
+
codebook_size: 32 # codebook size, must be a power of 2
|
| 81 |
+
dim: 512
|
| 82 |
+
entropy_loss_weight: 0.1
|
| 83 |
+
diversity_gamma: 1.0
|
| 84 |
+
spherical: True
|
| 85 |
+
enable_entropy_loss: True
|
| 86 |
+
soft_entropy_loss: True
|
| 87 |
+
content_extractor_wide:
|
| 88 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 89 |
+
tokenizer_name: "openai/whisper-small"
|
| 90 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 91 |
+
ssl_output_layer: 18
|
| 92 |
+
encoder: *bottleneck_encoder
|
| 93 |
+
quantizer:
|
| 94 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 95 |
+
codebook_size: 2048 # codebook size, must be a power of 2
|
| 96 |
+
dim: 512
|
| 97 |
+
entropy_loss_weight: 0.1
|
| 98 |
+
diversity_gamma: 1.0
|
| 99 |
+
spherical: True
|
| 100 |
+
enable_entropy_loss: True
|
| 101 |
+
soft_entropy_loss: True
|
| 102 |
+
vocoder:
|
| 103 |
+
_target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
|
| 104 |
+
pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
|
| 105 |
+
use_cuda_kernel: false
|
russian_train_3/AR_epoch_00000_step_10000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb478588c963544c0e665841923f397eb44ca05eae421ad062c9603768e97750
|
| 3 |
+
size 333673060
|
russian_train_3/CFM_epoch_00000_step_10000.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:699797b76af298ed84d4126b72535f9f4f7adf7f62bbed9e11bc748259bb02c2
|
| 3 |
+
size 352130283
|
russian_train_3/train.log
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
russian_train_3/vc_wrapper.yaml
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
_target_: modules.v2.vc_wrapper.VoiceConversionWrapper
|
| 2 |
+
sr: 22050
|
| 3 |
+
hop_size: 256
|
| 4 |
+
mel_fn:
|
| 5 |
+
_target_: modules.audio.mel_spectrogram
|
| 6 |
+
_partial_: true
|
| 7 |
+
n_fft: 1024
|
| 8 |
+
win_size: 1024
|
| 9 |
+
hop_size: 256
|
| 10 |
+
num_mels: 80
|
| 11 |
+
sampling_rate: 22050
|
| 12 |
+
fmin: 0
|
| 13 |
+
fmax: null
|
| 14 |
+
center: False
|
| 15 |
+
cfm:
|
| 16 |
+
_target_: modules.v2.cfm.CFM
|
| 17 |
+
estimator:
|
| 18 |
+
_target_: modules.v2.dit_wrapper.DiT
|
| 19 |
+
time_as_token: true
|
| 20 |
+
style_as_token: true
|
| 21 |
+
uvit_skip_connection: false
|
| 22 |
+
block_size: 8192
|
| 23 |
+
depth: 13
|
| 24 |
+
num_heads: 8
|
| 25 |
+
hidden_dim: 512
|
| 26 |
+
in_channels: 80
|
| 27 |
+
content_dim: 512
|
| 28 |
+
style_encoder_dim: 192
|
| 29 |
+
class_dropout_prob: 0.1
|
| 30 |
+
dropout_rate: 0.0
|
| 31 |
+
attn_dropout_rate: 0.0
|
| 32 |
+
cfm_length_regulator:
|
| 33 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 34 |
+
channels: 512
|
| 35 |
+
is_discrete: true
|
| 36 |
+
codebook_size: 2048
|
| 37 |
+
sampling_ratios: [ 1, 1, 1, 1 ]
|
| 38 |
+
f0_condition: false
|
| 39 |
+
ar:
|
| 40 |
+
_target_: modules.v2.ar.NaiveWrapper
|
| 41 |
+
model:
|
| 42 |
+
_target_: modules.v2.ar.NaiveTransformer
|
| 43 |
+
config:
|
| 44 |
+
_target_: modules.v2.ar.NaiveModelArgs
|
| 45 |
+
dropout: 0.0
|
| 46 |
+
rope_base: 10000.0
|
| 47 |
+
dim: 768
|
| 48 |
+
head_dim: 64
|
| 49 |
+
n_local_heads: 2
|
| 50 |
+
intermediate_size: 2304
|
| 51 |
+
n_head: 12
|
| 52 |
+
n_layer: 12
|
| 53 |
+
vocab_size: 2049 # 2048 + 1 for eos
|
| 54 |
+
ar_length_regulator:
|
| 55 |
+
_target_: modules.v2.length_regulator.InterpolateRegulator
|
| 56 |
+
channels: 768
|
| 57 |
+
is_discrete: true
|
| 58 |
+
codebook_size: 32
|
| 59 |
+
sampling_ratios: [ ]
|
| 60 |
+
f0_condition: false
|
| 61 |
+
style_encoder:
|
| 62 |
+
_target_: modules.campplus.DTDNN.CAMPPlus
|
| 63 |
+
feat_dim: 80
|
| 64 |
+
embedding_size: 192
|
| 65 |
+
content_extractor_narrow:
|
| 66 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 67 |
+
tokenizer_name: "openai/whisper-small"
|
| 68 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 69 |
+
ssl_output_layer: 18
|
| 70 |
+
skip_ssl: true
|
| 71 |
+
encoder: &bottleneck_encoder
|
| 72 |
+
_target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
|
| 73 |
+
dim: 512
|
| 74 |
+
num_blocks: 12
|
| 75 |
+
intermediate_dim: 1536
|
| 76 |
+
dilation: 1
|
| 77 |
+
input_dim: 1024
|
| 78 |
+
quantizer:
|
| 79 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 80 |
+
codebook_size: 32 # codebook size, must be a power of 2
|
| 81 |
+
dim: 512
|
| 82 |
+
entropy_loss_weight: 0.1
|
| 83 |
+
diversity_gamma: 1.0
|
| 84 |
+
spherical: True
|
| 85 |
+
enable_entropy_loss: True
|
| 86 |
+
soft_entropy_loss: True
|
| 87 |
+
content_extractor_wide:
|
| 88 |
+
_target_: modules.astral_quantization.default_model.AstralQuantizer
|
| 89 |
+
tokenizer_name: "openai/whisper-small"
|
| 90 |
+
ssl_model_name: "facebook/hubert-large-ll60k"
|
| 91 |
+
ssl_output_layer: 18
|
| 92 |
+
encoder: *bottleneck_encoder
|
| 93 |
+
quantizer:
|
| 94 |
+
_target_: modules.astral_quantization.bsq.BinarySphericalQuantize
|
| 95 |
+
codebook_size: 2048 # codebook size, must be a power of 2
|
| 96 |
+
dim: 512
|
| 97 |
+
entropy_loss_weight: 0.1
|
| 98 |
+
diversity_gamma: 1.0
|
| 99 |
+
spherical: True
|
| 100 |
+
enable_entropy_loss: True
|
| 101 |
+
soft_entropy_loss: True
|
| 102 |
+
vocoder:
|
| 103 |
+
_target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
|
| 104 |
+
pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
|
| 105 |
+
use_cuda_kernel: false
|