narySt committed on
Commit
a88ed6c
·
verified ·
1 Parent(s): f1a6375

Upload folder using huggingface_hub

Browse files
russian_train_1/vc_wrapper.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: modules.v2.vc_wrapper.VoiceConversionWrapper
2
+ sr: 22050
3
+ hop_size: 256
4
+ mel_fn:
5
+ _target_: modules.audio.mel_spectrogram
6
+ _partial_: true
7
+ n_fft: 1024
8
+ win_size: 1024
9
+ hop_size: 256
10
+ num_mels: 80
11
+ sampling_rate: 22050
12
+ fmin: 0
13
+ fmax: null
14
+ center: False
15
+ cfm:
16
+ _target_: modules.v2.cfm.CFM
17
+ estimator:
18
+ _target_: modules.v2.dit_wrapper.DiT
19
+ time_as_token: true
20
+ style_as_token: true
21
+ uvit_skip_connection: false
22
+ block_size: 8192
23
+ depth: 13
24
+ num_heads: 8
25
+ hidden_dim: 512
26
+ in_channels: 80
27
+ content_dim: 512
28
+ style_encoder_dim: 192
29
+ class_dropout_prob: 0.1
30
+ dropout_rate: 0.0
31
+ attn_dropout_rate: 0.0
32
+ cfm_length_regulator:
33
+ _target_: modules.v2.length_regulator.InterpolateRegulator
34
+ channels: 512
35
+ is_discrete: true
36
+ codebook_size: 2048
37
+ sampling_ratios: [ 1, 1, 1, 1 ]
38
+ f0_condition: false
39
+ ar:
40
+ _target_: modules.v2.ar.NaiveWrapper
41
+ model:
42
+ _target_: modules.v2.ar.NaiveTransformer
43
+ config:
44
+ _target_: modules.v2.ar.NaiveModelArgs
45
+ dropout: 0.0
46
+ rope_base: 10000.0
47
+ dim: 768
48
+ head_dim: 64
49
+ n_local_heads: 2
50
+ intermediate_size: 2304
51
+ n_head: 12
52
+ n_layer: 12
53
+ vocab_size: 2049 # 2048 codebook entries + 1 for eos
54
+ ar_length_regulator:
55
+ _target_: modules.v2.length_regulator.InterpolateRegulator
56
+ channels: 768
57
+ is_discrete: true
58
+ codebook_size: 32
59
+ sampling_ratios: [ ]
60
+ f0_condition: false
61
+ style_encoder:
62
+ _target_: modules.campplus.DTDNN.CAMPPlus
63
+ feat_dim: 80
64
+ embedding_size: 192
65
+ content_extractor_narrow:
66
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
67
+ tokenizer_name: "openai/whisper-small"
68
+ ssl_model_name: "facebook/hubert-large-ll60k"
69
+ ssl_output_layer: 18
70
+ skip_ssl: true
71
+ encoder: &bottleneck_encoder
72
+ _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
73
+ dim: 512
74
+ num_blocks: 12
75
+ intermediate_dim: 1536
76
+ dilation: 1
77
+ input_dim: 1024
78
+ quantizer:
79
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
80
+ codebook_size: 32 # codebook size, must be a power of 2
81
+ dim: 512
82
+ entropy_loss_weight: 0.1
83
+ diversity_gamma: 1.0
84
+ spherical: True
85
+ enable_entropy_loss: True
86
+ soft_entropy_loss: True
87
+ content_extractor_wide:
88
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
89
+ tokenizer_name: "openai/whisper-small"
90
+ ssl_model_name: "facebook/hubert-large-ll60k"
91
+ ssl_output_layer: 18
92
+ encoder: *bottleneck_encoder
93
+ quantizer:
94
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
95
+ codebook_size: 2048 # codebook size, must be a power of 2
96
+ dim: 512
97
+ entropy_loss_weight: 0.1
98
+ diversity_gamma: 1.0
99
+ spherical: True
100
+ enable_entropy_loss: True
101
+ soft_entropy_loss: True
102
+ vocoder:
103
+ _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
104
+ pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
105
+ use_cuda_kernel: false
russian_train_2/train.log ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Epoch 0, Iteration 0, Loss: 7.4123, Loss AR: 6.8261, Loss CFM: 0.5862, Grad Norm: 5.8548, LR: 0.000000
2
+ Epoch 0, Iteration 10, Loss: 7.1086, Loss AR: 6.5280, Loss CFM: 0.5806, Grad Norm: 7.3128, LR: 0.000020
3
+ Epoch 0, Iteration 20, Loss: 6.7630, Loss AR: 6.1732, Loss CFM: 0.5898, Grad Norm: 5.3300, LR: 0.000020
4
+ Epoch 0, Iteration 30, Loss: 6.6540, Loss AR: 6.0648, Loss CFM: 0.5893, Grad Norm: 7.9873, LR: 0.000020
5
+ Epoch 0, Iteration 40, Loss: 6.4128, Loss AR: 5.9118, Loss CFM: 0.5010, Grad Norm: 6.9926, LR: 0.000020
6
+ Epoch 0, Iteration 50, Loss: 6.3736, Loss AR: 5.8265, Loss CFM: 0.5471, Grad Norm: 6.1079, LR: 0.000020
7
+ Epoch 0, Iteration 60, Loss: 6.3835, Loss AR: 5.7620, Loss CFM: 0.6215, Grad Norm: 5.3433, LR: 0.000020
8
+ Epoch 0, Iteration 70, Loss: 6.3277, Loss AR: 5.7967, Loss CFM: 0.5310, Grad Norm: 5.4081, LR: 0.000020
9
+ Epoch 0, Iteration 80, Loss: 6.1539, Loss AR: 5.5468, Loss CFM: 0.6071, Grad Norm: 6.1189, LR: 0.000020
10
+ Epoch 0, Iteration 90, Loss: 6.1051, Loss AR: 5.5220, Loss CFM: 0.5830, Grad Norm: 5.2862, LR: 0.000020
11
+ Epoch 0, Iteration 100, Loss: 6.0231, Loss AR: 5.3957, Loss CFM: 0.6274, Grad Norm: 5.5014, LR: 0.000020
12
+ Epoch 0, Iteration 110, Loss: 5.9073, Loss AR: 5.3184, Loss CFM: 0.5889, Grad Norm: 6.4655, LR: 0.000020
13
+ Epoch 0, Iteration 120, Loss: 5.7812, Loss AR: 5.3116, Loss CFM: 0.4697, Grad Norm: 4.8613, LR: 0.000020
14
+ Epoch 0, Iteration 130, Loss: 5.7230, Loss AR: 5.2267, Loss CFM: 0.4963, Grad Norm: 4.9150, LR: 0.000020
15
+ Epoch 0, Iteration 140, Loss: 5.9952, Loss AR: 5.3835, Loss CFM: 0.6117, Grad Norm: 4.8557, LR: 0.000020
16
+ Epoch 0, Iteration 150, Loss: 5.7789, Loss AR: 5.1951, Loss CFM: 0.5839, Grad Norm: 5.9602, LR: 0.000020
17
+ Epoch 0, Iteration 160, Loss: 5.8195, Loss AR: 5.2459, Loss CFM: 0.5736, Grad Norm: 6.8558, LR: 0.000020
18
+ Epoch 0, Iteration 170, Loss: 5.6152, Loss AR: 5.0701, Loss CFM: 0.5452, Grad Norm: 4.2240, LR: 0.000020
19
+ Epoch 0, Iteration 180, Loss: 5.8292, Loss AR: 5.3408, Loss CFM: 0.4884, Grad Norm: 4.1121, LR: 0.000020
20
+ Epoch 0, Iteration 190, Loss: 6.0036, Loss AR: 5.3866, Loss CFM: 0.6170, Grad Norm: 6.9337, LR: 0.000020
21
+ Epoch 0, Iteration 200, Loss: 5.6125, Loss AR: 5.1298, Loss CFM: 0.4827, Grad Norm: 6.0064, LR: 0.000020
22
+ Epoch 0, Iteration 210, Loss: 5.8327, Loss AR: 5.2733, Loss CFM: 0.5593, Grad Norm: 5.4030, LR: 0.000020
23
+ Epoch 0, Iteration 220, Loss: 5.5699, Loss AR: 5.0621, Loss CFM: 0.5078, Grad Norm: 5.9616, LR: 0.000020
24
+ Epoch 0, Iteration 230, Loss: 5.8119, Loss AR: 5.1636, Loss CFM: 0.6483, Grad Norm: 8.4473, LR: 0.000020
25
+ Epoch 0, Iteration 240, Loss: 5.7326, Loss AR: 5.1655, Loss CFM: 0.5671, Grad Norm: 7.5353, LR: 0.000020
26
+ Epoch 0, Iteration 250, Loss: 5.7567, Loss AR: 5.2142, Loss CFM: 0.5425, Grad Norm: 5.6969, LR: 0.000020
27
+ Epoch 0, Iteration 260, Loss: 5.4423, Loss AR: 4.9438, Loss CFM: 0.4985, Grad Norm: 5.0690, LR: 0.000020
28
+ Epoch 0, Iteration 270, Loss: 5.6098, Loss AR: 5.0514, Loss CFM: 0.5585, Grad Norm: 5.7998, LR: 0.000020
29
+ Epoch 0, Iteration 280, Loss: 5.6351, Loss AR: 5.0776, Loss CFM: 0.5575, Grad Norm: 5.4499, LR: 0.000020
30
+ Epoch 0, Iteration 290, Loss: 5.6632, Loss AR: 5.0598, Loss CFM: 0.6034, Grad Norm: 5.3381, LR: 0.000020
31
+ Epoch 0, Iteration 300, Loss: 5.5276, Loss AR: 5.0196, Loss CFM: 0.5081, Grad Norm: 6.4791, LR: 0.000020
russian_train_2/vc_wrapper.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: modules.v2.vc_wrapper.VoiceConversionWrapper
2
+ sr: 22050
3
+ hop_size: 256
4
+ mel_fn:
5
+ _target_: modules.audio.mel_spectrogram
6
+ _partial_: true
7
+ n_fft: 1024
8
+ win_size: 1024
9
+ hop_size: 256
10
+ num_mels: 80
11
+ sampling_rate: 22050
12
+ fmin: 0
13
+ fmax: null
14
+ center: False
15
+ cfm:
16
+ _target_: modules.v2.cfm.CFM
17
+ estimator:
18
+ _target_: modules.v2.dit_wrapper.DiT
19
+ time_as_token: true
20
+ style_as_token: true
21
+ uvit_skip_connection: false
22
+ block_size: 8192
23
+ depth: 13
24
+ num_heads: 8
25
+ hidden_dim: 512
26
+ in_channels: 80
27
+ content_dim: 512
28
+ style_encoder_dim: 192
29
+ class_dropout_prob: 0.1
30
+ dropout_rate: 0.0
31
+ attn_dropout_rate: 0.0
32
+ cfm_length_regulator:
33
+ _target_: modules.v2.length_regulator.InterpolateRegulator
34
+ channels: 512
35
+ is_discrete: true
36
+ codebook_size: 2048
37
+ sampling_ratios: [ 1, 1, 1, 1 ]
38
+ f0_condition: false
39
+ ar:
40
+ _target_: modules.v2.ar.NaiveWrapper
41
+ model:
42
+ _target_: modules.v2.ar.NaiveTransformer
43
+ config:
44
+ _target_: modules.v2.ar.NaiveModelArgs
45
+ dropout: 0.0
46
+ rope_base: 10000.0
47
+ dim: 768
48
+ head_dim: 64
49
+ n_local_heads: 2
50
+ intermediate_size: 2304
51
+ n_head: 12
52
+ n_layer: 12
53
+ vocab_size: 2049 # 2048 codebook entries + 1 for eos
54
+ ar_length_regulator:
55
+ _target_: modules.v2.length_regulator.InterpolateRegulator
56
+ channels: 768
57
+ is_discrete: true
58
+ codebook_size: 32
59
+ sampling_ratios: [ ]
60
+ f0_condition: false
61
+ style_encoder:
62
+ _target_: modules.campplus.DTDNN.CAMPPlus
63
+ feat_dim: 80
64
+ embedding_size: 192
65
+ content_extractor_narrow:
66
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
67
+ tokenizer_name: "openai/whisper-small"
68
+ ssl_model_name: "facebook/hubert-large-ll60k"
69
+ ssl_output_layer: 18
70
+ skip_ssl: true
71
+ encoder: &bottleneck_encoder
72
+ _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
73
+ dim: 512
74
+ num_blocks: 12
75
+ intermediate_dim: 1536
76
+ dilation: 1
77
+ input_dim: 1024
78
+ quantizer:
79
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
80
+ codebook_size: 32 # codebook size, must be a power of 2
81
+ dim: 512
82
+ entropy_loss_weight: 0.1
83
+ diversity_gamma: 1.0
84
+ spherical: True
85
+ enable_entropy_loss: True
86
+ soft_entropy_loss: True
87
+ content_extractor_wide:
88
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
89
+ tokenizer_name: "openai/whisper-small"
90
+ ssl_model_name: "facebook/hubert-large-ll60k"
91
+ ssl_output_layer: 18
92
+ encoder: *bottleneck_encoder
93
+ quantizer:
94
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
95
+ codebook_size: 2048 # codebook size, must be a power of 2
96
+ dim: 512
97
+ entropy_loss_weight: 0.1
98
+ diversity_gamma: 1.0
99
+ spherical: True
100
+ enable_entropy_loss: True
101
+ soft_entropy_loss: True
102
+ vocoder:
103
+ _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
104
+ pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
105
+ use_cuda_kernel: false
russian_train_3/AR_epoch_00000_step_10000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb478588c963544c0e665841923f397eb44ca05eae421ad062c9603768e97750
3
+ size 333673060
russian_train_3/CFM_epoch_00000_step_10000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:699797b76af298ed84d4126b72535f9f4f7adf7f62bbed9e11bc748259bb02c2
3
+ size 352130283
russian_train_3/train.log ADDED
The diff for this file is too large to render. See raw diff
 
russian_train_3/vc_wrapper.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ _target_: modules.v2.vc_wrapper.VoiceConversionWrapper
2
+ sr: 22050
3
+ hop_size: 256
4
+ mel_fn:
5
+ _target_: modules.audio.mel_spectrogram
6
+ _partial_: true
7
+ n_fft: 1024
8
+ win_size: 1024
9
+ hop_size: 256
10
+ num_mels: 80
11
+ sampling_rate: 22050
12
+ fmin: 0
13
+ fmax: null
14
+ center: False
15
+ cfm:
16
+ _target_: modules.v2.cfm.CFM
17
+ estimator:
18
+ _target_: modules.v2.dit_wrapper.DiT
19
+ time_as_token: true
20
+ style_as_token: true
21
+ uvit_skip_connection: false
22
+ block_size: 8192
23
+ depth: 13
24
+ num_heads: 8
25
+ hidden_dim: 512
26
+ in_channels: 80
27
+ content_dim: 512
28
+ style_encoder_dim: 192
29
+ class_dropout_prob: 0.1
30
+ dropout_rate: 0.0
31
+ attn_dropout_rate: 0.0
32
+ cfm_length_regulator:
33
+ _target_: modules.v2.length_regulator.InterpolateRegulator
34
+ channels: 512
35
+ is_discrete: true
36
+ codebook_size: 2048
37
+ sampling_ratios: [ 1, 1, 1, 1 ]
38
+ f0_condition: false
39
+ ar:
40
+ _target_: modules.v2.ar.NaiveWrapper
41
+ model:
42
+ _target_: modules.v2.ar.NaiveTransformer
43
+ config:
44
+ _target_: modules.v2.ar.NaiveModelArgs
45
+ dropout: 0.0
46
+ rope_base: 10000.0
47
+ dim: 768
48
+ head_dim: 64
49
+ n_local_heads: 2
50
+ intermediate_size: 2304
51
+ n_head: 12
52
+ n_layer: 12
53
+ vocab_size: 2049 # 2048 codebook entries + 1 for eos
54
+ ar_length_regulator:
55
+ _target_: modules.v2.length_regulator.InterpolateRegulator
56
+ channels: 768
57
+ is_discrete: true
58
+ codebook_size: 32
59
+ sampling_ratios: [ ]
60
+ f0_condition: false
61
+ style_encoder:
62
+ _target_: modules.campplus.DTDNN.CAMPPlus
63
+ feat_dim: 80
64
+ embedding_size: 192
65
+ content_extractor_narrow:
66
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
67
+ tokenizer_name: "openai/whisper-small"
68
+ ssl_model_name: "facebook/hubert-large-ll60k"
69
+ ssl_output_layer: 18
70
+ skip_ssl: true
71
+ encoder: &bottleneck_encoder
72
+ _target_: modules.astral_quantization.convnext.ConvNeXtV2Stage
73
+ dim: 512
74
+ num_blocks: 12
75
+ intermediate_dim: 1536
76
+ dilation: 1
77
+ input_dim: 1024
78
+ quantizer:
79
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
80
+ codebook_size: 32 # codebook size, must be a power of 2
81
+ dim: 512
82
+ entropy_loss_weight: 0.1
83
+ diversity_gamma: 1.0
84
+ spherical: True
85
+ enable_entropy_loss: True
86
+ soft_entropy_loss: True
87
+ content_extractor_wide:
88
+ _target_: modules.astral_quantization.default_model.AstralQuantizer
89
+ tokenizer_name: "openai/whisper-small"
90
+ ssl_model_name: "facebook/hubert-large-ll60k"
91
+ ssl_output_layer: 18
92
+ encoder: *bottleneck_encoder
93
+ quantizer:
94
+ _target_: modules.astral_quantization.bsq.BinarySphericalQuantize
95
+ codebook_size: 2048 # codebook size, must be a power of 2
96
+ dim: 512
97
+ entropy_loss_weight: 0.1
98
+ diversity_gamma: 1.0
99
+ spherical: True
100
+ enable_entropy_loss: True
101
+ soft_entropy_loss: True
102
+ vocoder:
103
+ _target_: modules.bigvgan.bigvgan.BigVGAN.from_pretrained
104
+ pretrained_model_name_or_path: "nvidia/bigvgan_v2_22khz_80band_256x"
105
+ use_cuda_kernel: false