lainlives committed on
Commit
d95ab09
·
verified ·
1 Parent(s): d5256e6

Upload 38 files

Browse files
Files changed (34) hide show
  1. .gitattributes +18 -0
  2. rvc/embedders/custom/7_12_2048_WavLM/config.json +107 -0
  3. rvc/embedders/custom/7_12_2048_WavLM/last.ckpt +3 -0
  4. rvc/embedders/custom/7_12_2048_WavLM/pytorch_model.bin +3 -0
  5. rvc/embedders/custom/9_10_11_12_1024/config.json +71 -0
  6. rvc/embedders/custom/9_10_11_12_1024/pytorch_model.bin +3 -0
  7. rvc/embedders/custom/9_12_2048_WavLM/config.json +107 -0
  8. rvc/embedders/custom/9_12_2048_WavLM/last.ckpt +3 -0
  9. rvc/embedders/custom/9_12_2048_WavLM/pytorch_model.bin +3 -0
  10. rvc/embedders/custom/KLM-SVM-Embedder_RVC/KLM-SVM.bin +3 -0
  11. rvc/embedders/custom/KLM-SVM-Embedder_RVC/config.json +72 -0
  12. rvc/embedders/custom/KLM-VOCAL-Embedder_RVC/KLM-VOCAL.bin +3 -0
  13. rvc/embedders/custom/KLM-VOCAL-Embedder_RVC/config.json +72 -0
  14. rvc/embedders/custom/KLM-Vocal-X1/KLM-VOCAL-X1.bin +3 -0
  15. rvc/embedders/custom/KLM-Vocal-X1/config.json +122 -0
  16. rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768.ckpt +3 -0
  17. rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768_pytorch_model.bin +3 -0
  18. rvc/embedders/custom/SPIN_450H_FINETUNE_26768/config.json +71 -0
  19. rvc/embedders/custom/SPIN_450H_FINETUNE_26768/spin.yaml +91 -0
  20. rvc/embedders/custom/WAVLMPLUS-5-12/config.json +99 -0
  21. rvc/embedders/custom/WAVLMPLUS-5-12/preprocessor_config.json +9 -0
  22. rvc/embedders/custom/WAVLMPLUS-5-12/pytorch_model.bin +3 -0
  23. rvc/embedders/custom/spin-v2/config.json +72 -0
  24. rvc/embedders/custom/spin-v2/pytorch_model.bin +3 -0
  25. rvc/embedders/custom/spin/config.json +71 -0
  26. rvc/embedders/custom/spin/pytorch_model.bin +3 -0
  27. rvc/embedders/custom/spinv2_official/config.json +72 -0
  28. rvc/embedders/custom/spinv2_official/pytorch_model.bin +3 -0
  29. rvc/embedders/custom/spinv2_official/spinv2_epoch=15-step=7216.ckpt +3 -0
  30. rvc/embedders/custom/wavLM-SPIN-2048_11_12/config.json +107 -0
  31. rvc/embedders/custom/wavLM-SPIN-2048_11_12/pytorch_model.bin +3 -0
  32. rvc/embedders/custom/wavLM-SPIN-2048_11_12_updatedConfig/config.json +107 -0
  33. rvc/embedders/custom/wavLM-SPIN-2048_11_12_updatedConfig/pytorch_model.bin +3 -0
  34. rvc/embedders/custom/wavlm-ft.zip +3 -0
.gitattributes CHANGED
@@ -141,3 +141,21 @@ rvc/voice_models/LainV4/LainIwakura.pth filter=lfs diff=lfs merge=lfs -text
141
  rvc/voice_models/LainV4/added_IVF240_Flat_nprobe_1_LainIwakura_v2.index filter=lfs diff=lfs merge=lfs -text
142
  rvc/voice_models/Rick_Sanchez/RickSanchez.index filter=lfs diff=lfs merge=lfs -text
143
  rvc/voice_models/Rick_Sanchez/RickSanchez_365e_11315s.pth filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  rvc/voice_models/LainV4/added_IVF240_Flat_nprobe_1_LainIwakura_v2.index filter=lfs diff=lfs merge=lfs -text
142
  rvc/voice_models/Rick_Sanchez/RickSanchez.index filter=lfs diff=lfs merge=lfs -text
143
  rvc/voice_models/Rick_Sanchez/RickSanchez_365e_11315s.pth filter=lfs diff=lfs merge=lfs -text
144
+ rvc/embedders/custom/7_12_2048_WavLM/last.ckpt filter=lfs diff=lfs merge=lfs -text
145
+ rvc/embedders/custom/7_12_2048_WavLM/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
146
+ rvc/embedders/custom/9_10_11_12_1024/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
147
+ rvc/embedders/custom/9_12_2048_WavLM/last.ckpt filter=lfs diff=lfs merge=lfs -text
148
+ rvc/embedders/custom/9_12_2048_WavLM/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
149
+ rvc/embedders/custom/KLM-SVM-Embedder_RVC/KLM-SVM.bin filter=lfs diff=lfs merge=lfs -text
150
+ rvc/embedders/custom/KLM-VOCAL-Embedder_RVC/KLM-VOCAL.bin filter=lfs diff=lfs merge=lfs -text
151
+ rvc/embedders/custom/KLM-Vocal-X1/KLM-VOCAL-X1.bin filter=lfs diff=lfs merge=lfs -text
152
+ rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768_pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
153
+ rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768.ckpt filter=lfs diff=lfs merge=lfs -text
154
+ rvc/embedders/custom/spin-v2/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
155
+ rvc/embedders/custom/spin/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
156
+ rvc/embedders/custom/spinv2_official/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
157
+ rvc/embedders/custom/spinv2_official/spinv2_epoch=15-step=7216.ckpt filter=lfs diff=lfs merge=lfs -text
158
+ rvc/embedders/custom/wavlm-ft.zip filter=lfs diff=lfs merge=lfs -text
159
+ rvc/embedders/custom/wavLM-SPIN-2048_11_12_updatedConfig/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
160
+ rvc/embedders/custom/wavLM-SPIN-2048_11_12/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
161
+ rvc/embedders/custom/WAVLMPLUS-5-12/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
rvc/embedders/custom/7_12_2048_WavLM/config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "HubertModelWithFinalProj"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 256,
14
+ "contrastive_logits_temperature": 0.1,
15
+ "conv_bias": false,
16
+ "conv_dim": [
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512
24
+ ],
25
+ "conv_kernel": [
26
+ 10,
27
+ 3,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 2,
32
+ 2
33
+ ],
34
+ "conv_stride": [
35
+ 5,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2
42
+ ],
43
+ "ctc_loss_reduction": "mean",
44
+ "ctc_zero_infinity": false,
45
+ "diversity_loss_weight": 0.1,
46
+ "do_stable_layer_norm": false,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "feat_proj_layer_norm": true,
52
+ "final_dropout": 0.1,
53
+ "hidden_act": "gelu",
54
+ "hidden_dropout": 0.1,
55
+ "hidden_size": 768,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "layer_norm_eps": 1e-05,
59
+ "layerdrop": 0.1,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_masks": 2,
64
+ "mask_time_prob": 0.05,
65
+ "max_bucket_distance": 800,
66
+ "model_type": "wavlm",
67
+ "num_adapter_layers": 3,
68
+ "num_attention_heads": 12,
69
+ "num_buckets": 320,
70
+ "num_codevector_groups": 2,
71
+ "num_codevectors_per_group": 320,
72
+ "num_conv_pos_embedding_groups": 16,
73
+ "num_conv_pos_embeddings": 128,
74
+ "num_ctc_classes": 80,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 12,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 768,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 256,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.44.2",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 32,
106
+ "xvector_output_dim": 512
107
+ }
rvc/embedders/custom/7_12_2048_WavLM/last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3af9b07e24a9bce1d04865eff7d6fb84ef43dd09ac6c0591f0cb996bc304e94
3
+ size 726628553
rvc/embedders/custom/7_12_2048_WavLM/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec8918afd652e4d368fa0199c7f5e7bb58719bbacb7585957d544246d20d877
3
+ size 378356791
rvc/embedders/custom/9_10_11_12_1024/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.44.2",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
rvc/embedders/custom/9_10_11_12_1024/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7e8082c7eeeb9852c7d72dd3d7d27cef9e476cd9453dbfa19fd89e5ccfb36e9
3
+ size 378356791
rvc/embedders/custom/9_12_2048_WavLM/config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "HubertModelWithFinalProj"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 256,
14
+ "contrastive_logits_temperature": 0.1,
15
+ "conv_bias": false,
16
+ "conv_dim": [
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512
24
+ ],
25
+ "conv_kernel": [
26
+ 10,
27
+ 3,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 2,
32
+ 2
33
+ ],
34
+ "conv_stride": [
35
+ 5,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2
42
+ ],
43
+ "ctc_loss_reduction": "mean",
44
+ "ctc_zero_infinity": false,
45
+ "diversity_loss_weight": 0.1,
46
+ "do_stable_layer_norm": false,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "feat_proj_layer_norm": true,
52
+ "final_dropout": 0.1,
53
+ "hidden_act": "gelu",
54
+ "hidden_dropout": 0.1,
55
+ "hidden_size": 768,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "layer_norm_eps": 1e-05,
59
+ "layerdrop": 0.1,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_masks": 2,
64
+ "mask_time_prob": 0.05,
65
+ "max_bucket_distance": 800,
66
+ "model_type": "wavlm",
67
+ "num_adapter_layers": 3,
68
+ "num_attention_heads": 12,
69
+ "num_buckets": 320,
70
+ "num_codevector_groups": 2,
71
+ "num_codevectors_per_group": 320,
72
+ "num_conv_pos_embedding_groups": 16,
73
+ "num_conv_pos_embeddings": 128,
74
+ "num_ctc_classes": 80,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 12,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 768,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 256,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.44.2",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 32,
106
+ "xvector_output_dim": 512
107
+ }
rvc/embedders/custom/9_12_2048_WavLM/last.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:937ed8aec91711b8074060470c8c15504b02c6200d428df52eac255f8843afe3
3
+ size 613182509
rvc/embedders/custom/9_12_2048_WavLM/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f18898ec10d1f01ba2bd43565be3704f68a8c1c399a6d577dacf46d9c8eef0c
3
+ size 378356791
rvc/embedders/custom/KLM-SVM-Embedder_RVC/KLM-SVM.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:358a41c122828ed7dc82fb7449c55309031fc29d9d5af2e975de914ea021d994
3
+ size 378346807
rvc/embedders/custom/KLM-SVM-Embedder_RVC/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "D:\\Codex\\klm-trainer\\engine\\rvc\\models\\embedders\\contentvec",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertModelWithFinalProj"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 3072,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_feature_length": 10,
56
+ "mask_feature_min_masks": 0,
57
+ "mask_feature_prob": 0.0,
58
+ "mask_time_length": 10,
59
+ "mask_time_min_masks": 2,
60
+ "mask_time_prob": 0.05,
61
+ "model_type": "hubert",
62
+ "num_attention_heads": 12,
63
+ "num_conv_pos_embedding_groups": 16,
64
+ "num_conv_pos_embeddings": 128,
65
+ "num_feat_extract_layers": 7,
66
+ "num_hidden_layers": 12,
67
+ "pad_token_id": 0,
68
+ "torch_dtype": "float32",
69
+ "transformers_version": "4.44.2",
70
+ "use_weighted_layer_sum": false,
71
+ "vocab_size": 32
72
+ }
rvc/embedders/custom/KLM-VOCAL-Embedder_RVC/KLM-VOCAL.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35b1391ea0370f1cc345fbb404d00afba5f86d869ef1c18e84b9b1cd8a5852d5
3
+ size 378346807
rvc/embedders/custom/KLM-VOCAL-Embedder_RVC/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "D:\\Codex\\klm-trainer\\engine\\rvc\\models\\embedders\\contentvec",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertModelWithFinalProj"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 3072,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_feature_length": 10,
56
+ "mask_feature_min_masks": 0,
57
+ "mask_feature_prob": 0.0,
58
+ "mask_time_length": 10,
59
+ "mask_time_min_masks": 2,
60
+ "mask_time_prob": 0.05,
61
+ "model_type": "hubert",
62
+ "num_attention_heads": 12,
63
+ "num_conv_pos_embedding_groups": 16,
64
+ "num_conv_pos_embeddings": 128,
65
+ "num_feat_extract_layers": 7,
66
+ "num_hidden_layers": 12,
67
+ "pad_token_id": 0,
68
+ "torch_dtype": "float32",
69
+ "transformers_version": "4.44.2",
70
+ "use_weighted_layer_sum": false,
71
+ "vocab_size": 32
72
+ }
rvc/embedders/custom/KLM-Vocal-X1/KLM-VOCAL-X1.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c81d1f004c5567441f39b09ed58899066af6ba4439b48b33521ad2e12e7360b
3
+ size 378399563
rvc/embedders/custom/KLM-Vocal-X1/config.json ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "D:\\Codex\\KLM-HF_V044\\engine\\rvc\\models\\embedders\\wavlm_base_plus",
3
+ "activation_dropout": 0.0,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "WavLMModelWithFinalProj"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.1,
52
+ "feat_quantizer_dropout": 0.0,
53
+ "final_dropout": 0.0,
54
+ "freeze_feat_extract_train": true,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 3072,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_channel_length": 10,
63
+ "mask_channel_min_space": 1,
64
+ "mask_channel_other": 0.0,
65
+ "mask_channel_prob": 0.0,
66
+ "mask_channel_selection": "static",
67
+ "mask_feature_length": 10,
68
+ "mask_feature_min_masks": 0,
69
+ "mask_feature_prob": 0.0,
70
+ "mask_time_length": 10,
71
+ "mask_time_min_masks": 2,
72
+ "mask_time_min_space": 1,
73
+ "mask_time_other": 0.0,
74
+ "mask_time_prob": 0.05,
75
+ "mask_time_selection": "static",
76
+ "max_bucket_distance": 800,
77
+ "model_type": "wavlm",
78
+ "no_mask_channel_overlap": false,
79
+ "no_mask_time_overlap": false,
80
+ "num_adapter_layers": 3,
81
+ "num_attention_heads": 12,
82
+ "num_buckets": 320,
83
+ "num_codevector_groups": 2,
84
+ "num_codevectors_per_group": 320,
85
+ "num_conv_pos_embedding_groups": 16,
86
+ "num_conv_pos_embeddings": 128,
87
+ "num_ctc_classes": 80,
88
+ "num_feat_extract_layers": 7,
89
+ "num_hidden_layers": 12,
90
+ "num_negatives": 100,
91
+ "output_hidden_size": 768,
92
+ "pad_token_id": 0,
93
+ "proj_codevector_dim": 256,
94
+ "replace_prob": 0.5,
95
+ "tdnn_dilation": [
96
+ 1,
97
+ 2,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "tdnn_dim": [
103
+ 512,
104
+ 512,
105
+ 512,
106
+ 512,
107
+ 1500
108
+ ],
109
+ "tdnn_kernel": [
110
+ 5,
111
+ 3,
112
+ 3,
113
+ 1,
114
+ 1
115
+ ],
116
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
117
+ "torch_dtype": "float32",
118
+ "transformers_version": "4.44.2",
119
+ "use_weighted_layer_sum": false,
120
+ "vocab_size": 32,
121
+ "xvector_output_dim": 512
122
+ }
rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99ebed0cf9316a4cbbeae9e3b2661fb01621d7d6ebe0a6c2898a2e72823042ed
3
+ size 500185456
rvc/embedders/custom/SPIN_450H_FINETUNE_26768/SPIN_450H_FINETUNE_26768_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2980f97e9b6ca0bf2d12afabf2f2643f7408991081ed34df58bd4af1da956c08
3
+ size 378356318
rvc/embedders/custom/SPIN_450H_FINETUNE_26768/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.44.2",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
rvc/embedders/custom/SPIN_450H_FINETUNE_26768/spin.yaml ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Interspeech 2023 version
2
+
3
+ # Training data
4
+ data:
5
+ json_dir: YOUR DATA FOLDER
6
+ splits:
7
+ - train-clean-100
8
+ - train-clean-360
9
+ sample_rate: 16000
10
+ min_audio_len: 40000 # minimum audio samples per utterance
11
+ random_crop_len: 272000 # maximum audio samples per utterance
12
+ spk2info: YOUR SPK2INFO FILE
13
+
14
+ # Validation data (not used for checkpointing, just for monitoring training progress)
15
+ val_data:
16
+ json_dir: YOUR DATA FOLDER
17
+ phn_dir: YOUR DATA FOLDER
18
+ splits:
19
+ - dev-clean
20
+ - dev-other
21
+ sample_rate: 16000
22
+
23
+ # SpinModel config
24
+ model:
25
+ encoder:
26
+ type: HuBERT # `HuBERT` / `WavLM`
27
+ use_layer: 12 # the layer whose representations are used for clustering
28
+ normalize: False
29
+ feat_select: x
30
+ randomize_all: False
31
+ randomize_layers: []
32
+ freeze_all: False
33
+ freeze_layers: ["pos", 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] # `pos`: positional encoding, `0`: CNN extractor
34
+ pred_head:
35
+ type: DNN
36
+ hid_dims: [256]
37
+ dropout: 0
38
+ activation: ReLU
39
+ loss:
40
+ type: SwavVQDisentangle
41
+ num_vars: 2048 # cluster size
42
+ epsilon: 0.02
43
+ sinkhorn_iters: 3
44
+ temp: 0.1
45
+ l2_norm: True
46
+ prob_ratio: 1.0
47
+
48
+ # Optimization
49
+ optim:
50
+ optimizer:
51
+ name: Adam
52
+ args:
53
+ lr: 1.e-4
54
+ weight_decay: 1.e-6
55
+ scheduler:
56
+ name: linear_warmup_decay # `linear_warmup_decay` / `linear_warmup_cosine_scheduler` / `noam_scheduler`
57
+ args:
58
+ warmup: 8365 # 1/4 of max_step (33460 / 4)
59
+ max_step: 33460 # 5 epochs (5 x 6692 steps per epoch)
60
+ final_lr: 1.e-6
61
+
62
+ hparam:
63
+ batch_len: 4096000 # audio samples per GPU (256 secs ~ batch_size = 12.8k)
64
+ val_batch_size: 8
65
+
66
+ # pytorch_lightning.Trainer
67
+ # ref: https://lightning.ai/docs/pytorch/latest/common/trainer.html
68
+ trainer:
69
+ max_steps: 33460 # 5 epochs (5 x 6692 steps per epoch)
70
+ gradient_clip_val: 10
71
+ accumulate_grad_batches: 1
72
+ precision: 16
73
+ logger: wandb # use `False` to disable logging
74
+ log_every_n_steps: 100
75
+ default_root_dir: exp/tmp
76
+ accelerator: gpu
77
+ # strategy: ddp # uncomment this line to enable DDP training
78
+ num_sanity_val_steps: 0
79
+ val_check_interval: 1000
80
+
81
+ # pytorch_lightning.callbacks.ModelCheckpoint
82
+ # ref: https://lightning.ai/docs/pytorch/latest/api/lightning.pytorch.callbacks.ModelCheckpoint.html
83
+ checkpoint:
84
+ filename: "{epoch}-{step}"
85
+ every_n_train_steps: 6692 # This is 1 epoch for new set
86
+ save_last: true
87
+
88
+ # pytorch_lightning.loggers.WandbLogger
89
+ # ref: https://lightning.ai/docs/pytorch/latest/extensions/generated/lightning.pytorch.loggers.WandbLogger.html
90
+ logger:
91
+ project: spin_is2023
rvc/embedders/custom/WAVLMPLUS-5-12/config.json ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "wavlm-base-plus",
3
+ "activation_dropout": 0.0,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "WavLMModel"
10
+ ],
11
+ "attention_dropout": 0.1,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 256,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": false,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "sum",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": false,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_norm": "group",
51
+ "feat_proj_dropout": 0.1,
52
+ "feat_quantizer_dropout": 0.0,
53
+ "final_dropout": 0.0,
54
+ "freeze_feat_extract_train": true,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.1,
57
+ "hidden_size": 768,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 3072,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_channel_length": 10,
63
+ "mask_channel_min_space": 1,
64
+ "mask_channel_other": 0.0,
65
+ "mask_channel_prob": 0.0,
66
+ "mask_channel_selection": "static",
67
+ "mask_feature_length": 10,
68
+ "mask_feature_min_masks": 0,
69
+ "mask_feature_prob": 0.0,
70
+ "mask_time_length": 10,
71
+ "mask_time_min_masks": 2,
72
+ "mask_time_min_space": 1,
73
+ "mask_time_other": 0.0,
74
+ "mask_time_prob": 0.05,
75
+ "mask_time_selection": "static",
76
+ "model_type": "wavlm",
77
+ "no_mask_channel_overlap": false,
78
+ "no_mask_time_overlap": false,
79
+ "num_adapter_layers": 3,
80
+ "num_attention_heads": 12,
81
+ "num_buckets": 320,
82
+ "num_codevector_groups": 2,
83
+ "num_codevectors_per_group": 320,
84
+ "num_conv_pos_embedding_groups": 16,
85
+ "num_conv_pos_embeddings": 128,
86
+ "num_ctc_classes": 80,
87
+ "num_feat_extract_layers": 7,
88
+ "num_hidden_layers": 12,
89
+ "num_negatives": 100,
90
+ "output_hidden_size": 768,
91
+ "pad_token_id": 0,
92
+ "proj_codevector_dim": 256,
93
+ "replace_prob": 0.5,
94
+ "torch_dtype": "float32",
95
+ "transformers_version": "4.13.0.dev0",
96
+ "use_weighted_layer_sum": false,
97
+ "vocab_size": 32,
98
+ "tokenizer_class": "Wav2Vec2CTCTokenizer"
99
+ }
rvc/embedders/custom/WAVLMPLUS-5-12/preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": false,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
rvc/embedders/custom/WAVLMPLUS-5-12/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0f45914d34b60b82127bf5b4b2ab2600f3796c493cc671c6333458002d409da
3
+ size 378356791
rvc/embedders/custom/spin-v2/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_pos_batch_norm": false,
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 3072,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_feature_length": 10,
56
+ "mask_feature_min_masks": 0,
57
+ "mask_feature_prob": 0.0,
58
+ "mask_time_length": 10,
59
+ "mask_time_min_masks": 2,
60
+ "mask_time_prob": 0.05,
61
+ "model_type": "hubert",
62
+ "num_attention_heads": 12,
63
+ "num_conv_pos_embedding_groups": 16,
64
+ "num_conv_pos_embeddings": 128,
65
+ "num_feat_extract_layers": 7,
66
+ "num_hidden_layers": 12,
67
+ "pad_token_id": 0,
68
+ "torch_dtype": "float32",
69
+ "transformers_version": "4.55.0",
70
+ "use_weighted_layer_sum": false,
71
+ "vocab_size": 32
72
+ }
rvc/embedders/custom/spin-v2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9ac0be326057b17607a988be497793817f8274e987cf691a1b61192510f823
3
+ size 378356791
rvc/embedders/custom/spin/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_stride": [
30
+ 5,
31
+ 2,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2
37
+ ],
38
+ "ctc_loss_reduction": "sum",
39
+ "ctc_zero_infinity": false,
40
+ "do_stable_layer_norm": false,
41
+ "eos_token_id": 2,
42
+ "feat_extract_activation": "gelu",
43
+ "feat_extract_norm": "group",
44
+ "feat_proj_dropout": 0.0,
45
+ "feat_proj_layer_norm": true,
46
+ "final_dropout": 0.1,
47
+ "hidden_act": "gelu",
48
+ "hidden_dropout": 0.1,
49
+ "hidden_size": 768,
50
+ "initializer_range": 0.02,
51
+ "intermediate_size": 3072,
52
+ "layer_norm_eps": 1e-05,
53
+ "layerdrop": 0.1,
54
+ "mask_feature_length": 10,
55
+ "mask_feature_min_masks": 0,
56
+ "mask_feature_prob": 0.0,
57
+ "mask_time_length": 10,
58
+ "mask_time_min_masks": 2,
59
+ "mask_time_prob": 0.05,
60
+ "model_type": "hubert",
61
+ "num_attention_heads": 12,
62
+ "num_conv_pos_embedding_groups": 16,
63
+ "num_conv_pos_embeddings": 128,
64
+ "num_feat_extract_layers": 7,
65
+ "num_hidden_layers": 12,
66
+ "pad_token_id": 0,
67
+ "torch_dtype": "float32",
68
+ "transformers_version": "4.44.2",
69
+ "use_weighted_layer_sum": false,
70
+ "vocab_size": 32
71
+ }
rvc/embedders/custom/spin/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:057f12bfda54e2d486d86a52a3beb2a07c96a888bc6ac0c382c12ac18dbd500c
3
+ size 378356791
rvc/embedders/custom/spinv2_official/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "apply_spec_augment": true,
4
+ "architectures": [
5
+ "HubertModelWithFinalProj"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "bos_token_id": 1,
9
+ "classifier_proj_size": 256,
10
+ "conv_bias": false,
11
+ "conv_dim": [
12
+ 512,
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512
19
+ ],
20
+ "conv_kernel": [
21
+ 10,
22
+ 3,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 2,
27
+ 2
28
+ ],
29
+ "conv_pos_batch_norm": false,
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_norm": "group",
45
+ "feat_proj_dropout": 0.0,
46
+ "feat_proj_layer_norm": true,
47
+ "final_dropout": 0.1,
48
+ "hidden_act": "gelu",
49
+ "hidden_dropout": 0.1,
50
+ "hidden_size": 768,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 3072,
53
+ "layer_norm_eps": 1e-05,
54
+ "layerdrop": 0.1,
55
+ "mask_feature_length": 10,
56
+ "mask_feature_min_masks": 0,
57
+ "mask_feature_prob": 0.0,
58
+ "mask_time_length": 10,
59
+ "mask_time_min_masks": 2,
60
+ "mask_time_prob": 0.05,
61
+ "model_type": "hubert",
62
+ "num_attention_heads": 12,
63
+ "num_conv_pos_embedding_groups": 16,
64
+ "num_conv_pos_embeddings": 128,
65
+ "num_feat_extract_layers": 7,
66
+ "num_hidden_layers": 12,
67
+ "pad_token_id": 0,
68
+ "torch_dtype": "float32",
69
+ "transformers_version": "4.55.0",
70
+ "use_weighted_layer_sum": false,
71
+ "vocab_size": 32
72
+ }
rvc/embedders/custom/spinv2_official/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a9ac0be326057b17607a988be497793817f8274e987cf691a1b61192510f823
3
+ size 378356791
rvc/embedders/custom/spinv2_official/spinv2_epoch=15-step=7216.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e0344e316aee5a2b63dcb2de643d686271974b5bc523dfdc74c8fc079e79765
3
+ size 837338557
rvc/embedders/custom/wavLM-SPIN-2048_11_12/config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "HubertModelWithFinalProj"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 256,
14
+ "contrastive_logits_temperature": 0.1,
15
+ "conv_bias": false,
16
+ "conv_dim": [
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512
24
+ ],
25
+ "conv_kernel": [
26
+ 10,
27
+ 3,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 2,
32
+ 2
33
+ ],
34
+ "conv_stride": [
35
+ 5,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2
42
+ ],
43
+ "ctc_loss_reduction": "mean",
44
+ "ctc_zero_infinity": false,
45
+ "diversity_loss_weight": 0.1,
46
+ "do_stable_layer_norm": false,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "feat_proj_layer_norm": true,
52
+ "final_dropout": 0.1,
53
+ "hidden_act": "gelu",
54
+ "hidden_dropout": 0.1,
55
+ "hidden_size": 768,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "layer_norm_eps": 1e-05,
59
+ "layerdrop": 0.1,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_masks": 2,
64
+ "mask_time_prob": 0.05,
65
+ "max_bucket_distance": 800,
66
+ "model_type": "wavlm",
67
+ "num_adapter_layers": 3,
68
+ "num_attention_heads": 12,
69
+ "num_buckets": 320,
70
+ "num_codevector_groups": 2,
71
+ "num_codevectors_per_group": 320,
72
+ "num_conv_pos_embedding_groups": 16,
73
+ "num_conv_pos_embeddings": 128,
74
+ "num_ctc_classes": 80,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 12,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 768,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 256,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.44.2",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 32,
106
+ "xvector_output_dim": 512
107
+ }
rvc/embedders/custom/wavLM-SPIN-2048_11_12/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978162a5049473a8f9d0b4ddc5d98d229c08b5c32040e0888fe275099515a0eb
3
+ size 378356791
rvc/embedders/custom/wavLM-SPIN-2048_11_12_updatedConfig/config.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.1,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "HubertModelWithFinalProj"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 256,
14
+ "contrastive_logits_temperature": 0.1,
15
+ "conv_bias": false,
16
+ "conv_dim": [
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512
24
+ ],
25
+ "conv_kernel": [
26
+ 10,
27
+ 3,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 2,
32
+ 2
33
+ ],
34
+ "conv_stride": [
35
+ 5,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2
42
+ ],
43
+ "ctc_loss_reduction": "mean",
44
+ "ctc_zero_infinity": false,
45
+ "diversity_loss_weight": 0.1,
46
+ "do_stable_layer_norm": false,
47
+ "eos_token_id": 2,
48
+ "feat_extract_activation": "gelu",
49
+ "feat_extract_norm": "group",
50
+ "feat_proj_dropout": 0.0,
51
+ "feat_proj_layer_norm": true,
52
+ "final_dropout": 0.1,
53
+ "hidden_act": "gelu",
54
+ "hidden_dropout": 0.1,
55
+ "hidden_size": 768,
56
+ "initializer_range": 0.02,
57
+ "intermediate_size": 3072,
58
+ "layer_norm_eps": 1e-05,
59
+ "layerdrop": 0.1,
60
+ "mask_feature_length": 10,
61
+ "mask_feature_prob": 0.0,
62
+ "mask_time_length": 10,
63
+ "mask_time_min_masks": 2,
64
+ "mask_time_prob": 0.05,
65
+ "max_bucket_distance": 800,
66
+ "model_type": "wavlm",
67
+ "num_adapter_layers": 3,
68
+ "num_attention_heads": 12,
69
+ "num_buckets": 320,
70
+ "num_codevector_groups": 2,
71
+ "num_codevectors_per_group": 320,
72
+ "num_conv_pos_embedding_groups": 16,
73
+ "num_conv_pos_embeddings": 128,
74
+ "num_ctc_classes": 80,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 12,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 768,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 256,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.44.2",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 32,
106
+ "xvector_output_dim": 512
107
+ }
rvc/embedders/custom/wavLM-SPIN-2048_11_12_updatedConfig/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:978162a5049473a8f9d0b4ddc5d98d229c08b5c32040e0888fe275099515a0eb
3
+ size 378356791
rvc/embedders/custom/wavlm-ft.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bab5e063c0d987821b0c323af4c86d656382b8a6e127f6d637000b29d2a55e3
3
+ size 298471330