up
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- AF/config.json +114 -0
- AJU_YM/config.json +115 -0
- AK/config.json +121 -0
- AKS/config.json +114 -0
- AKS2/config.json +115 -0
- AKT/config.json +115 -0
- AKY/config.json +114 -0
- AN/config.json +115 -0
- AS/config.json +121 -0
- AS2/config.json +121 -0
- AS3/config.json +121 -0
- AT/config.json +114 -0
- ATN/config.json +115 -0
- AU/config.json +121 -0
- AY/config.json +121 -0
- FI/config.json +114 -0
- HM/config.json +115 -0
- HS/config.json +114 -0
- HT/config.json +115 -0
- IH/config.json +115 -0
- IM/config.json +121 -0
- IM2/config.json +121 -0
- KH/config.json +114 -0
- KHN/config.json +115 -0
- KI/config.json +115 -0
- KI2/config.json +115 -0
- KIN/config.json +115 -0
- KK/config.json +115 -0
- KY/config.json +121 -0
- M/config.json +115 -0
- MCC/config.json +115 -0
- MF/config.json +114 -0
- MH/config.json +115 -0
- MHY/config.json +115 -0
- MI/config.json +114 -0
- MIW/config.json +114 -0
- MK/config.json +114 -0
- MKB/config.json +121 -0
- MS/config.json +114 -0
- MT/config.json +115 -0
- NH/config.json +115 -0
- NHR/config.json +115 -0
- NO/config.json +115 -0
- NS/config.json +116 -1
- RK/config.json +115 -0
- RS/config.json +115 -0
- RT/config.json +121 -0
- RT2/config.json +121 -0
- RTN/config.json +115 -0
- RU/config.json +114 -0
AF/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "3KD3",
|
| 3 |
"sort": 19,
|
| 4 |
"name": "AF",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/af-epoch25-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/af-epoch25-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"af-epoch25-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "3KD3",
|
| 117 |
"sort": 19,
|
| 118 |
"name": "AF",
|
AJU_YM/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "X59B",
|
| 3 |
"sort": 57,
|
| 4 |
"name": "AJU_YM",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\aju_ym-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\aju_ym-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"aju_ym-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "X59B",
|
| 118 |
"sort": 57,
|
| 119 |
"name": "AJU_YM",
|
AK/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "NSO5",
|
| 3 |
"sort": 5,
|
| 4 |
"name": "AK",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 20,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ak-epoch20-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ak-epoch20-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ak-epoch20-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "NSO5",
|
| 124 |
"sort": 5,
|
| 125 |
"name": "AK",
|
AKS/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "356k",
|
| 3 |
"sort": 16,
|
| 4 |
"name": "AKS",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 20,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/aks-epoch20-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/aks-epoch20-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"aks-epoch20-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "356k",
|
| 117 |
"sort": 16,
|
| 118 |
"name": "AKS",
|
AKS2/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "afWH",
|
| 3 |
"sort": 42,
|
| 4 |
"name": "AKS2",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\aks2-epoch25-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\aks2-epoch25-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"aks2-epoch25-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "afWH",
|
| 118 |
"sort": 42,
|
| 119 |
"name": "AKS2",
|
AKT/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "rLOK",
|
| 3 |
"sort": 58,
|
| 4 |
"name": "AKT",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 17,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\akt-epoch17-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\akt-epoch17-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"akt-epoch17-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "rLOK",
|
| 118 |
"sort": 58,
|
| 119 |
"name": "AKT",
|
AKY/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "OKsC",
|
| 3 |
"sort": 7,
|
| 4 |
"name": "AKY",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/aky-epoch50-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/aky-epoch50-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"aky-epoch50-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "OKsC",
|
| 117 |
"sort": 7,
|
| 118 |
"name": "AKY",
|
AN/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "9a5o",
|
| 3 |
"sort": 32,
|
| 4 |
"name": "AN",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\an-epoch33-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\an-epoch33-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"an-epoch33-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "9a5o",
|
| 118 |
"sort": 32,
|
| 119 |
"name": "AN",
|
AS/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "E7VR",
|
| 3 |
"sort": 64,
|
| 4 |
"name": "AS",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\as-epoch25-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\as-epoch25-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"as-epoch25-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "E7VR",
|
| 124 |
"sort": 64,
|
| 125 |
"name": "AS",
|
AS2/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "O8vN",
|
| 3 |
"sort": 65,
|
| 4 |
"name": "AS2",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\as2-epoch33-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\as2-epoch33-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"as2-epoch33-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "O8vN",
|
| 124 |
"sort": 65,
|
| 125 |
"name": "AS2",
|
AS3/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "f8Op",
|
| 3 |
"sort": 66,
|
| 4 |
"name": "AS3",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\as3-epoch50-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\as3-epoch50-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"as3-epoch50-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "f8Op",
|
| 124 |
"sort": 66,
|
| 125 |
"name": "AS3",
|
AT/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "cDSg",
|
| 3 |
"sort": 11,
|
| 4 |
"name": "AT",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 14,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/at-epoch14-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/at-epoch14-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"at-epoch14-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "cDSg",
|
| 117 |
"sort": 11,
|
| 118 |
"name": "AT",
|
ATN/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "q0cL",
|
| 3 |
"sort": 27,
|
| 4 |
"name": "ATN",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data/atn-epoch10-1.0.0/train.list",
|
| 34 |
+
"validation_files": "Data/atn-epoch10-1.0.0/val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"atn-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.3.1-JP-Extra",
|
| 117 |
"id": "q0cL",
|
| 118 |
"sort": 27,
|
| 119 |
"name": "ATN",
|
AU/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "O5Jt",
|
| 3 |
"sort": 3,
|
| 4 |
"name": "AU",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\au-epoch33-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\au-epoch33-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"au-epoch33-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "O5Jt",
|
| 124 |
"sort": 3,
|
| 125 |
"name": "AU",
|
AY/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "4Qs7",
|
| 3 |
"sort": 33,
|
| 4 |
"name": "AY",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ay3-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ay3-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ay3-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "4Qs7",
|
| 124 |
"sort": 33,
|
| 125 |
"name": "AY",
|
FI/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "vcL0",
|
| 3 |
"sort": 24,
|
| 4 |
"name": "FI",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 100,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/fi-epoch100-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/fi-epoch100-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"fi-epoch100-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "vcL0",
|
| 117 |
"sort": 24,
|
| 118 |
"name": "FI",
|
HM/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "KcGx",
|
| 3 |
"sort": 62,
|
| 4 |
"name": "HM",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\hm-epoch33-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\hm-epoch33-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"hm-epoch33-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "KcGx",
|
| 118 |
"sort": 62,
|
| 119 |
"name": "HM",
|
HS/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "u3KN",
|
| 3 |
"sort": 6,
|
| 4 |
"name": "HS",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/hs-epoch50-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/hs-epoch50-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"hs-epoch50-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "u3KN",
|
| 117 |
"sort": 6,
|
| 118 |
"name": "HS",
|
HT/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "Ac0c",
|
| 3 |
"sort": 36,
|
| 4 |
"name": "HT",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ht-epoch50-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ht-epoch50-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ht-epoch50-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "Ac0c",
|
| 118 |
"sort": 36,
|
| 119 |
"name": "HT",
|
IH/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "jK1l",
|
| 3 |
"sort": 29,
|
| 4 |
"name": "IH",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ih-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ih-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ih-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "jK1l",
|
| 118 |
"sort": 29,
|
| 119 |
"name": "IH",
|
IM/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "rY5k",
|
| 3 |
"sort": 4,
|
| 4 |
"name": "IM",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\im-epoch33-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\im-epoch33-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"im-epoch33-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "rY5k",
|
| 124 |
"sort": 4,
|
| 125 |
"name": "IM",
|
IM2/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "OY15",
|
| 3 |
"sort": 55,
|
| 4 |
"name": "IM2",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\im2-epoch33-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\im2-epoch33-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"im2-epoch33-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "OY15",
|
| 124 |
"sort": 55,
|
| 125 |
"name": "IM2",
|
KH/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "xg2y",
|
| 3 |
"sort": 13,
|
| 4 |
"name": "KH",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/kh-epoch25-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/kh-epoch25-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"kh-epoch25-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "xg2y",
|
| 117 |
"sort": 13,
|
| 118 |
"name": "KH",
|
KHN/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "diDP",
|
| 3 |
"sort": 63,
|
| 4 |
"name": "KHN",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 12,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\khn-epoch12-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\khn-epoch12-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"khn-epoch12-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "diDP",
|
| 118 |
"sort": 63,
|
| 119 |
"name": "KHN",
|
KI/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "009j",
|
| 3 |
"sort": 28,
|
| 4 |
"name": "KI",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 20,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ki-epoch20-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ki-epoch20-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ki-epoch20-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "009j",
|
| 118 |
"sort": 28,
|
| 119 |
"name": "KI",
|
KI2/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "y5g6",
|
| 3 |
"sort": 43,
|
| 4 |
"name": "KI2",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ki2-epoch33-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ki2-epoch33-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ki2-epoch33-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "y5g6",
|
| 118 |
"sort": 43,
|
| 119 |
"name": "KI2",
|
KIN/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "u7DJ",
|
| 3 |
"sort": 39,
|
| 4 |
"name": "KIN",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\kin-epoch33-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\kin-epoch33-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"kin-epoch33-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "u7DJ",
|
| 118 |
"sort": 39,
|
| 119 |
"name": "KIN",
|
KK/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "7Mu6",
|
| 3 |
"sort": 34,
|
| 4 |
"name": "KK",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\kk-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\kk-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"kk-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "7Mu6",
|
| 118 |
"sort": 34,
|
| 119 |
"name": "KK",
|
KY/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "diDP",
|
| 3 |
"sort": 68,
|
| 4 |
"name": "KY",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 20,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ky-epoch20-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ky-epoch20-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ky-epoch20-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "diDP",
|
| 124 |
"sort": 68,
|
| 125 |
"name": "KY",
|
M/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "z65H",
|
| 3 |
"sort": 35,
|
| 4 |
"name": "M",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\m-epoch50-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\m-epoch50-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"m-epoch50-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "z65H",
|
| 118 |
"sort": 35,
|
| 119 |
"name": "M",
|
MCC/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "rl8n",
|
| 3 |
"sort": 41,
|
| 4 |
"name": "MCC",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\mcc-epoch25-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\mcc-epoch25-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"mcc-epoch25-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "rl8n",
|
| 118 |
"sort": 41,
|
| 119 |
"name": "MCC",
|
MF/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "5S6n",
|
| 3 |
"sort": 15,
|
| 4 |
"name": "MF",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 13,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/mf-epoch13-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/mf-epoch13-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"mf-epoch13-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "5S6n",
|
| 117 |
"sort": 15,
|
| 118 |
"name": "MF",
|
MH/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "4fW7",
|
| 3 |
"sort": 26,
|
| 4 |
"name": "MH",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 14,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\mh-epoch14-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\mh-epoch14-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"mh-epoch14-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "4fW7",
|
| 118 |
"sort": 26,
|
| 119 |
"name": "MH",
|
MHY/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "a1dO",
|
| 3 |
"sort": 51,
|
| 4 |
"name": "MHY",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\mhy-epoch50-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\mhy-epoch50-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"mhy-epoch50-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "a1dO",
|
| 118 |
"sort": 51,
|
| 119 |
"name": "MHY",
|
MI/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "Lu1P",
|
| 3 |
"sort": 9,
|
| 4 |
"name": "MI",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 100,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/mi-epoch100-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/mi-epoch100-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"mi-epoch100-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "Lu1P",
|
| 117 |
"sort": 9,
|
| 118 |
"name": "MI",
|
MIW/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "y0F7",
|
| 3 |
"sort": 18,
|
| 4 |
"name": "MIW",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 17,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/miw-epoch17-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/miw-epoch17-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"miw-epoch17-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "y0F7",
|
| 117 |
"sort": 18,
|
| 118 |
"name": "MIW",
|
MK/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "q8YU",
|
| 3 |
"sort": 17,
|
| 4 |
"name": "MK",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 100,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/mk-epoch100-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/mk-epoch100-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"mk-epoch100-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "q8YU",
|
| 117 |
"sort": 17,
|
| 118 |
"name": "MK",
|
MKB/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "umYq",
|
| 3 |
"sort": 67,
|
| 4 |
"name": "MKB",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 14,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\mkb-epoch14-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\mkb-epoch14-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"mkb-epoch14-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "umYq",
|
| 124 |
"sort": 67,
|
| 125 |
"name": "MKB",
|
MS/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "X1JZ",
|
| 3 |
"sort": 12,
|
| 4 |
"name": "MS",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/ms-epoch25-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/ms-epoch25-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"ms-epoch25-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "X1JZ",
|
| 117 |
"sort": 12,
|
| 118 |
"name": "MS",
|
MT/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "J03K",
|
| 3 |
"sort": 37,
|
| 4 |
"name": "MT",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\mt-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\mt-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"mt-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "J03K",
|
| 118 |
"sort": 37,
|
| 119 |
"name": "MT",
|
NH/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "VDps",
|
| 3 |
"sort": 38,
|
| 4 |
"name": "NH",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 50,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\nh-epoch50-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\nh-epoch50-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"nh-epoch50-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "VDps",
|
| 118 |
"sort": 38,
|
| 119 |
"name": "NH",
|
NHR/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "9792",
|
| 3 |
"sort": 59,
|
| 4 |
"name": "NHR",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 100,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\nhr-epoch100-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\nhr-epoch100-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"nhr-epoch100-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "9792",
|
| 118 |
"sort": 59,
|
| 119 |
"name": "NHR",
|
NO/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "3E+43",
|
| 3 |
"sort": 48,
|
| 4 |
"name": "NO",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 12,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\no-epoch12-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\no-epoch12-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"no-epoch12-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "3E+43",
|
| 118 |
"sort": 48,
|
| 119 |
"name": "NO",
|
NS/config.json
CHANGED
|
@@ -1,7 +1,122 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "38G9",
|
| 3 |
"sort": 30,
|
| 4 |
"name": "NS",
|
| 5 |
-
"description": "
|
| 6 |
"character": "22"
|
| 7 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 10,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\ns-epoch10-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\ns-epoch10-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"ns-epoch10-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "38G9",
|
| 118 |
"sort": 30,
|
| 119 |
"name": "NS",
|
| 120 |
+
"description": "Vtuberの従井ノラ",
|
| 121 |
"character": "22"
|
| 122 |
}
|
RK/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "K9vK",
|
| 3 |
"sort": 31,
|
| 4 |
"name": "RK",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 500,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 2,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\rk-epoch25-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\rk-epoch25-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"rk-epoch25-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "K9vK",
|
| 118 |
"sort": 31,
|
| 119 |
"name": "RK",
|
RS/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "r50Q",
|
| 3 |
"sort": 49,
|
| 4 |
"name": "RS",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 100,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\rs-epoch100-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\rs-epoch100-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"rs-epoch100-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "r50Q",
|
| 118 |
"sort": 49,
|
| 119 |
"name": "RS",
|
RT/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "PLUQ",
|
| 3 |
"sort": 2,
|
| 4 |
"name": "RT",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\rt-epoch33-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\rt-epoch33-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"rt-epoch33-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "PLUQ",
|
| 124 |
"sort": 2,
|
| 125 |
"name": "RT",
|
RT2/config.json
CHANGED
|
@@ -1,4 +1,125 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "4jdl",
|
| 3 |
"sort": 46,
|
| 4 |
"name": "RT2",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 33,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\rt2-epoch33-2.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\rt2-epoch33-2.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"rt2-epoch33-2.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 7,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0,
|
| 52 |
+
"anger": 1,
|
| 53 |
+
"disgust": 2,
|
| 54 |
+
"fear": 3,
|
| 55 |
+
"happy": 4,
|
| 56 |
+
"sad": 5,
|
| 57 |
+
"surprise": 6
|
| 58 |
+
}
|
| 59 |
+
},
|
| 60 |
+
"model": {
|
| 61 |
+
"use_spk_conditioned_encoder": true,
|
| 62 |
+
"use_noise_scaled_mas": true,
|
| 63 |
+
"use_mel_posterior_encoder": false,
|
| 64 |
+
"use_duration_discriminator": false,
|
| 65 |
+
"use_wavlm_discriminator": true,
|
| 66 |
+
"inter_channels": 192,
|
| 67 |
+
"hidden_channels": 192,
|
| 68 |
+
"filter_channels": 768,
|
| 69 |
+
"n_heads": 2,
|
| 70 |
+
"n_layers": 6,
|
| 71 |
+
"kernel_size": 3,
|
| 72 |
+
"p_dropout": 0.1,
|
| 73 |
+
"resblock": "1",
|
| 74 |
+
"resblock_kernel_sizes": [
|
| 75 |
+
3,
|
| 76 |
+
7,
|
| 77 |
+
11
|
| 78 |
+
],
|
| 79 |
+
"resblock_dilation_sizes": [
|
| 80 |
+
[
|
| 81 |
+
1,
|
| 82 |
+
3,
|
| 83 |
+
5
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
1,
|
| 87 |
+
3,
|
| 88 |
+
5
|
| 89 |
+
],
|
| 90 |
+
[
|
| 91 |
+
1,
|
| 92 |
+
3,
|
| 93 |
+
5
|
| 94 |
+
]
|
| 95 |
+
],
|
| 96 |
+
"upsample_rates": [
|
| 97 |
+
8,
|
| 98 |
+
8,
|
| 99 |
+
2,
|
| 100 |
+
2,
|
| 101 |
+
2
|
| 102 |
+
],
|
| 103 |
+
"upsample_initial_channel": 512,
|
| 104 |
+
"upsample_kernel_sizes": [
|
| 105 |
+
16,
|
| 106 |
+
16,
|
| 107 |
+
8,
|
| 108 |
+
2,
|
| 109 |
+
2
|
| 110 |
+
],
|
| 111 |
+
"n_layers_q": 3,
|
| 112 |
+
"use_spectral_norm": false,
|
| 113 |
+
"gin_channels": 512,
|
| 114 |
+
"slm": {
|
| 115 |
+
"model": "./slm/wavlm-base-plus",
|
| 116 |
+
"sr": 16000,
|
| 117 |
+
"hidden": 768,
|
| 118 |
+
"nlayers": 13,
|
| 119 |
+
"initial_channel": 64
|
| 120 |
+
}
|
| 121 |
+
},
|
| 122 |
+
"version": "2.6.1-JP-Extra",
|
| 123 |
"id": "4jdl",
|
| 124 |
"sort": 46,
|
| 125 |
"name": "RT2",
|
RTN/config.json
CHANGED
|
@@ -1,4 +1,119 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "sy1y",
|
| 3 |
"sort": 47,
|
| 4 |
"name": "RTN",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 14,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 3,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": false,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false,
|
| 29 |
+
"freeze_decoder": false
|
| 30 |
+
},
|
| 31 |
+
"data": {
|
| 32 |
+
"use_jp_extra": true,
|
| 33 |
+
"training_files": "Data\\rtn-epoch14-1.0.0\\train.list",
|
| 34 |
+
"validation_files": "Data\\rtn-epoch14-1.0.0\\val.list",
|
| 35 |
+
"max_wav_value": 32768.0,
|
| 36 |
+
"sampling_rate": 44100,
|
| 37 |
+
"filter_length": 2048,
|
| 38 |
+
"hop_length": 512,
|
| 39 |
+
"win_length": 2048,
|
| 40 |
+
"n_mel_channels": 128,
|
| 41 |
+
"mel_fmin": 0.0,
|
| 42 |
+
"mel_fmax": null,
|
| 43 |
+
"add_blank": true,
|
| 44 |
+
"n_speakers": 1,
|
| 45 |
+
"cleaned_text": true,
|
| 46 |
+
"spk2id": {
|
| 47 |
+
"rtn-epoch14-1.0.0": 0
|
| 48 |
+
},
|
| 49 |
+
"num_styles": 1,
|
| 50 |
+
"style2id": {
|
| 51 |
+
"Neutral": 0
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"model": {
|
| 55 |
+
"use_spk_conditioned_encoder": true,
|
| 56 |
+
"use_noise_scaled_mas": true,
|
| 57 |
+
"use_mel_posterior_encoder": false,
|
| 58 |
+
"use_duration_discriminator": false,
|
| 59 |
+
"use_wavlm_discriminator": true,
|
| 60 |
+
"inter_channels": 192,
|
| 61 |
+
"hidden_channels": 192,
|
| 62 |
+
"filter_channels": 768,
|
| 63 |
+
"n_heads": 2,
|
| 64 |
+
"n_layers": 6,
|
| 65 |
+
"kernel_size": 3,
|
| 66 |
+
"p_dropout": 0.1,
|
| 67 |
+
"resblock": "1",
|
| 68 |
+
"resblock_kernel_sizes": [
|
| 69 |
+
3,
|
| 70 |
+
7,
|
| 71 |
+
11
|
| 72 |
+
],
|
| 73 |
+
"resblock_dilation_sizes": [
|
| 74 |
+
[
|
| 75 |
+
1,
|
| 76 |
+
3,
|
| 77 |
+
5
|
| 78 |
+
],
|
| 79 |
+
[
|
| 80 |
+
1,
|
| 81 |
+
3,
|
| 82 |
+
5
|
| 83 |
+
],
|
| 84 |
+
[
|
| 85 |
+
1,
|
| 86 |
+
3,
|
| 87 |
+
5
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"upsample_rates": [
|
| 91 |
+
8,
|
| 92 |
+
8,
|
| 93 |
+
2,
|
| 94 |
+
2,
|
| 95 |
+
2
|
| 96 |
+
],
|
| 97 |
+
"upsample_initial_channel": 512,
|
| 98 |
+
"upsample_kernel_sizes": [
|
| 99 |
+
16,
|
| 100 |
+
16,
|
| 101 |
+
8,
|
| 102 |
+
2,
|
| 103 |
+
2
|
| 104 |
+
],
|
| 105 |
+
"n_layers_q": 3,
|
| 106 |
+
"use_spectral_norm": false,
|
| 107 |
+
"gin_channels": 512,
|
| 108 |
+
"slm": {
|
| 109 |
+
"model": "./slm/wavlm-base-plus",
|
| 110 |
+
"sr": 16000,
|
| 111 |
+
"hidden": 768,
|
| 112 |
+
"nlayers": 13,
|
| 113 |
+
"initial_channel": 64
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"version": "2.4.0-JP-Extra",
|
| 117 |
"id": "sy1y",
|
| 118 |
"sort": 47,
|
| 119 |
"name": "RTN",
|
RU/config.json
CHANGED
|
@@ -1,4 +1,118 @@
|
|
| 1 |
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
"id": "tB79",
|
| 3 |
"sort": 14,
|
| 4 |
"name": "RU",
|
|
|
|
| 1 |
{
|
| 2 |
+
"train": {
|
| 3 |
+
"log_interval": 200,
|
| 4 |
+
"eval_interval": 1000,
|
| 5 |
+
"seed": 42,
|
| 6 |
+
"epochs": 25,
|
| 7 |
+
"learning_rate": 0.0001,
|
| 8 |
+
"betas": [
|
| 9 |
+
0.8,
|
| 10 |
+
0.99
|
| 11 |
+
],
|
| 12 |
+
"eps": 1e-09,
|
| 13 |
+
"batch_size": 4,
|
| 14 |
+
"bf16_run": false,
|
| 15 |
+
"fp16_run": false,
|
| 16 |
+
"lr_decay": 0.99996,
|
| 17 |
+
"segment_size": 16384,
|
| 18 |
+
"init_lr_ratio": 1,
|
| 19 |
+
"warmup_epochs": 0,
|
| 20 |
+
"c_mel": 45,
|
| 21 |
+
"c_kl": 1.0,
|
| 22 |
+
"c_commit": 100,
|
| 23 |
+
"skip_optimizer": true,
|
| 24 |
+
"freeze_ZH_bert": false,
|
| 25 |
+
"freeze_JP_bert": false,
|
| 26 |
+
"freeze_EN_bert": false,
|
| 27 |
+
"freeze_emo": false,
|
| 28 |
+
"freeze_style": false
|
| 29 |
+
},
|
| 30 |
+
"data": {
|
| 31 |
+
"use_jp_extra": true,
|
| 32 |
+
"training_files": "Data/ru-epoch25-1.0.0/train.list",
|
| 33 |
+
"validation_files": "Data/ru-epoch25-1.0.0/val.list",
|
| 34 |
+
"max_wav_value": 32768.0,
|
| 35 |
+
"sampling_rate": 44100,
|
| 36 |
+
"filter_length": 2048,
|
| 37 |
+
"hop_length": 512,
|
| 38 |
+
"win_length": 2048,
|
| 39 |
+
"n_mel_channels": 128,
|
| 40 |
+
"mel_fmin": 0.0,
|
| 41 |
+
"mel_fmax": null,
|
| 42 |
+
"add_blank": true,
|
| 43 |
+
"n_speakers": 1,
|
| 44 |
+
"cleaned_text": true,
|
| 45 |
+
"spk2id": {
|
| 46 |
+
"ru-epoch25-1.0.0": 0
|
| 47 |
+
},
|
| 48 |
+
"num_styles": 1,
|
| 49 |
+
"style2id": {
|
| 50 |
+
"Neutral": 0
|
| 51 |
+
}
|
| 52 |
+
},
|
| 53 |
+
"model": {
|
| 54 |
+
"use_spk_conditioned_encoder": true,
|
| 55 |
+
"use_noise_scaled_mas": true,
|
| 56 |
+
"use_mel_posterior_encoder": false,
|
| 57 |
+
"use_duration_discriminator": false,
|
| 58 |
+
"use_wavlm_discriminator": true,
|
| 59 |
+
"inter_channels": 192,
|
| 60 |
+
"hidden_channels": 192,
|
| 61 |
+
"filter_channels": 768,
|
| 62 |
+
"n_heads": 2,
|
| 63 |
+
"n_layers": 6,
|
| 64 |
+
"kernel_size": 3,
|
| 65 |
+
"p_dropout": 0.1,
|
| 66 |
+
"resblock": "1",
|
| 67 |
+
"resblock_kernel_sizes": [
|
| 68 |
+
3,
|
| 69 |
+
7,
|
| 70 |
+
11
|
| 71 |
+
],
|
| 72 |
+
"resblock_dilation_sizes": [
|
| 73 |
+
[
|
| 74 |
+
1,
|
| 75 |
+
3,
|
| 76 |
+
5
|
| 77 |
+
],
|
| 78 |
+
[
|
| 79 |
+
1,
|
| 80 |
+
3,
|
| 81 |
+
5
|
| 82 |
+
],
|
| 83 |
+
[
|
| 84 |
+
1,
|
| 85 |
+
3,
|
| 86 |
+
5
|
| 87 |
+
]
|
| 88 |
+
],
|
| 89 |
+
"upsample_rates": [
|
| 90 |
+
8,
|
| 91 |
+
8,
|
| 92 |
+
2,
|
| 93 |
+
2,
|
| 94 |
+
2
|
| 95 |
+
],
|
| 96 |
+
"upsample_initial_channel": 512,
|
| 97 |
+
"upsample_kernel_sizes": [
|
| 98 |
+
16,
|
| 99 |
+
16,
|
| 100 |
+
8,
|
| 101 |
+
2,
|
| 102 |
+
2
|
| 103 |
+
],
|
| 104 |
+
"n_layers_q": 3,
|
| 105 |
+
"use_spectral_norm": false,
|
| 106 |
+
"gin_channels": 512,
|
| 107 |
+
"slm": {
|
| 108 |
+
"model": "./slm/wavlm-base-plus",
|
| 109 |
+
"sr": 16000,
|
| 110 |
+
"hidden": 768,
|
| 111 |
+
"nlayers": 13,
|
| 112 |
+
"initial_channel": 64
|
| 113 |
+
}
|
| 114 |
+
},
|
| 115 |
+
"version": "2.1-JP-Extra",
|
| 116 |
"id": "tB79",
|
| 117 |
"sort": 14,
|
| 118 |
"name": "RU",
|