kkvc-hf committed on
Commit
c39ccfe
·
1 Parent(s): 1193f05
This view is limited to 50 files because it contains too many changes.   See raw diff
AF/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "3KD3",
3
  "sort": 19,
4
  "name": "AF",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/af-epoch25-1.0.0/train.list",
33
+ "validation_files": "Data/af-epoch25-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "af-epoch25-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "3KD3",
117
  "sort": 19,
118
  "name": "AF",
AJU_YM/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "X59B",
3
  "sort": 57,
4
  "name": "AJU_YM",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\aju_ym-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\aju_ym-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "aju_ym-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "X59B",
118
  "sort": 57,
119
  "name": "AJU_YM",
AK/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "NSO5",
3
  "sort": 5,
4
  "name": "AK",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 20,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ak-epoch20-2.0.0\\train.list",
34
+ "validation_files": "Data\\ak-epoch20-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ak-epoch20-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "NSO5",
124
  "sort": 5,
125
  "name": "AK",
AKS/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "356k",
3
  "sort": 16,
4
  "name": "AKS",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 20,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/aks-epoch20-1.0.0/train.list",
33
+ "validation_files": "Data/aks-epoch20-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "aks-epoch20-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "356k",
117
  "sort": 16,
118
  "name": "AKS",
AKS2/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "afWH",
3
  "sort": 42,
4
  "name": "AKS2",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\aks2-epoch25-1.0.0\\train.list",
34
+ "validation_files": "Data\\aks2-epoch25-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "aks2-epoch25-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "afWH",
118
  "sort": 42,
119
  "name": "AKS2",
AKT/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "rLOK",
3
  "sort": 58,
4
  "name": "AKT",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 17,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\akt-epoch17-1.0.0\\train.list",
34
+ "validation_files": "Data\\akt-epoch17-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "akt-epoch17-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "rLOK",
118
  "sort": 58,
119
  "name": "AKT",
AKY/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "OKsC",
3
  "sort": 7,
4
  "name": "AKY",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/aky-epoch50-1.0.0/train.list",
33
+ "validation_files": "Data/aky-epoch50-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "aky-epoch50-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "OKsC",
117
  "sort": 7,
118
  "name": "AKY",
AN/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "9a5o",
3
  "sort": 32,
4
  "name": "AN",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\an-epoch33-1.0.0\\train.list",
34
+ "validation_files": "Data\\an-epoch33-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "an-epoch33-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "9a5o",
118
  "sort": 32,
119
  "name": "AN",
AS/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "E7VR",
3
  "sort": 64,
4
  "name": "AS",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\as-epoch25-1.0.0\\train.list",
34
+ "validation_files": "Data\\as-epoch25-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "as-epoch25-1.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "E7VR",
124
  "sort": 64,
125
  "name": "AS",
AS2/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "O8vN",
3
  "sort": 65,
4
  "name": "AS2",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\as2-epoch33-1.0.0\\train.list",
34
+ "validation_files": "Data\\as2-epoch33-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "as2-epoch33-1.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "O8vN",
124
  "sort": 65,
125
  "name": "AS2",
AS3/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "f8Op",
3
  "sort": 66,
4
  "name": "AS3",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\as3-epoch50-1.0.0\\train.list",
34
+ "validation_files": "Data\\as3-epoch50-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "as3-epoch50-1.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "f8Op",
124
  "sort": 66,
125
  "name": "AS3",
AT/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "cDSg",
3
  "sort": 11,
4
  "name": "AT",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 14,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/at-epoch14-1.0.0/train.list",
33
+ "validation_files": "Data/at-epoch14-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "at-epoch14-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "cDSg",
117
  "sort": 11,
118
  "name": "AT",
ATN/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "q0cL",
3
  "sort": 27,
4
  "name": "ATN",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data/atn-epoch10-1.0.0/train.list",
34
+ "validation_files": "Data/atn-epoch10-1.0.0/val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "atn-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.3.1-JP-Extra",
117
  "id": "q0cL",
118
  "sort": 27,
119
  "name": "ATN",
AU/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "O5Jt",
3
  "sort": 3,
4
  "name": "AU",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\au-epoch33-2.0.0\\train.list",
34
+ "validation_files": "Data\\au-epoch33-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "au-epoch33-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "O5Jt",
124
  "sort": 3,
125
  "name": "AU",
AY/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "4Qs7",
3
  "sort": 33,
4
  "name": "AY",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ay3-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\ay3-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ay3-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "4Qs7",
124
  "sort": 33,
125
  "name": "AY",
FI/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "vcL0",
3
  "sort": 24,
4
  "name": "FI",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/fi-epoch100-1.0.0/train.list",
33
+ "validation_files": "Data/fi-epoch100-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "fi-epoch100-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "vcL0",
117
  "sort": 24,
118
  "name": "FI",
HM/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "KcGx",
3
  "sort": 62,
4
  "name": "HM",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\hm-epoch33-1.0.0\\train.list",
34
+ "validation_files": "Data\\hm-epoch33-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "hm-epoch33-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "KcGx",
118
  "sort": 62,
119
  "name": "HM",
HS/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "u3KN",
3
  "sort": 6,
4
  "name": "HS",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/hs-epoch50-1.0.0/train.list",
33
+ "validation_files": "Data/hs-epoch50-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "hs-epoch50-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "u3KN",
117
  "sort": 6,
118
  "name": "HS",
HT/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "Ac0c",
3
  "sort": 36,
4
  "name": "HT",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ht-epoch50-1.0.0\\train.list",
34
+ "validation_files": "Data\\ht-epoch50-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ht-epoch50-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "Ac0c",
118
  "sort": 36,
119
  "name": "HT",
IH/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "jK1l",
3
  "sort": 29,
4
  "name": "IH",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ih-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\ih-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ih-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "jK1l",
118
  "sort": 29,
119
  "name": "IH",
IM/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "rY5k",
3
  "sort": 4,
4
  "name": "IM",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\im-epoch33-2.0.0\\train.list",
34
+ "validation_files": "Data\\im-epoch33-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "im-epoch33-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "rY5k",
124
  "sort": 4,
125
  "name": "IM",
IM2/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "OY15",
3
  "sort": 55,
4
  "name": "IM2",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\im2-epoch33-2.0.0\\train.list",
34
+ "validation_files": "Data\\im2-epoch33-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "im2-epoch33-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "OY15",
124
  "sort": 55,
125
  "name": "IM2",
KH/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "xg2y",
3
  "sort": 13,
4
  "name": "KH",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/kh-epoch25-1.0.0/train.list",
33
+ "validation_files": "Data/kh-epoch25-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "kh-epoch25-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "xg2y",
117
  "sort": 13,
118
  "name": "KH",
KHN/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "diDP",
3
  "sort": 63,
4
  "name": "KHN",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 12,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\khn-epoch12-1.0.0\\train.list",
34
+ "validation_files": "Data\\khn-epoch12-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "khn-epoch12-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "diDP",
118
  "sort": 63,
119
  "name": "KHN",
KI/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "009j",
3
  "sort": 28,
4
  "name": "KI",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 20,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ki-epoch20-1.0.0\\train.list",
34
+ "validation_files": "Data\\ki-epoch20-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ki-epoch20-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "009j",
118
  "sort": 28,
119
  "name": "KI",
KI2/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "y5g6",
3
  "sort": 43,
4
  "name": "KI2",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ki2-epoch33-1.0.0\\train.list",
34
+ "validation_files": "Data\\ki2-epoch33-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ki2-epoch33-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "y5g6",
118
  "sort": 43,
119
  "name": "KI2",
KIN/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "u7DJ",
3
  "sort": 39,
4
  "name": "KIN",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\kin-epoch33-1.0.0\\train.list",
34
+ "validation_files": "Data\\kin-epoch33-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "kin-epoch33-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "u7DJ",
118
  "sort": 39,
119
  "name": "KIN",
KK/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "7Mu6",
3
  "sort": 34,
4
  "name": "KK",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\kk-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\kk-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "kk-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "7Mu6",
118
  "sort": 34,
119
  "name": "KK",
KY/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "diDP",
3
  "sort": 68,
4
  "name": "KY",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 20,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ky-epoch20-2.0.0\\train.list",
34
+ "validation_files": "Data\\ky-epoch20-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ky-epoch20-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "diDP",
124
  "sort": 68,
125
  "name": "KY",
M/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "z65H",
3
  "sort": 35,
4
  "name": "M",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\m-epoch50-1.0.0\\train.list",
34
+ "validation_files": "Data\\m-epoch50-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "m-epoch50-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "z65H",
118
  "sort": 35,
119
  "name": "M",
MCC/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "rl8n",
3
  "sort": 41,
4
  "name": "MCC",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\mcc-epoch25-1.0.0\\train.list",
34
+ "validation_files": "Data\\mcc-epoch25-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "mcc-epoch25-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "rl8n",
118
  "sort": 41,
119
  "name": "MCC",
MF/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "5S6n",
3
  "sort": 15,
4
  "name": "MF",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 13,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/mf-epoch13-1.0.0/train.list",
33
+ "validation_files": "Data/mf-epoch13-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "mf-epoch13-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "5S6n",
117
  "sort": 15,
118
  "name": "MF",
MH/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "4fW7",
3
  "sort": 26,
4
  "name": "MH",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 14,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\mh-epoch14-1.0.0\\train.list",
34
+ "validation_files": "Data\\mh-epoch14-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "mh-epoch14-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "4fW7",
118
  "sort": 26,
119
  "name": "MH",
MHY/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "a1dO",
3
  "sort": 51,
4
  "name": "MHY",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\mhy-epoch50-1.0.0\\train.list",
34
+ "validation_files": "Data\\mhy-epoch50-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "mhy-epoch50-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "a1dO",
118
  "sort": 51,
119
  "name": "MHY",
MI/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "Lu1P",
3
  "sort": 9,
4
  "name": "MI",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/mi-epoch100-1.0.0/train.list",
33
+ "validation_files": "Data/mi-epoch100-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "mi-epoch100-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "Lu1P",
117
  "sort": 9,
118
  "name": "MI",
MIW/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "y0F7",
3
  "sort": 18,
4
  "name": "MIW",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 17,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/miw-epoch17-1.0.0/train.list",
33
+ "validation_files": "Data/miw-epoch17-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "miw-epoch17-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "y0F7",
117
  "sort": 18,
118
  "name": "MIW",
MK/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "q8YU",
3
  "sort": 17,
4
  "name": "MK",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/mk-epoch100-1.0.0/train.list",
33
+ "validation_files": "Data/mk-epoch100-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "mk-epoch100-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "q8YU",
117
  "sort": 17,
118
  "name": "MK",
MKB/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "umYq",
3
  "sort": 67,
4
  "name": "MKB",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 14,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\mkb-epoch14-2.0.0\\train.list",
34
+ "validation_files": "Data\\mkb-epoch14-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "mkb-epoch14-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "umYq",
124
  "sort": 67,
125
  "name": "MKB",
MS/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "X1JZ",
3
  "sort": 12,
4
  "name": "MS",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/ms-epoch25-1.0.0/train.list",
33
+ "validation_files": "Data/ms-epoch25-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "ms-epoch25-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "X1JZ",
117
  "sort": 12,
118
  "name": "MS",
MT/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "J03K",
3
  "sort": 37,
4
  "name": "MT",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\mt-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\mt-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "mt-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "J03K",
118
  "sort": 37,
119
  "name": "MT",
NH/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "VDps",
3
  "sort": 38,
4
  "name": "NH",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 50,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\nh-epoch50-1.0.0\\train.list",
34
+ "validation_files": "Data\\nh-epoch50-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "nh-epoch50-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "VDps",
118
  "sort": 38,
119
  "name": "NH",
NHR/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "9792",
3
  "sort": 59,
4
  "name": "NHR",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\nhr-epoch100-1.0.0\\train.list",
34
+ "validation_files": "Data\\nhr-epoch100-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "nhr-epoch100-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "9792",
118
  "sort": 59,
119
  "name": "NHR",
NO/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "3E+43",
3
  "sort": 48,
4
  "name": "NO",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 12,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\no-epoch12-1.0.0\\train.list",
34
+ "validation_files": "Data\\no-epoch12-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "no-epoch12-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "3E+43",
118
  "sort": 48,
119
  "name": "NO",
NS/config.json CHANGED
@@ -1,7 +1,122 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "38G9",
3
  "sort": 30,
4
  "name": "NS",
5
- "description": "深層組の従井ノラ",
6
  "character": "22"
7
  }
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 10,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\ns-epoch10-1.0.0\\train.list",
34
+ "validation_files": "Data\\ns-epoch10-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "ns-epoch10-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "38G9",
118
  "sort": 30,
119
  "name": "NS",
120
+ "description": "Vtuberの従井ノラ",
121
  "character": "22"
122
  }
RK/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "K9vK",
3
  "sort": 31,
4
  "name": "RK",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 500,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 2,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\rk-epoch25-1.0.0\\train.list",
34
+ "validation_files": "Data\\rk-epoch25-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "rk-epoch25-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "K9vK",
118
  "sort": 31,
119
  "name": "RK",
RS/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "r50Q",
3
  "sort": 49,
4
  "name": "RS",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 100,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\rs-epoch100-1.0.0\\train.list",
34
+ "validation_files": "Data\\rs-epoch100-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "rs-epoch100-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "r50Q",
118
  "sort": 49,
119
  "name": "RS",
RT/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "PLUQ",
3
  "sort": 2,
4
  "name": "RT",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\rt-epoch33-2.0.0\\train.list",
34
+ "validation_files": "Data\\rt-epoch33-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "rt-epoch33-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "PLUQ",
124
  "sort": 2,
125
  "name": "RT",
RT2/config.json CHANGED
@@ -1,4 +1,125 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "4jdl",
3
  "sort": 46,
4
  "name": "RT2",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 33,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\rt2-epoch33-2.0.0\\train.list",
34
+ "validation_files": "Data\\rt2-epoch33-2.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "rt2-epoch33-2.0.0": 0
48
+ },
49
+ "num_styles": 7,
50
+ "style2id": {
51
+ "Neutral": 0,
52
+ "anger": 1,
53
+ "disgust": 2,
54
+ "fear": 3,
55
+ "happy": 4,
56
+ "sad": 5,
57
+ "surprise": 6
58
+ }
59
+ },
60
+ "model": {
61
+ "use_spk_conditioned_encoder": true,
62
+ "use_noise_scaled_mas": true,
63
+ "use_mel_posterior_encoder": false,
64
+ "use_duration_discriminator": false,
65
+ "use_wavlm_discriminator": true,
66
+ "inter_channels": 192,
67
+ "hidden_channels": 192,
68
+ "filter_channels": 768,
69
+ "n_heads": 2,
70
+ "n_layers": 6,
71
+ "kernel_size": 3,
72
+ "p_dropout": 0.1,
73
+ "resblock": "1",
74
+ "resblock_kernel_sizes": [
75
+ 3,
76
+ 7,
77
+ 11
78
+ ],
79
+ "resblock_dilation_sizes": [
80
+ [
81
+ 1,
82
+ 3,
83
+ 5
84
+ ],
85
+ [
86
+ 1,
87
+ 3,
88
+ 5
89
+ ],
90
+ [
91
+ 1,
92
+ 3,
93
+ 5
94
+ ]
95
+ ],
96
+ "upsample_rates": [
97
+ 8,
98
+ 8,
99
+ 2,
100
+ 2,
101
+ 2
102
+ ],
103
+ "upsample_initial_channel": 512,
104
+ "upsample_kernel_sizes": [
105
+ 16,
106
+ 16,
107
+ 8,
108
+ 2,
109
+ 2
110
+ ],
111
+ "n_layers_q": 3,
112
+ "use_spectral_norm": false,
113
+ "gin_channels": 512,
114
+ "slm": {
115
+ "model": "./slm/wavlm-base-plus",
116
+ "sr": 16000,
117
+ "hidden": 768,
118
+ "nlayers": 13,
119
+ "initial_channel": 64
120
+ }
121
+ },
122
+ "version": "2.6.1-JP-Extra",
123
  "id": "4jdl",
124
  "sort": 46,
125
  "name": "RT2",
RTN/config.json CHANGED
@@ -1,4 +1,119 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "sy1y",
3
  "sort": 47,
4
  "name": "RTN",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 14,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 3,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": false,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false,
29
+ "freeze_decoder": false
30
+ },
31
+ "data": {
32
+ "use_jp_extra": true,
33
+ "training_files": "Data\\rtn-epoch14-1.0.0\\train.list",
34
+ "validation_files": "Data\\rtn-epoch14-1.0.0\\val.list",
35
+ "max_wav_value": 32768.0,
36
+ "sampling_rate": 44100,
37
+ "filter_length": 2048,
38
+ "hop_length": 512,
39
+ "win_length": 2048,
40
+ "n_mel_channels": 128,
41
+ "mel_fmin": 0.0,
42
+ "mel_fmax": null,
43
+ "add_blank": true,
44
+ "n_speakers": 1,
45
+ "cleaned_text": true,
46
+ "spk2id": {
47
+ "rtn-epoch14-1.0.0": 0
48
+ },
49
+ "num_styles": 1,
50
+ "style2id": {
51
+ "Neutral": 0
52
+ }
53
+ },
54
+ "model": {
55
+ "use_spk_conditioned_encoder": true,
56
+ "use_noise_scaled_mas": true,
57
+ "use_mel_posterior_encoder": false,
58
+ "use_duration_discriminator": false,
59
+ "use_wavlm_discriminator": true,
60
+ "inter_channels": 192,
61
+ "hidden_channels": 192,
62
+ "filter_channels": 768,
63
+ "n_heads": 2,
64
+ "n_layers": 6,
65
+ "kernel_size": 3,
66
+ "p_dropout": 0.1,
67
+ "resblock": "1",
68
+ "resblock_kernel_sizes": [
69
+ 3,
70
+ 7,
71
+ 11
72
+ ],
73
+ "resblock_dilation_sizes": [
74
+ [
75
+ 1,
76
+ 3,
77
+ 5
78
+ ],
79
+ [
80
+ 1,
81
+ 3,
82
+ 5
83
+ ],
84
+ [
85
+ 1,
86
+ 3,
87
+ 5
88
+ ]
89
+ ],
90
+ "upsample_rates": [
91
+ 8,
92
+ 8,
93
+ 2,
94
+ 2,
95
+ 2
96
+ ],
97
+ "upsample_initial_channel": 512,
98
+ "upsample_kernel_sizes": [
99
+ 16,
100
+ 16,
101
+ 8,
102
+ 2,
103
+ 2
104
+ ],
105
+ "n_layers_q": 3,
106
+ "use_spectral_norm": false,
107
+ "gin_channels": 512,
108
+ "slm": {
109
+ "model": "./slm/wavlm-base-plus",
110
+ "sr": 16000,
111
+ "hidden": 768,
112
+ "nlayers": 13,
113
+ "initial_channel": 64
114
+ }
115
+ },
116
+ "version": "2.4.0-JP-Extra",
117
  "id": "sy1y",
118
  "sort": 47,
119
  "name": "RTN",
RU/config.json CHANGED
@@ -1,4 +1,118 @@
1
  {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  "id": "tB79",
3
  "sort": 14,
4
  "name": "RU",
 
1
  {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 42,
6
+ "epochs": 25,
7
+ "learning_rate": 0.0001,
8
+ "betas": [
9
+ 0.8,
10
+ 0.99
11
+ ],
12
+ "eps": 1e-09,
13
+ "batch_size": 4,
14
+ "bf16_run": false,
15
+ "fp16_run": false,
16
+ "lr_decay": 0.99996,
17
+ "segment_size": 16384,
18
+ "init_lr_ratio": 1,
19
+ "warmup_epochs": 0,
20
+ "c_mel": 45,
21
+ "c_kl": 1.0,
22
+ "c_commit": 100,
23
+ "skip_optimizer": true,
24
+ "freeze_ZH_bert": false,
25
+ "freeze_JP_bert": false,
26
+ "freeze_EN_bert": false,
27
+ "freeze_emo": false,
28
+ "freeze_style": false
29
+ },
30
+ "data": {
31
+ "use_jp_extra": true,
32
+ "training_files": "Data/ru-epoch25-1.0.0/train.list",
33
+ "validation_files": "Data/ru-epoch25-1.0.0/val.list",
34
+ "max_wav_value": 32768.0,
35
+ "sampling_rate": 44100,
36
+ "filter_length": 2048,
37
+ "hop_length": 512,
38
+ "win_length": 2048,
39
+ "n_mel_channels": 128,
40
+ "mel_fmin": 0.0,
41
+ "mel_fmax": null,
42
+ "add_blank": true,
43
+ "n_speakers": 1,
44
+ "cleaned_text": true,
45
+ "spk2id": {
46
+ "ru-epoch25-1.0.0": 0
47
+ },
48
+ "num_styles": 1,
49
+ "style2id": {
50
+ "Neutral": 0
51
+ }
52
+ },
53
+ "model": {
54
+ "use_spk_conditioned_encoder": true,
55
+ "use_noise_scaled_mas": true,
56
+ "use_mel_posterior_encoder": false,
57
+ "use_duration_discriminator": false,
58
+ "use_wavlm_discriminator": true,
59
+ "inter_channels": 192,
60
+ "hidden_channels": 192,
61
+ "filter_channels": 768,
62
+ "n_heads": 2,
63
+ "n_layers": 6,
64
+ "kernel_size": 3,
65
+ "p_dropout": 0.1,
66
+ "resblock": "1",
67
+ "resblock_kernel_sizes": [
68
+ 3,
69
+ 7,
70
+ 11
71
+ ],
72
+ "resblock_dilation_sizes": [
73
+ [
74
+ 1,
75
+ 3,
76
+ 5
77
+ ],
78
+ [
79
+ 1,
80
+ 3,
81
+ 5
82
+ ],
83
+ [
84
+ 1,
85
+ 3,
86
+ 5
87
+ ]
88
+ ],
89
+ "upsample_rates": [
90
+ 8,
91
+ 8,
92
+ 2,
93
+ 2,
94
+ 2
95
+ ],
96
+ "upsample_initial_channel": 512,
97
+ "upsample_kernel_sizes": [
98
+ 16,
99
+ 16,
100
+ 8,
101
+ 2,
102
+ 2
103
+ ],
104
+ "n_layers_q": 3,
105
+ "use_spectral_norm": false,
106
+ "gin_channels": 512,
107
+ "slm": {
108
+ "model": "./slm/wavlm-base-plus",
109
+ "sr": 16000,
110
+ "hidden": 768,
111
+ "nlayers": 13,
112
+ "initial_channel": 64
113
+ }
114
+ },
115
+ "version": "2.1-JP-Extra",
116
  "id": "tB79",
117
  "sort": 14,
118
  "name": "RU",