Haopeng Gen committed on
Commit
b348852
·
1 Parent(s): a22f4e8

add vocoders

Browse files
.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ s3prl-vc-ppg_sxliu filter=lfs diff=lfs merge=lfs -text
37
+ hifigan.16k_320 filter=lfs diff=lfs merge=lfs -text
38
+ hifigan_hubert.16k_320 filter=lfs diff=lfs merge=lfs -text
39
+ hifigan_hubert_unit_km500.16k_320 filter=lfs diff=lfs merge=lfs -text
40
+ ppg_sxliu_decoder_V006 filter=lfs diff=lfs merge=lfs -text
41
+ pwg.16k_256 filter=lfs diff=lfs merge=lfs -text
42
+ README.md filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,3 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bcf87ecfbbb8e07a01b21415a970c8b53a5283bf6872b657040d3f45c9241f7
3
+ size 31
hifigan.16k_320/checkpoint-400000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99844ee49f8a011ad9a245219c19cd6a10d751539198b24e64929baf0d8c933e
3
+ size 1119163385
hifigan.16k_320/config.yml ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 10240
3
+ batch_size: 16
4
+ config: conf/hifigan.16k_320.yaml
5
+ dev_dumpdir: dump/dev/norm
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_adv_loss_params:
10
+ average_by_discriminators: false
11
+ discriminator_grad_norm: -1
12
+ discriminator_optimizer_params:
13
+ betas:
14
+ - 0.5
15
+ - 0.9
16
+ lr: 0.0002
17
+ weight_decay: 0.0
18
+ discriminator_optimizer_type: Adam
19
+ discriminator_params:
20
+ follow_official_norm: true
21
+ period_discriminator_params:
22
+ bias: true
23
+ channels: 32
24
+ downsample_scales:
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 1
30
+ in_channels: 1
31
+ kernel_sizes:
32
+ - 5
33
+ - 3
34
+ max_downsample_channels: 1024
35
+ nonlinear_activation: LeakyReLU
36
+ nonlinear_activation_params:
37
+ negative_slope: 0.1
38
+ out_channels: 1
39
+ use_spectral_norm: false
40
+ use_weight_norm: true
41
+ periods:
42
+ - 2
43
+ - 3
44
+ - 5
45
+ - 7
46
+ - 11
47
+ scale_discriminator_params:
48
+ bias: true
49
+ channels: 128
50
+ downsample_scales:
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 1
56
+ in_channels: 1
57
+ kernel_sizes:
58
+ - 15
59
+ - 41
60
+ - 5
61
+ - 3
62
+ max_downsample_channels: 1024
63
+ max_groups: 16
64
+ nonlinear_activation: LeakyReLU
65
+ nonlinear_activation_params:
66
+ negative_slope: 0.1
67
+ out_channels: 1
68
+ scale_downsample_pooling: AvgPool1d
69
+ scale_downsample_pooling_params:
70
+ kernel_size: 4
71
+ padding: 2
72
+ stride: 2
73
+ scales: 3
74
+ discriminator_scheduler_params:
75
+ gamma: 0.5
76
+ milestones:
77
+ - 200000
78
+ - 400000
79
+ - 600000
80
+ - 800000
81
+ discriminator_scheduler_type: MultiStepLR
82
+ discriminator_train_start_steps: 0
83
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
84
+ distributed: false
85
+ eval_interval_steps: 1000
86
+ feat_match_loss_params:
87
+ average_by_discriminators: false
88
+ average_by_layers: false
89
+ include_final_outputs: false
90
+ fft_size: 1280
91
+ fmax: 7600
92
+ fmin: 80
93
+ format: hdf5
94
+ generator_adv_loss_params:
95
+ average_by_discriminators: false
96
+ generator_grad_norm: -1
97
+ generator_optimizer_params:
98
+ betas:
99
+ - 0.5
100
+ - 0.9
101
+ lr: 0.0002
102
+ weight_decay: 0.0
103
+ generator_optimizer_type: Adam
104
+ generator_params:
105
+ bias: true
106
+ channels: 640
107
+ in_channels: 80
108
+ kernel_size: 7
109
+ nonlinear_activation: LeakyReLU
110
+ nonlinear_activation_params:
111
+ negative_slope: 0.1
112
+ out_channels: 1
113
+ resblock_dilations:
114
+ - - 1
115
+ - 3
116
+ - 5
117
+ - - 1
118
+ - 3
119
+ - 5
120
+ - - 1
121
+ - 3
122
+ - 5
123
+ resblock_kernel_sizes:
124
+ - 3
125
+ - 7
126
+ - 11
127
+ upsample_kernel_sizes:
128
+ - 20
129
+ - 16
130
+ - 4
131
+ - 4
132
+ upsample_scales:
133
+ - 10
134
+ - 8
135
+ - 2
136
+ - 2
137
+ use_additional_convs: true
138
+ use_weight_norm: true
139
+ generator_scheduler_params:
140
+ gamma: 0.5
141
+ milestones:
142
+ - 200000
143
+ - 400000
144
+ - 600000
145
+ - 800000
146
+ generator_scheduler_type: MultiStepLR
147
+ generator_train_start_steps: 1
148
+ generator_type: HiFiGANGenerator
149
+ global_gain_scale: 1.0
150
+ hop_size: 320
151
+ lambda_adv: 1.0
152
+ lambda_aux: 45.0
153
+ lambda_feat_match: 2.0
154
+ log_interval_steps: 100
155
+ mel_loss_params:
156
+ fft_size: 1280
157
+ fmax: 8000
158
+ fmin: 0
159
+ fs: 16000
160
+ hop_size: 320
161
+ log_base: null
162
+ num_mels: 80
163
+ win_length: null
164
+ window: hann
165
+ num_mels: 80
166
+ num_save_intermediate_results: 4
167
+ num_workers: 2
168
+ outdir: exp/train_nodev_hifigan.16k_320
169
+ pin_memory: true
170
+ pretrain: ''
171
+ rank: 0
172
+ remove_short_samples: false
173
+ resume: ''
174
+ sampling_rate: 16000
175
+ save_interval_steps: 10000
176
+ train_dumpdir: dump/train_nodev/norm
177
+ train_feats_scp: null
178
+ train_max_steps: 400000
179
+ train_segments: null
180
+ train_wav_scp: null
181
+ trim_frame_size: 1024
182
+ trim_hop_size: 320
183
+ trim_silence: false
184
+ trim_threshold_in_db: 20
185
+ use_feat_match_loss: true
186
+ use_mel_loss: true
187
+ use_stft_loss: false
188
+ verbose: 1
189
+ version: 0.6.2a
190
+ win_length: null
191
+ window: hann
hifigan.16k_320/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acdf123b29e8e9d857006144b46583da550af45dd865b89f2f609a45a80eee48
3
+ size 4912
hifigan_hubert.16k_320/checkpoint-400000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bf20a87037cb70e3151309135b73890927f4174daec3f92d5ea7312a6095c6d
3
+ size 1042691825
hifigan_hubert.16k_320/config.yml ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 10240
3
+ batch_size: 32
4
+ config: ./conf/hifigan_hubert.v1.yaml
5
+ dev_dumpdir: dump/V006_SS_max_valid_dev/raw
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_adv_loss_params:
10
+ average_by_discriminators: false
11
+ discriminator_grad_norm: -1
12
+ discriminator_optimizer_params:
13
+ betas:
14
+ - 0.5
15
+ - 0.9
16
+ lr: 0.0002
17
+ weight_decay: 0.0
18
+ discriminator_optimizer_type: Adam
19
+ discriminator_params:
20
+ follow_official_norm: true
21
+ period_discriminator_params:
22
+ bias: true
23
+ channels: 32
24
+ downsample_scales:
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 1
30
+ in_channels: 1
31
+ kernel_sizes:
32
+ - 5
33
+ - 3
34
+ max_downsample_channels: 1024
35
+ nonlinear_activation: LeakyReLU
36
+ nonlinear_activation_params:
37
+ negative_slope: 0.1
38
+ out_channels: 1
39
+ use_spectral_norm: false
40
+ use_weight_norm: true
41
+ periods:
42
+ - 2
43
+ - 3
44
+ - 5
45
+ - 7
46
+ - 11
47
+ scale_discriminator_params:
48
+ bias: true
49
+ channels: 128
50
+ downsample_scales:
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 1
56
+ in_channels: 1
57
+ kernel_sizes:
58
+ - 15
59
+ - 41
60
+ - 5
61
+ - 3
62
+ max_downsample_channels: 1024
63
+ max_groups: 16
64
+ nonlinear_activation: LeakyReLU
65
+ nonlinear_activation_params:
66
+ negative_slope: 0.1
67
+ out_channels: 1
68
+ scale_downsample_pooling: AvgPool1d
69
+ scale_downsample_pooling_params:
70
+ kernel_size: 4
71
+ padding: 2
72
+ stride: 2
73
+ scales: 3
74
+ discriminator_scheduler_params:
75
+ gamma: 0.5
76
+ milestones:
77
+ - 200000
78
+ - 400000
79
+ - 600000
80
+ - 800000
81
+ discriminator_scheduler_type: MultiStepLR
82
+ discriminator_train_start_steps: 0
83
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
84
+ distributed: false
85
+ eval_interval_steps: 1000
86
+ feat_match_loss_params:
87
+ average_by_discriminators: false
88
+ average_by_layers: false
89
+ include_final_outputs: true
90
+ fft_size: null
91
+ fmax: null
92
+ fmin: null
93
+ format: hdf5
94
+ generator_adv_loss_params:
95
+ average_by_discriminators: false
96
+ generator_grad_norm: -1
97
+ generator_optimizer_params:
98
+ betas:
99
+ - 0.5
100
+ - 0.9
101
+ lr: 0.0002
102
+ weight_decay: 0.0
103
+ generator_optimizer_type: Adam
104
+ generator_params:
105
+ bias: true
106
+ channels: 512
107
+ concat_spk_emb: false
108
+ in_channels: 512
109
+ kernel_size: 7
110
+ nonlinear_activation: LeakyReLU
111
+ nonlinear_activation_params:
112
+ negative_slope: 0.1
113
+ num_embs: 100
114
+ num_spk_embs: 128
115
+ out_channels: 1
116
+ resblock_dilations:
117
+ - - 1
118
+ - 3
119
+ - 5
120
+ - - 1
121
+ - 3
122
+ - 5
123
+ - - 1
124
+ - 3
125
+ - 5
126
+ resblock_kernel_sizes:
127
+ - 3
128
+ - 7
129
+ - 11
130
+ spk_emb_dim: 512
131
+ upsample_kernel_sizes:
132
+ - 20
133
+ - 16
134
+ - 4
135
+ - 4
136
+ upsample_scales:
137
+ - 10
138
+ - 8
139
+ - 2
140
+ - 2
141
+ use_additional_convs: true
142
+ use_weight_norm: true
143
+ generator_scheduler_params:
144
+ gamma: 0.5
145
+ milestones:
146
+ - 200000
147
+ - 400000
148
+ - 600000
149
+ - 800000
150
+ generator_scheduler_type: MultiStepLR
151
+ generator_train_start_steps: 1
152
+ generator_type: DiscreteSymbolHiFiGANGenerator
153
+ global_gain_scale: 1.0
154
+ hop_size: 320
155
+ lambda_adv: 1.0
156
+ lambda_aux: 45.0
157
+ lambda_feat_match: 2.0
158
+ log_interval_steps: 100
159
+ mel_loss_params:
160
+ fft_size: 1280
161
+ fmax: 8000
162
+ fmin: 0
163
+ fs: 16000
164
+ hop_size: 320
165
+ log_base: null
166
+ num_mels: 80
167
+ win_length: null
168
+ window: hann
169
+ num_mels: 2
170
+ num_save_intermediate_results: 4
171
+ num_workers: 2
172
+ outdir: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1
173
+ pin_memory: true
174
+ pretrain: ''
175
+ rank: 0
176
+ remove_short_samples: false
177
+ resume: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1/checkpoint-300steps.pkl
178
+ sampling_rate: 16000
179
+ save_interval_steps: 50000
180
+ train_dumpdir: dump/V006_SS_max_valid_train_2000/raw
181
+ train_feats_scp: null
182
+ train_max_steps: 2500000
183
+ train_segments: null
184
+ train_wav_scp: null
185
+ trim_frame_size: 1024
186
+ trim_hop_size: 320
187
+ trim_silence: false
188
+ trim_threshold_in_db: 20
189
+ use_feat_match_loss: true
190
+ use_mel_loss: true
191
+ use_stft_loss: false
192
+ verbose: 1
193
+ version: 0.6.2a
194
+ win_length: null
195
+ window: null
hifigan_hubert_unit_km500.16k_320/checkpoint-800000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c947a9745940990f39799cbee794a3cde8326e2dbc720e7976ccea4675a213d7
3
+ size 1045149425
hifigan_hubert_unit_km500.16k_320/config.yml ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 10240
3
+ batch_size: 32
4
+ config: ./conf/hifigan_hubert.v1.yaml
5
+ dev_dumpdir: dump/V006_SS_max_valid_dev/raw
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_adv_loss_params:
10
+ average_by_discriminators: false
11
+ discriminator_grad_norm: -1
12
+ discriminator_optimizer_params:
13
+ betas:
14
+ - 0.5
15
+ - 0.9
16
+ lr: 0.0002
17
+ weight_decay: 0.0
18
+ discriminator_optimizer_type: Adam
19
+ discriminator_params:
20
+ follow_official_norm: true
21
+ period_discriminator_params:
22
+ bias: true
23
+ channels: 32
24
+ downsample_scales:
25
+ - 3
26
+ - 3
27
+ - 3
28
+ - 3
29
+ - 1
30
+ in_channels: 1
31
+ kernel_sizes:
32
+ - 5
33
+ - 3
34
+ max_downsample_channels: 1024
35
+ nonlinear_activation: LeakyReLU
36
+ nonlinear_activation_params:
37
+ negative_slope: 0.1
38
+ out_channels: 1
39
+ use_spectral_norm: false
40
+ use_weight_norm: true
41
+ periods:
42
+ - 2
43
+ - 3
44
+ - 5
45
+ - 7
46
+ - 11
47
+ scale_discriminator_params:
48
+ bias: true
49
+ channels: 128
50
+ downsample_scales:
51
+ - 4
52
+ - 4
53
+ - 4
54
+ - 4
55
+ - 1
56
+ in_channels: 1
57
+ kernel_sizes:
58
+ - 15
59
+ - 41
60
+ - 5
61
+ - 3
62
+ max_downsample_channels: 1024
63
+ max_groups: 16
64
+ nonlinear_activation: LeakyReLU
65
+ nonlinear_activation_params:
66
+ negative_slope: 0.1
67
+ out_channels: 1
68
+ scale_downsample_pooling: AvgPool1d
69
+ scale_downsample_pooling_params:
70
+ kernel_size: 4
71
+ padding: 2
72
+ stride: 2
73
+ scales: 3
74
+ discriminator_scheduler_params:
75
+ gamma: 0.5
76
+ milestones:
77
+ - 200000
78
+ - 400000
79
+ - 600000
80
+ - 800000
81
+ discriminator_scheduler_type: MultiStepLR
82
+ discriminator_train_start_steps: 0
83
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
84
+ distributed: false
85
+ eval_interval_steps: 1000
86
+ feat_match_loss_params:
87
+ average_by_discriminators: false
88
+ average_by_layers: false
89
+ include_final_outputs: true
90
+ fft_size: null
91
+ fmax: null
92
+ fmin: null
93
+ format: hdf5
94
+ generator_adv_loss_params:
95
+ average_by_discriminators: false
96
+ generator_grad_norm: -1
97
+ generator_optimizer_params:
98
+ betas:
99
+ - 0.5
100
+ - 0.9
101
+ lr: 0.0002
102
+ weight_decay: 0.0
103
+ generator_optimizer_type: Adam
104
+ generator_params:
105
+ bias: true
106
+ channels: 512
107
+ concat_spk_emb: false
108
+ in_channels: 512
109
+ kernel_size: 7
110
+ nonlinear_activation: LeakyReLU
111
+ nonlinear_activation_params:
112
+ negative_slope: 0.1
113
+ num_embs: 500
114
+ num_spk_embs: 128
115
+ out_channels: 1
116
+ resblock_dilations:
117
+ - - 1
118
+ - 3
119
+ - 5
120
+ - - 1
121
+ - 3
122
+ - 5
123
+ - - 1
124
+ - 3
125
+ - 5
126
+ resblock_kernel_sizes:
127
+ - 3
128
+ - 7
129
+ - 11
130
+ spk_emb_dim: 512
131
+ upsample_kernel_sizes:
132
+ - 20
133
+ - 16
134
+ - 4
135
+ - 4
136
+ upsample_scales:
137
+ - 10
138
+ - 8
139
+ - 2
140
+ - 2
141
+ use_additional_convs: true
142
+ use_weight_norm: true
143
+ generator_scheduler_params:
144
+ gamma: 0.5
145
+ milestones:
146
+ - 200000
147
+ - 400000
148
+ - 600000
149
+ - 800000
150
+ generator_scheduler_type: MultiStepLR
151
+ generator_train_start_steps: 1
152
+ generator_type: DiscreteSymbolHiFiGANGenerator
153
+ global_gain_scale: 1.0
154
+ hop_size: 320
155
+ lambda_adv: 1.0
156
+ lambda_aux: 45.0
157
+ lambda_feat_match: 2.0
158
+ log_interval_steps: 100
159
+ mel_loss_params:
160
+ fft_size: 1024
161
+ fmax: 8000
162
+ fmin: 0
163
+ fs: 16000
164
+ hop_size: 256
165
+ log_base: null
166
+ num_mels: 80
167
+ win_length: null
168
+ window: hann
169
+ num_mels: 2
170
+ num_save_intermediate_results: 4
171
+ num_workers: 2
172
+ outdir: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1
173
+ pin_memory: true
174
+ pretrain: ''
175
+ rank: 0
176
+ remove_short_samples: false
177
+ resume: exp/V006_SS_max_valid_train_2000_vctk_hifigan_hubert.v1/checkpoint-104steps.pkl
178
+ sampling_rate: 16000
179
+ save_interval_steps: 50000
180
+ train_dumpdir: dump/V006_SS_max_valid_train_2000/raw
181
+ train_feats_scp: null
182
+ train_max_steps: 2500000
183
+ train_segments: null
184
+ train_wav_scp: null
185
+ trim_frame_size: 1024
186
+ trim_hop_size: 256
187
+ trim_silence: false
188
+ trim_threshold_in_db: 20
189
+ use_feat_match_loss: true
190
+ use_mel_loss: true
191
+ use_stft_loss: false
192
+ verbose: 1
193
+ version: 0.6.2a
194
+ win_length: null
195
+ window: null
hifigan_hubert_unit_km500.16k_320/hifigan_hubert.v1.yaml ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This configuration is based on HiFiGAN V1, derived
2
+ # from official repository (https://github.com/jik876/hifi-gan).
3
+
4
+ ###########################################################
5
+ # FEATURE EXTRACTION SETTING #
6
+ ###########################################################
7
+ sampling_rate: 16000 # Sampling rate.
8
+ fft_size: null # FFT size.
9
+ hop_size: 320 # Hop size.
10
+ win_length: null # Window length.
11
+ # If set to null, it will be the same as fft_size.
12
+ window: null # Window function.
13
+ num_mels: 2 # Number of mel basis.
14
+ fmin: null # Minimum freq in mel basis calculation.
15
+ fmax: null # Maximum frequency in mel basis calculation.
16
+ global_gain_scale: 1.0 # Will be multiplied to all of waveform.
17
+ trim_silence: false # Whether to trim the start and end of silence.
18
+ trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good.
19
+ trim_frame_size: 1024 # Frame size in trimming.
20
+ trim_hop_size: 256 # Hop size in trimming.
21
+ format: "hdf5" # Feature file format. "npy" or "hdf5" is supported.
22
+
23
+ ###########################################################
24
+ # GENERATOR NETWORK ARCHITECTURE SETTING #
25
+ ###########################################################
26
+ generator_type: DiscreteSymbolHiFiGANGenerator
27
+ generator_params:
28
+ in_channels: 512 # Number of input channels.
29
+ out_channels: 1 # Number of output channels.
30
+ channels: 512 # Number of initial channels.
31
+ num_embs: 500
32
+ num_spk_embs: 128
33
+ spk_emb_dim: 512
34
+ concat_spk_emb: false
35
+ kernel_size: 7 # Kernel size of initial and final conv layers.
36
+ upsample_scales: [10, 8, 2, 2] # Upsampling scales.
37
+ upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
38
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
39
+ resblock_dilations: # Dilations for residual blocks.
40
+ - [1, 3, 5]
41
+ - [1, 3, 5]
42
+ - [1, 3, 5]
43
+ use_additional_convs: true # Whether to use additional conv layer in residual blocks.
44
+ bias: true # Whether to use bias parameter in conv.
45
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation type.
46
+ nonlinear_activation_params: # Nonlinear activation parameters.
47
+ negative_slope: 0.1
48
+ use_weight_norm: true # Whether to apply weight normalization.
49
+
50
+ ###########################################################
51
+ # DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
52
+ ###########################################################
53
+ discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator
54
+ discriminator_params:
55
+ scales: 3 # Number of multi-scale discriminator.
56
+ scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator.
57
+ scale_downsample_pooling_params:
58
+ kernel_size: 4 # Pooling kernel size.
59
+ stride: 2 # Pooling stride.
60
+ padding: 2 # Padding size.
61
+ scale_discriminator_params:
62
+ in_channels: 1 # Number of input channels.
63
+ out_channels: 1 # Number of output channels.
64
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
65
+ channels: 128 # Initial number of channels.
66
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
67
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
68
+ bias: true
69
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
70
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
71
+ nonlinear_activation_params:
72
+ negative_slope: 0.1
73
+ follow_official_norm: true # Whether to follow the official norm setting.
74
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
75
+ period_discriminator_params:
76
+ in_channels: 1 # Number of input channels.
77
+ out_channels: 1 # Number of output channels.
78
+ kernel_sizes: [5, 3] # List of kernel sizes.
79
+ channels: 32 # Initial number of channels.
80
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
81
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
82
+ bias: true # Whether to use bias parameter in conv layer.
83
+ nonlinear_activation: "LeakyReLU" # Nonlinear activation.
84
+ nonlinear_activation_params: # Nonlinear activation parameters.
85
+ negative_slope: 0.1
86
+ use_weight_norm: true # Whether to apply weight normalization.
87
+ use_spectral_norm: false # Whether to apply spectral normalization.
88
+
89
+ ###########################################################
90
+ # STFT LOSS SETTING #
91
+ ###########################################################
92
+ use_stft_loss: false # Whether to use multi-resolution STFT loss.
93
+ use_mel_loss: true # Whether to use Mel-spectrogram loss.
94
+ mel_loss_params: # Mel-spectrogram loss parameters.
95
+ fs: 16000
96
+ fft_size: 1024
97
+ hop_size: 256
98
+ win_length: null
99
+ window: "hann"
100
+ num_mels: 80
101
+ fmin: 0
102
+ fmax: 8000
103
+ log_base: null # Log base. If set to null, use natural logarithm.
104
+ generator_adv_loss_params:
105
+ average_by_discriminators: false # Whether to average loss by #discriminators.
106
+ discriminator_adv_loss_params:
107
+ average_by_discriminators: false # Whether to average loss by #discriminators.
108
+ use_feat_match_loss: true
109
+ feat_match_loss_params:
110
+ average_by_discriminators: false # Whether to average loss by #discriminators.
111
+ average_by_layers: false # Whether to average loss by #layers in each discriminator.
112
+ include_final_outputs: true # Whether to include final outputs in feat match loss calculation.
113
+
114
+ ###########################################################
115
+ # ADVERSARIAL LOSS SETTING #
116
+ ###########################################################
117
+ lambda_aux: 45.0 # Loss balancing coefficient for the auxiliary (STFT / Mel-spectrogram) loss.
118
+ lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
119
+ lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
120
+
121
+ ###########################################################
122
+ # DATA LOADER SETTING #
123
+ ###########################################################
124
+ batch_size: 32 # Batch size.
125
+ batch_max_steps: 10240 # Length of each audio in batch. Make sure it is divisible by hop_size.
126
+ pin_memory: true # Whether to pin memory in Pytorch DataLoader.
127
+ num_workers: 2 # Number of workers in Pytorch DataLoader.
128
+ remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
129
+ allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory.
130
+
131
+ ###########################################################
132
+ # OPTIMIZER & SCHEDULER SETTING #
133
+ ###########################################################
134
+ generator_optimizer_type: Adam
135
+ generator_optimizer_params:
136
+ lr: 2.0e-4
137
+ betas: [0.5, 0.9]
138
+ weight_decay: 0.0
139
+ generator_scheduler_type: MultiStepLR
140
+ generator_scheduler_params:
141
+ gamma: 0.5
142
+ milestones:
143
+ - 200000
144
+ - 400000
145
+ - 600000
146
+ - 800000
147
+ generator_grad_norm: -1
148
+ discriminator_optimizer_type: Adam
149
+ discriminator_optimizer_params:
150
+ lr: 2.0e-4
151
+ betas: [0.5, 0.9]
152
+ weight_decay: 0.0
153
+ discriminator_scheduler_type: MultiStepLR
154
+ discriminator_scheduler_params:
155
+ gamma: 0.5
156
+ milestones:
157
+ - 200000
158
+ - 400000
159
+ - 600000
160
+ - 800000
161
+ discriminator_grad_norm: -1
162
+
163
+ ###########################################################
164
+ # INTERVAL SETTING #
165
+ ###########################################################
166
+ generator_train_start_steps: 1 # Number of steps to start to train generator.
167
+ discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
168
+ train_max_steps: 2500000 # Number of training steps.
169
+ save_interval_steps: 50000 # Interval steps to save checkpoint.
170
+ eval_interval_steps: 1000 # Interval steps to evaluate the network.
171
+ log_interval_steps: 100 # Interval steps to record the training log.
172
+
173
+ ###########################################################
174
+ # OTHER SETTING #
175
+ ###########################################################
176
+ num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
ppg_sxliu_decoder_V006/checkpoint-38000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e10c9f73d33dd39fe7ef3f52f6fd513fdfa621c933b1abeeb901d5ee0b857af9
3
+ size 339924234
ppg_sxliu_decoder_V006/config.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ additional_config: null
2
+ allow_cache: true
3
+ batch_size: 6
4
+ config: conf/taco2_ar_V006_S1.yaml
5
+ dev_scp: data/V006_S1_max_valid_dev/wav.scp
6
+ dev_spemb_scp: null
7
+ distributed: false
8
+ eval_interval_steps: 1000
9
+ fft_size: 1024
10
+ fmax: 7600
11
+ fmin: 80
12
+ global_gain_scale: 1.0
13
+ grad_norm: 1.0
14
+ hop_size: 256
15
+ init_checkpoint: ''
16
+ log_interval_steps: 100
17
+ main_loss_type: L1Loss
18
+ model_params:
19
+ ar: true
20
+ encoder_type: taco2
21
+ hidden_dim: 1024
22
+ lstmp_dropout_rate: 0.2
23
+ lstmp_layernorm: false
24
+ lstmp_layers: 2
25
+ lstmp_proj_dim: 256
26
+ prenet_dim: 256
27
+ prenet_dropout_rate: 0.5
28
+ prenet_layers: 2
29
+ model_type: Taco2_AR
30
+ num_mels: 80
31
+ num_save_intermediate_results: 4
32
+ num_workers: 2
33
+ optimizer_params:
34
+ lr: 0.0001
35
+ optimizer_type: AdamW
36
+ outdir: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1
37
+ pin_memory: true
38
+ rank: 0
39
+ resume: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1/checkpoint-10000steps.pkl
40
+ sampling_rate: 16000
41
+ save_interval_steps: 1000
42
+ scheduler: linear_schedule_with_warmup
43
+ scheduler_params:
44
+ num_warmup_steps: 4000
45
+ train_max_steps: 100000
46
+ train_scp: data/V006_S1_max_valid_train/wav.scp
47
+ train_spemb_scp: null
48
+ trg_stats: exp/V006_S1_max_valid_ppg_sxliu_taco2_ar_V006_S1/stats.h5
49
+ trim_frame_size: 2048
50
+ trim_hop_size: 512
51
+ trim_silence: false
52
+ trim_threshold_in_db: 60
53
+ upstream: ppg_sxliu
54
+ verbose: 1
55
+ version: 0.3.0
56
+ vocoder:
57
+ checkpoint: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/checkpoint-400000steps.pkl
58
+ config: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/config.yml
59
+ stats: /home/kevingenghaopeng/vocoder/ParallelWaveGAN/egs/V006/voc1/exp/train_nodev_parallel_wavegan.v1/stats.h5
60
+ win_length: null
61
+ window: hann
ppg_sxliu_decoder_V006/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ff1d3f33879137cf5f6d8d2d9ad6f1db5b1a5ff2a8093e52969976759152bee
3
+ size 4736
pwg.16k_256/checkpoint-400000steps.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76381a7fdbb799e183d5db949a0eee7fa6f97a3f47412f9ae1eaed05fcd915f
3
+ size 17668782
pwg.16k_256/config.yml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allow_cache: true
2
+ batch_max_steps: 25600
3
+ batch_size: 6
4
+ config: conf/parallel_wavegan.v1.yaml
5
+ dev_dumpdir: dump/dev/norm
6
+ dev_feats_scp: null
7
+ dev_segments: null
8
+ dev_wav_scp: null
9
+ discriminator_grad_norm: 1
10
+ discriminator_optimizer_params:
11
+ eps: 1.0e-06
12
+ lr: 5.0e-05
13
+ weight_decay: 0.0
14
+ discriminator_params:
15
+ bias: true
16
+ conv_channels: 64
17
+ in_channels: 1
18
+ kernel_size: 3
19
+ layers: 10
20
+ nonlinear_activation: LeakyReLU
21
+ nonlinear_activation_params:
22
+ negative_slope: 0.2
23
+ out_channels: 1
24
+ use_weight_norm: true
25
+ discriminator_scheduler_params:
26
+ gamma: 0.5
27
+ step_size: 200000
28
+ discriminator_train_start_steps: 100000
29
+ distributed: false
30
+ eval_interval_steps: 1000
31
+ fft_size: 1024
32
+ fmax: 7600
33
+ fmin: 80
34
+ format: hdf5
35
+ generator_grad_norm: 10
36
+ generator_optimizer_params:
37
+ eps: 1.0e-06
38
+ lr: 0.0001
39
+ weight_decay: 0.0
40
+ generator_params:
41
+ aux_channels: 80
42
+ aux_context_window: 2
43
+ dropout: 0.0
44
+ gate_channels: 128
45
+ in_channels: 1
46
+ kernel_size: 3
47
+ layers: 30
48
+ out_channels: 1
49
+ residual_channels: 64
50
+ skip_channels: 64
51
+ stacks: 3
52
+ upsample_net: ConvInUpsampleNetwork
53
+ upsample_params:
54
+ upsample_scales:
55
+ - 4
56
+ - 4
57
+ - 4
58
+ - 4
59
+ use_weight_norm: true
60
+ generator_scheduler_params:
61
+ gamma: 0.5
62
+ step_size: 200000
63
+ global_gain_scale: 1.0
64
+ hop_size: 256
65
+ lambda_adv: 4.0
66
+ log_interval_steps: 100
67
+ num_mels: 80
68
+ num_save_intermediate_results: 4
69
+ num_workers: 2
70
+ outdir: exp/train_nodev_parallel_wavegan.v1
71
+ pin_memory: true
72
+ pretrain: ''
73
+ rank: 0
74
+ remove_short_samples: true
75
+ resume: ''
76
+ sampling_rate: 16000
77
+ save_interval_steps: 5000
78
+ stft_loss_params:
79
+ fft_sizes:
80
+ - 1024
81
+ - 2048
82
+ - 512
83
+ hop_sizes:
84
+ - 120
85
+ - 240
86
+ - 50
87
+ win_lengths:
88
+ - 600
89
+ - 1200
90
+ - 240
91
+ window: hann_window
92
+ train_dumpdir: dump/train_nodev/norm
93
+ train_feats_scp: null
94
+ train_max_steps: 400000
95
+ train_segments: null
96
+ train_wav_scp: null
97
+ trim_frame_size: 2048
98
+ trim_hop_size: 512
99
+ trim_silence: false
100
+ trim_threshold_in_db: 60
101
+ verbose: 1
102
+ version: 0.6.2a
103
+ win_length: null
104
+ window: hann
pwg.16k_256/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d73ba94507a1a33eb4bd34180cd98871b88b106c489adec76c41d67104284a69
3
+ size 4736
s3prl-vc-ppg_sxliu/checkpoint-50000steps.pkl ADDED
@@ -0,0 +1 @@
 
 
1
+ ../../../../../../../.cache/huggingface/hub/models--unilight--accent-conversion-2023/blobs/f5fd4b70e8739d1822a1d3491fdf1f4c4d7ae44f8c1a902cba5510079f44ec5e
s3prl-vc-ppg_sxliu/config.yml ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ additional_config: null
2
+ allow_cache: true
3
+ batch_size: 16
4
+ config: conf/taco2_ar.yaml
5
+ dev_scp: data/TXHC_dev/wav.scp
6
+ dev_spemb_scp: null
7
+ distributed: false
8
+ eval_interval_steps: 1000
9
+ fft_size: 1024
10
+ fmax: 7600
11
+ fmin: 80
12
+ global_gain_scale: 1.0
13
+ grad_norm: 1.0
14
+ hop_size: 256
15
+ init_checkpoint: ''
16
+ log_interval_steps: 100
17
+ main_loss_type: L1Loss
18
+ model_params:
19
+ ar: true
20
+ encoder_type: taco2
21
+ hidden_dim: 1024
22
+ lstmp_dropout_rate: 0.2
23
+ lstmp_layernorm: false
24
+ lstmp_layers: 2
25
+ lstmp_proj_dim: 256
26
+ prenet_dim: 256
27
+ prenet_dropout_rate: 0.5
28
+ prenet_layers: 2
29
+ model_type: Taco2_AR
30
+ num_mels: 80
31
+ num_save_intermediate_results: 4
32
+ num_workers: 2
33
+ optimizer_params:
34
+ lr: 0.0001
35
+ optimizer_type: AdamW
36
+ outdir: exp/TXHC_ppg_sxliu_taco2_ar
37
+ pin_memory: true
38
+ rank: 0
39
+ resume: ''
40
+ sampling_rate: 16000
41
+ save_interval_steps: 1000
42
+ scheduler: linear_schedule_with_warmup
43
+ scheduler_params:
44
+ num_warmup_steps: 4000
45
+ train_max_steps: 50000
46
+ train_scp: data/TXHC_train_1032/wav.scp
47
+ train_spemb_scp: null
48
+ trg_stats: exp/TXHC_ppg_sxliu_taco2_ar/stats.h5
49
+ trim_frame_size: 2048
50
+ trim_hop_size: 512
51
+ trim_silence: false
52
+ trim_threshold_in_db: 60
53
+ upstream: ppg_sxliu
54
+ verbose: 1
55
+ version: 0.2.0
56
+ vocoder:
57
+ checkpoint: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/checkpoint-105000steps.pkl
58
+ config: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/config.yml
59
+ stats: /data/group1/z44476r/Experiments/ParallelWaveGAN/egs/l2-arctic/voc1/exp/train_nodev_TXHC_parallel_wavegan.v1/stats.h5
60
+ win_length: null
61
+ window: hann
s3prl-vc-ppg_sxliu/stats.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2494f218f2758b6c6c4cd7252970647e8e153bbd31bcdf122030092036f6ef7a
3
+ size 4736