TangRain committed on
Commit
239851c
·
1 Parent(s): 1c76fbc

add pretrained model

Browse files
Files changed (22) hide show
  1. README.md +15 -0
  2. exp/svs_stats_raw_phn_none_zh/train/feats_stats.npz +3 -0
  3. exp/svs_stats_raw_phn_none_zh/train/pitch_stats.npz +3 -0
  4. exp/svs_train_discrete_acoustic_raw_phn_none_zh/49epoch.pth +3 -0
  5. exp/svs_train_discrete_acoustic_raw_phn_none_zh/config.yaml +304 -0
  6. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/acc.png +0 -0
  7. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/backward_time.png +0 -0
  8. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/clip.png +0 -0
  9. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/decoder_alpha.png +0 -0
  10. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/duration_loss.png +0 -0
  11. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/encoder_alpha.png +0 -0
  12. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/forward_time.png +0 -0
  13. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/gpu_max_cached_mem_GB.png +0 -0
  14. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/grad_norm.png +0 -0
  15. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/iter_time.png +0 -0
  16. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/loss.png +0 -0
  17. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/loss_scale.png +0 -0
  18. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/optim0_lr0.png +0 -0
  19. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/optim_step_time.png +0 -0
  20. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/out_loss.png +0 -0
  21. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/pitch_loss.png +0 -0
  22. exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/train_time.png +0 -0
README.md CHANGED
@@ -1,3 +1,18 @@
1
  ---
 
 
 
 
 
 
 
2
  license: cc-by-4.0
3
  ---
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: zh
7
+ datasets:
8
+ - opencpop
9
  license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/opencpop2_svs2_toksing_pretrain`
15
+
16
+ This model was trained by TangRain using the opencpop recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ The model was trained for 300 epochs, and the checkpoint **with the best performance based on validation loss** was selected.
exp/svs_stats_raw_phn_none_zh/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45c819a05078a3ba122477fdbd6d60394a519fb9fd1116e75dbe19531b9251ab
3
+ size 1402
exp/svs_stats_raw_phn_none_zh/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c62642bb93923108eb02f125bd653d06816bdfd4f6d66091468552360e6a01c6
3
+ size 770
exp/svs_train_discrete_acoustic_raw_phn_none_zh/49epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51065dfe6873cb5078ac43aabc8160789057645454af1bb463e3a5af05722df4
3
+ size 281016849
exp/svs_train_discrete_acoustic_raw_phn_none_zh/config.yaml ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_discrete_acoustic.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_discrete_acoustic_raw_phn_none_zh
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 10
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 300
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ - - train
43
+ - loss
44
+ - min
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_lora: false
67
+ save_lora_only: true
68
+ lora_conf: {}
69
+ pretrain_path: null
70
+ init_param: []
71
+ ignore_init_mismatch: false
72
+ freeze_param: []
73
+ num_iters_per_epoch: null
74
+ batch_size: 32
75
+ valid_batch_size: null
76
+ batch_bins: 1000000
77
+ valid_batch_bins: null
78
+ train_shape_file:
79
+ - exp/svs_stats_raw_phn_none_zh/train/text_shape.phn
80
+ - exp/svs_stats_raw_phn_none_zh/train/singing_shape
81
+ valid_shape_file:
82
+ - exp/svs_stats_raw_phn_none_zh/valid/text_shape.phn
83
+ - exp/svs_stats_raw_phn_none_zh/valid/singing_shape
84
+ batch_type: sorted
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 150
88
+ - 256000
89
+ sort_in_batch: descending
90
+ shuffle_within_batch: false
91
+ sort_batch: descending
92
+ multiple_iterator: false
93
+ chunk_length: 500
94
+ chunk_shift_ratio: 0.5
95
+ num_cache_chunks: 1024
96
+ chunk_excluded_key_prefixes: []
97
+ train_data_path_and_name_and_type:
98
+ - - dump/raw/tr_no_dev/text
99
+ - text
100
+ - text
101
+ - - dump/raw/tr_no_dev/wav.scp
102
+ - singing
103
+ - sound
104
+ - - dump/raw/tr_no_dev/label
105
+ - label
106
+ - duration
107
+ - - dump/raw/tr_no_dev/score.scp
108
+ - score
109
+ - score
110
+ - - dump/raw/tr_no_dev/token_multi_wavlm_large_6+wavlm_large_23+hubert_large_6
111
+ - discrete_token
112
+ - text_int
113
+ valid_data_path_and_name_and_type:
114
+ - - dump/raw/dev/text
115
+ - text
116
+ - text
117
+ - - dump/raw/dev/wav.scp
118
+ - singing
119
+ - sound
120
+ - - dump/raw/dev/label
121
+ - label
122
+ - duration
123
+ - - dump/raw/dev/score.scp
124
+ - score
125
+ - score
126
+ - - dump/raw/dev/token_multi_wavlm_large_6+wavlm_large_23+hubert_large_6
127
+ - discrete_token
128
+ - text_int
129
+ allow_variable_data_keys: false
130
+ max_cache_size: 0.0
131
+ max_cache_fd: 32
132
+ allow_multi_rates: false
133
+ valid_max_cache_size: null
134
+ exclude_weight_decay: false
135
+ exclude_weight_decay_conf: {}
136
+ optim: adam
137
+ optim_conf:
138
+ lr: 0.001
139
+ eps: 1.0e-06
140
+ weight_decay: 0.0
141
+ scheduler: null
142
+ scheduler_conf: {}
143
+ token_list:
144
+ - <blank>
145
+ - <unk>
146
+ - SP
147
+ - i
148
+ - AP
149
+ - e
150
+ - y
151
+ - d
152
+ - w
153
+ - sh
154
+ - ai
155
+ - n
156
+ - x
157
+ - j
158
+ - ian
159
+ - u
160
+ - l
161
+ - h
162
+ - b
163
+ - o
164
+ - zh
165
+ - an
166
+ - ou
167
+ - m
168
+ - q
169
+ - z
170
+ - en
171
+ - g
172
+ - ing
173
+ - ei
174
+ - ao
175
+ - ang
176
+ - uo
177
+ - eng
178
+ - t
179
+ - a
180
+ - ong
181
+ - ui
182
+ - k
183
+ - f
184
+ - r
185
+ - iang
186
+ - ch
187
+ - v
188
+ - in
189
+ - iao
190
+ - ie
191
+ - iu
192
+ - c
193
+ - s
194
+ - van
195
+ - p
196
+ - ve
197
+ - uan
198
+ - uang
199
+ - ia
200
+ - ua
201
+ - uai
202
+ - un
203
+ - er
204
+ - vn
205
+ - iong
206
+ - <sos/eos>
207
+ odim: null
208
+ model_conf: {}
209
+ use_preprocessor: true
210
+ token_type: phn
211
+ bpemodel: null
212
+ non_linguistic_symbols: null
213
+ cleaner: null
214
+ g2p: null
215
+ fs: 16000
216
+ discrete_token_layers: 3
217
+ nclusters: 128
218
+ score_feats_extract: syllable_score_feats
219
+ score_feats_extract_conf:
220
+ fs: 16000
221
+ n_fft: 2048
222
+ win_length: 1200
223
+ hop_length: 320
224
+ feats_extract: fbank
225
+ feats_extract_conf:
226
+ n_fft: 2048
227
+ hop_length: 320
228
+ win_length: 1200
229
+ fs: 16000
230
+ fmin: 80
231
+ fmax: 7600
232
+ n_mels: 80
233
+ normalize: global_mvn
234
+ normalize_conf:
235
+ stats_file: exp/svs_stats_raw_phn_none_zh/train/feats_stats.npz
236
+ svs: discrete_acoustic
237
+ svs_conf:
238
+ midi_dim: 129
239
+ duration_dim: 512
240
+ adim: 384
241
+ aheads: 4
242
+ elayers: 6
243
+ eunits: 1536
244
+ dlayers: 6
245
+ dunits: 1536
246
+ postnet_layers: 0
247
+ postnet_chans: 512
248
+ postnet_filts: 5
249
+ postnet_dropout_rate: 0.5
250
+ use_batch_norm: true
251
+ reduction_factor: 1
252
+ global_channels: -1
253
+ text_encoder_attention_heads: 2
254
+ text_encoder_ffn_expand: 4
255
+ text_encoder_blocks: 6
256
+ text_encoder_positionwise_layer_type: conv1d
257
+ text_encoder_positionwise_conv_kernel_size: 3
258
+ text_encoder_positional_encoding_layer_type: rel_pos
259
+ text_encoder_self_attention_layer_type: rel_selfattn
260
+ text_encoder_activation_type: swish
261
+ text_encoder_normalize_before: true
262
+ text_encoder_dropout_rate: 0.1
263
+ text_encoder_positional_dropout_rate: 0.0
264
+ text_encoder_attention_dropout_rate: 0.1
265
+ use_macaron_style_in_text_encoder: true
266
+ use_conformer_conv_in_text_encoder: false
267
+ text_encoder_conformer_kernel_size: -1
268
+ init_type: pytorch
269
+ use_masking: true
270
+ loss_function: FastSpeech1
271
+ loss_type: L1
272
+ lambda_out: 1
273
+ lambda_dur: 1
274
+ lambda_pitch: 1
275
+ lambda_vuv: 0.01
276
+ use_discrete_token: true
277
+ predict_pitch: true
278
+ codec_codebook: 0
279
+ pitch_extract: dio
280
+ pitch_extract_conf:
281
+ use_token_averaged_f0: false
282
+ use_log_f0: true
283
+ fs: 16000
284
+ n_fft: 2048
285
+ hop_length: 320
286
+ f0max: 800
287
+ f0min: 80
288
+ reduction_factor: 1
289
+ pitch_normalize: null
290
+ pitch_normalize_conf:
291
+ stats_file: exp/svs_stats_raw_phn_none_zh/train/pitch_stats.npz
292
+ ying_extract: null
293
+ ying_extract_conf: {}
294
+ energy_extract: null
295
+ energy_extract_conf: {}
296
+ energy_normalize: null
297
+ energy_normalize_conf: {}
298
+ model_type: discrete_svs
299
+ model_type_conf: {}
300
+ required:
301
+ - output_dir
302
+ - token_list
303
+ version: '202310'
304
+ distributed: false
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/acc.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/backward_time.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/clip.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/decoder_alpha.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/duration_loss.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/encoder_alpha.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/forward_time.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/grad_norm.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/iter_time.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/loss.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/loss_scale.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/optim0_lr0.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/optim_step_time.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/out_loss.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/pitch_loss.png ADDED
exp/svs_train_discrete_acoustic_raw_phn_none_zh/images/train_time.png ADDED