Thanapat Trachu commited on
Commit
560a46d
·
1 Parent(s): a776bbf

Update model

Browse files
Files changed (32) hide show
  1. README.md +517 -0
  2. dump/xvector/dev/spk_xvector.ark +3 -0
  3. dump/xvector/dev/spk_xvector.scp +3 -0
  4. dump/xvector/eval/spk_xvector.ark +3 -0
  5. dump/xvector/eval/spk_xvector.scp +3 -0
  6. dump/xvector/tr_no_dev/spk_xvector.ark +3 -0
  7. dump/xvector/tr_no_dev/spk_xvector.scp +3 -0
  8. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/785epoch.pth +3 -0
  9. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/config.yaml +434 -0
  10. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_backward_time.png +0 -0
  11. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_fake_loss.png +0 -0
  12. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_forward_time.png +0 -0
  13. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_loss.png +0 -0
  14. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_optim_step_time.png +0 -0
  15. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_real_loss.png +0 -0
  16. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_train_time.png +0 -0
  17. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_adv_loss.png +0 -0
  18. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_backward_time.png +0 -0
  19. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_dur_loss.png +0 -0
  20. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_feat_match_loss.png +0 -0
  21. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_forward_time.png +0 -0
  22. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_kl_loss.png +0 -0
  23. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_loss.png +0 -0
  24. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_mel_loss.png +0 -0
  25. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_optim_step_time.png +0 -0
  26. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_train_time.png +0 -0
  27. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/gpu_max_cached_mem_GB.png +0 -0
  28. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/iter_time.png +0 -0
  29. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/optim0_lr0.png +0 -0
  30. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/optim1_lr0.png +0 -0
  31. exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/train_time.png +0 -0
  32. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,517 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - emilia_full
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `NewGame/Emilia-vits-espnet2`
15
+
16
+ This model was trained by NewGamezzz using the emilia_full recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 7d46ef980eb56d14c23201d26ff4eceec97d0b3a
26
+ pip install -e .
27
+ cd egs2/emilia_full/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model NewGame/Emilia-vits-espnet2
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_xvector_vits_large_grad_clip_original_batch_size.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en
46
+ ngpu: 1
47
+ seed: 777
48
+ num_workers: 0
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: 4
53
+ dist_rank: 0
54
+ local_rank: 0
55
+ dist_master_addr: localhost
56
+ dist_master_port: 53813
57
+ dist_launcher: null
58
+ multiprocessing_distributed: true
59
+ unused_parameters: true
60
+ sharded_ddp: false
61
+ use_deepspeed: false
62
+ deepspeed_config: null
63
+ gradient_as_bucket_view: true
64
+ ddp_comm_hook: null
65
+ cudnn_enabled: true
66
+ cudnn_benchmark: false
67
+ cudnn_deterministic: false
68
+ use_tf32: false
69
+ collect_stats: false
70
+ write_collected_feats: false
71
+ max_epoch: 800
72
+ patience: null
73
+ val_scheduler_criterion:
74
+ - valid
75
+ - loss
76
+ early_stopping_criterion:
77
+ - valid
78
+ - loss
79
+ - min
80
+ best_model_criterion:
81
+ - - train
82
+ - total_count
83
+ - max
84
+ keep_nbest_models: 10
85
+ nbest_averaging_interval: 0
86
+ grad_clip: 1.0
87
+ grad_clip_type: 2.0
88
+ grad_noise: false
89
+ accum_grad: 1
90
+ no_forward_run: false
91
+ resume: true
92
+ train_dtype: float32
93
+ use_amp: false
94
+ log_interval: 50
95
+ use_matplotlib: true
96
+ use_tensorboard: true
97
+ create_graph_in_tensorboard: false
98
+ use_wandb: false
99
+ wandb_project: null
100
+ wandb_id: null
101
+ wandb_entity: null
102
+ wandb_name: null
103
+ wandb_model_log_interval: -1
104
+ detect_anomaly: false
105
+ use_adapter: false
106
+ adapter: lora
107
+ save_strategy: all
108
+ adapter_conf: {}
109
+ pretrain_path: null
110
+ init_param: []
111
+ ignore_init_mismatch: false
112
+ freeze_param: []
113
+ num_iters_per_epoch: 1000
114
+ batch_size: 20
115
+ valid_batch_size: null
116
+ batch_bins: 8000000
117
+ valid_batch_bins: null
118
+ category_sample_size: 10
119
+ upsampling_factor: 0.5
120
+ category_upsampling_factor: 0.5
121
+ dataset_upsampling_factor: 0.5
122
+ dataset_scaling_factor: 1.2
123
+ max_batch_size: null
124
+ min_batch_size: 1
125
+ train_shape_file:
126
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/text_shape.phn
127
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/speech_shape
128
+ valid_shape_file:
129
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/text_shape.phn
130
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/speech_shape
131
+ batch_type: numel
132
+ valid_batch_type: null
133
+ fold_length:
134
+ - 150
135
+ - 204800
136
+ sort_in_batch: descending
137
+ shuffle_within_batch: false
138
+ sort_batch: descending
139
+ multiple_iterator: false
140
+ chunk_length: 500
141
+ chunk_shift_ratio: 0.5
142
+ num_cache_chunks: 1024
143
+ chunk_excluded_key_prefixes: []
144
+ chunk_default_fs: null
145
+ chunk_max_abs_length: null
146
+ chunk_discard_short_samples: true
147
+ train_data_path_and_name_and_type:
148
+ - - dump/raw/tr_no_dev/text
149
+ - text
150
+ - text
151
+ - - dump/raw/tr_no_dev/wav.scp
152
+ - speech
153
+ - sound
154
+ - - dump/xvector/tr_no_dev/xvector.scp
155
+ - spembs
156
+ - kaldi_ark
157
+ valid_data_path_and_name_and_type:
158
+ - - dump/raw/dev/text
159
+ - text
160
+ - text
161
+ - - dump/raw/dev/wav.scp
162
+ - speech
163
+ - sound
164
+ - - dump/xvector/dev/xvector.scp
165
+ - spembs
166
+ - kaldi_ark
167
+ multi_task_dataset: false
168
+ allow_variable_data_keys: false
169
+ max_cache_size: 0.0
170
+ max_cache_fd: 32
171
+ allow_multi_rates: false
172
+ valid_max_cache_size: null
173
+ exclude_weight_decay: false
174
+ exclude_weight_decay_conf: {}
175
+ optim: adamw
176
+ optim_conf:
177
+ lr: 0.0002
178
+ betas:
179
+ - 0.8
180
+ - 0.99
181
+ eps: 1.0e-09
182
+ weight_decay: 0.0
183
+ scheduler: exponentiallr
184
+ scheduler_conf:
185
+ gamma: 0.999
186
+ optim2: adamw
187
+ optim2_conf:
188
+ lr: 0.0002
189
+ betas:
190
+ - 0.8
191
+ - 0.99
192
+ eps: 1.0e-09
193
+ weight_decay: 0.0
194
+ scheduler2: exponentiallr
195
+ scheduler2_conf:
196
+ gamma: 0.999
197
+ generator_first: false
198
+ skip_discriminator_prob: 0.0
199
+ token_list:
200
+ - <blank>
201
+ - <unk>
202
+ - ' '
203
+ - T
204
+ - AH0
205
+ - N
206
+ - S
207
+ - R
208
+ - D
209
+ - L
210
+ - K
211
+ - IH1
212
+ - DH
213
+ - M
214
+ - Z
215
+ - EH1
216
+ - AH1
217
+ - AE1
218
+ - IH0
219
+ - ','
220
+ - W
221
+ - UW1
222
+ - AY1
223
+ - IY1
224
+ - .
225
+ - P
226
+ - V
227
+ - B
228
+ - AA1
229
+ - F
230
+ - ER0
231
+ - IY0
232
+ - EY1
233
+ - AO1
234
+ - OW1
235
+ - HH
236
+ - Y
237
+ - NG
238
+ - G
239
+ - SH
240
+ - AW1
241
+ - JH
242
+ - TH
243
+ - CH
244
+ - ER1
245
+ - UH1
246
+ - '?'
247
+ - OW0
248
+ - EH2
249
+ - IH2
250
+ - EY2
251
+ - AY2
252
+ - OW2
253
+ - AA2
254
+ - AE2
255
+ - UW0
256
+ - EH0
257
+ - OY1
258
+ - AH2
259
+ - AO2
260
+ - AE0
261
+ - ZH
262
+ - AY0
263
+ - UW2
264
+ - AA0
265
+ - IY2
266
+ - AO0
267
+ - '...'
268
+ - AW2
269
+ - '!'
270
+ - EY0
271
+ - UH2
272
+ - ER2
273
+ - ''''
274
+ - AW0
275
+ - UH0
276
+ - OY2
277
+ - OY0
278
+ - '... ...'
279
+ - . ...
280
+ - . .
281
+ - ..
282
+ - . . .
283
+ - . . . .
284
+ - . . . . .
285
+ - '... .'
286
+ - <sos/eos>
287
+ odim: null
288
+ model_conf: {}
289
+ use_preprocessor: true
290
+ token_type: phn
291
+ bpemodel: null
292
+ non_linguistic_symbols: null
293
+ cleaner: tacotron
294
+ g2p: g2p_en
295
+ feats_extract: linear_spectrogram
296
+ feats_extract_conf:
297
+ n_fft: 1024
298
+ hop_length: 256
299
+ win_length: null
300
+ normalize: null
301
+ normalize_conf: {}
302
+ tts: vits
303
+ tts_conf:
304
+ generator_type: vits_generator
305
+ generator_params:
306
+ hidden_channels: 256
307
+ spks: -1
308
+ spk_embed_dim: 192
309
+ global_channels: 256
310
+ segment_size: 32
311
+ text_encoder_attention_heads: 4
312
+ text_encoder_ffn_expand: 4
313
+ text_encoder_blocks: 8
314
+ text_encoder_positionwise_layer_type: conv1d
315
+ text_encoder_positionwise_conv_kernel_size: 3
316
+ text_encoder_positional_encoding_layer_type: rel_pos
317
+ text_encoder_self_attention_layer_type: rel_selfattn
318
+ text_encoder_activation_type: swish
319
+ text_encoder_normalize_before: true
320
+ text_encoder_dropout_rate: 0.1
321
+ text_encoder_positional_dropout_rate: 0.0
322
+ text_encoder_attention_dropout_rate: 0.1
323
+ use_macaron_style_in_text_encoder: true
324
+ use_conformer_conv_in_text_encoder: false
325
+ text_encoder_conformer_kernel_size: -1
326
+ decoder_kernel_size: 7
327
+ decoder_channels: 768
328
+ decoder_upsample_scales:
329
+ - 8
330
+ - 8
331
+ - 2
332
+ - 2
333
+ decoder_upsample_kernel_sizes:
334
+ - 16
335
+ - 16
336
+ - 4
337
+ - 4
338
+ decoder_resblock_kernel_sizes:
339
+ - 3
340
+ - 7
341
+ - 11
342
+ decoder_resblock_dilations:
343
+ - - 1
344
+ - 3
345
+ - 5
346
+ - - 1
347
+ - 3
348
+ - 5
349
+ - - 1
350
+ - 3
351
+ - 5
352
+ use_weight_norm_in_decoder: true
353
+ posterior_encoder_kernel_size: 5
354
+ posterior_encoder_layers: 16
355
+ posterior_encoder_stacks: 1
356
+ posterior_encoder_base_dilation: 1
357
+ posterior_encoder_dropout_rate: 0.0
358
+ use_weight_norm_in_posterior_encoder: true
359
+ flow_flows: 4
360
+ flow_kernel_size: 5
361
+ flow_base_dilation: 1
362
+ flow_layers: 4
363
+ flow_dropout_rate: 0.0
364
+ use_weight_norm_in_flow: true
365
+ use_only_mean_in_flow: true
366
+ stochastic_duration_predictor_kernel_size: 3
367
+ stochastic_duration_predictor_dropout_rate: 0.5
368
+ stochastic_duration_predictor_flows: 4
369
+ stochastic_duration_predictor_dds_conv_layers: 3
370
+ vocabs: 87
371
+ aux_channels: 513
372
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
373
+ discriminator_params:
374
+ scales: 1
375
+ scale_downsample_pooling: AvgPool1d
376
+ scale_downsample_pooling_params:
377
+ kernel_size: 4
378
+ stride: 2
379
+ padding: 2
380
+ scale_discriminator_params:
381
+ in_channels: 1
382
+ out_channels: 1
383
+ kernel_sizes:
384
+ - 15
385
+ - 41
386
+ - 5
387
+ - 3
388
+ channels: 128
389
+ max_downsample_channels: 1024
390
+ max_groups: 16
391
+ bias: true
392
+ downsample_scales:
393
+ - 2
394
+ - 2
395
+ - 4
396
+ - 4
397
+ - 1
398
+ nonlinear_activation: LeakyReLU
399
+ nonlinear_activation_params:
400
+ negative_slope: 0.1
401
+ use_weight_norm: true
402
+ use_spectral_norm: false
403
+ follow_official_norm: false
404
+ periods:
405
+ - 2
406
+ - 3
407
+ - 5
408
+ - 7
409
+ - 11
410
+ period_discriminator_params:
411
+ in_channels: 1
412
+ out_channels: 1
413
+ kernel_sizes:
414
+ - 5
415
+ - 3
416
+ channels: 32
417
+ downsample_scales:
418
+ - 3
419
+ - 3
420
+ - 3
421
+ - 3
422
+ - 1
423
+ max_downsample_channels: 1024
424
+ bias: true
425
+ nonlinear_activation: LeakyReLU
426
+ nonlinear_activation_params:
427
+ negative_slope: 0.1
428
+ use_weight_norm: true
429
+ use_spectral_norm: false
430
+ generator_adv_loss_params:
431
+ average_by_discriminators: false
432
+ loss_type: mse
433
+ discriminator_adv_loss_params:
434
+ average_by_discriminators: false
435
+ loss_type: mse
436
+ feat_match_loss_params:
437
+ average_by_discriminators: false
438
+ average_by_layers: false
439
+ include_final_outputs: true
440
+ mel_loss_params:
441
+ fs: 22050
442
+ n_fft: 1024
443
+ hop_length: 256
444
+ win_length: null
445
+ window: hann
446
+ n_mels: 80
447
+ fmin: 0
448
+ fmax: null
449
+ log_base: null
450
+ lambda_adv: 1.0
451
+ lambda_mel: 45.0
452
+ lambda_feat_match: 2.0
453
+ lambda_dur: 1.0
454
+ lambda_kl: 1.0
455
+ sampling_rate: 22050
456
+ cache_generator_outputs: true
457
+ plot_pred_mos: false
458
+ mos_pred_tool: utmos
459
+ pitch_extract: null
460
+ pitch_extract_conf: {}
461
+ pitch_normalize: null
462
+ pitch_normalize_conf: {}
463
+ energy_extract: null
464
+ energy_extract_conf: {}
465
+ energy_normalize: null
466
+ energy_normalize_conf: {}
467
+ required:
468
+ - output_dir
469
+ - token_list
470
+ version: '202509'
471
+ distributed: true
472
+ ```
473
+
474
+ </details>
475
+
476
+
477
+
478
+ ### Citing ESPnet
479
+
480
+ ```bibtex
481
+ @inproceedings{watanabe2018espnet,
482
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
483
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
484
+ year={2018},
485
+ booktitle={Proceedings of Interspeech},
486
+ pages={2207--2211},
487
+ doi={10.21437/Interspeech.2018-1456},
488
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
489
+ }
490
+
491
+
492
+
493
+
494
+ @inproceedings{hayashi2020espnet,
495
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
496
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
497
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
498
+ pages={7654--7658},
499
+ year={2020},
500
+ organization={IEEE}
501
+ }
502
+
503
+
504
+ ```
505
+
506
+ or arXiv:
507
+
508
+ ```bibtex
509
+ @misc{watanabe2018espnet,
510
+ title={ESPnet: End-to-End Speech Processing Toolkit},
511
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
512
+ year={2018},
513
+ eprint={1804.00015},
514
+ archivePrefix={arXiv},
515
+ primaryClass={cs.CL}
516
+ }
517
+ ```
dump/xvector/dev/spk_xvector.ark ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:affc35e2e715c8760c68d2b4d159714f58364bb52a676e7e92bf45b19daa0089
3
+ size 3895500
dump/xvector/dev/spk_xvector.scp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a4caf62b898a9f08f881d84630d71f020a8a435ebe28bf2cec0585c41b2d8f
3
+ size 282800
dump/xvector/eval/spk_xvector.ark ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:426d19bea97a23337f12434e31be01cf66e712b9ea51aad93d03a54072b1f8a8
3
+ size 84564
dump/xvector/eval/spk_xvector.scp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56cafc9d2e5554f5b360319c000935b41e0fe732cbbb0968d798318efc0456fa
3
+ size 4843
dump/xvector/tr_no_dev/spk_xvector.ark ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42e0e59bdf97f8330d035169652b84e2a5f5d61a57d54a9e4a8a905f514520e5
3
+ size 896806905
dump/xvector/tr_no_dev/spk_xvector.scp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c92bb17872d46b14fbffcd0b85404bc423e83164f5ea474dd1e4add2108fbc5b
3
+ size 74312128
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/785epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a692546fc6873385bd2c8213d55e5a92f979190817b25940799588354b041c86
3
+ size 575103133
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/config.yaml ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_xvector_vits_large_grad_clip_original_batch_size.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en
9
+ ngpu: 1
10
+ seed: 777
11
+ num_workers: 0
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: 4
16
+ dist_rank: 0
17
+ local_rank: 0
18
+ dist_master_addr: localhost
19
+ dist_master_port: 53813
20
+ dist_launcher: null
21
+ multiprocessing_distributed: true
22
+ unused_parameters: true
23
+ sharded_ddp: false
24
+ use_deepspeed: false
25
+ deepspeed_config: null
26
+ gradient_as_bucket_view: true
27
+ ddp_comm_hook: null
28
+ cudnn_enabled: true
29
+ cudnn_benchmark: false
30
+ cudnn_deterministic: false
31
+ use_tf32: false
32
+ collect_stats: false
33
+ write_collected_feats: false
34
+ max_epoch: 800
35
+ patience: null
36
+ val_scheduler_criterion:
37
+ - valid
38
+ - loss
39
+ early_stopping_criterion:
40
+ - valid
41
+ - loss
42
+ - min
43
+ best_model_criterion:
44
+ - - train
45
+ - total_count
46
+ - max
47
+ keep_nbest_models: 10
48
+ nbest_averaging_interval: 0
49
+ grad_clip: 1.0
50
+ grad_clip_type: 2.0
51
+ grad_noise: false
52
+ accum_grad: 1
53
+ no_forward_run: false
54
+ resume: true
55
+ train_dtype: float32
56
+ use_amp: false
57
+ log_interval: 50
58
+ use_matplotlib: true
59
+ use_tensorboard: true
60
+ create_graph_in_tensorboard: false
61
+ use_wandb: false
62
+ wandb_project: null
63
+ wandb_id: null
64
+ wandb_entity: null
65
+ wandb_name: null
66
+ wandb_model_log_interval: -1
67
+ detect_anomaly: false
68
+ use_adapter: false
69
+ adapter: lora
70
+ save_strategy: all
71
+ adapter_conf: {}
72
+ pretrain_path: null
73
+ init_param: []
74
+ ignore_init_mismatch: false
75
+ freeze_param: []
76
+ num_iters_per_epoch: 1000
77
+ batch_size: 20
78
+ valid_batch_size: null
79
+ batch_bins: 8000000
80
+ valid_batch_bins: null
81
+ category_sample_size: 10
82
+ upsampling_factor: 0.5
83
+ category_upsampling_factor: 0.5
84
+ dataset_upsampling_factor: 0.5
85
+ dataset_scaling_factor: 1.2
86
+ max_batch_size: null
87
+ min_batch_size: 1
88
+ train_shape_file:
89
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/text_shape.phn
90
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/train/speech_shape
91
+ valid_shape_file:
92
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/text_shape.phn
93
+ - exp/tts_stats_raw_linear_spectrogram_phn_tacotron_g2p_en/valid/speech_shape
94
+ batch_type: numel
95
+ valid_batch_type: null
96
+ fold_length:
97
+ - 150
98
+ - 204800
99
+ sort_in_batch: descending
100
+ shuffle_within_batch: false
101
+ sort_batch: descending
102
+ multiple_iterator: false
103
+ chunk_length: 500
104
+ chunk_shift_ratio: 0.5
105
+ num_cache_chunks: 1024
106
+ chunk_excluded_key_prefixes: []
107
+ chunk_default_fs: null
108
+ chunk_max_abs_length: null
109
+ chunk_discard_short_samples: true
110
+ train_data_path_and_name_and_type:
111
+ - - dump/raw/tr_no_dev/text
112
+ - text
113
+ - text
114
+ - - dump/raw/tr_no_dev/wav.scp
115
+ - speech
116
+ - sound
117
+ - - dump/xvector/tr_no_dev/xvector.scp
118
+ - spembs
119
+ - kaldi_ark
120
+ valid_data_path_and_name_and_type:
121
+ - - dump/raw/dev/text
122
+ - text
123
+ - text
124
+ - - dump/raw/dev/wav.scp
125
+ - speech
126
+ - sound
127
+ - - dump/xvector/dev/xvector.scp
128
+ - spembs
129
+ - kaldi_ark
130
+ multi_task_dataset: false
131
+ allow_variable_data_keys: false
132
+ max_cache_size: 0.0
133
+ max_cache_fd: 32
134
+ allow_multi_rates: false
135
+ valid_max_cache_size: null
136
+ exclude_weight_decay: false
137
+ exclude_weight_decay_conf: {}
138
+ optim: adamw
139
+ optim_conf:
140
+ lr: 0.0002
141
+ betas:
142
+ - 0.8
143
+ - 0.99
144
+ eps: 1.0e-09
145
+ weight_decay: 0.0
146
+ scheduler: exponentiallr
147
+ scheduler_conf:
148
+ gamma: 0.999
149
+ optim2: adamw
150
+ optim2_conf:
151
+ lr: 0.0002
152
+ betas:
153
+ - 0.8
154
+ - 0.99
155
+ eps: 1.0e-09
156
+ weight_decay: 0.0
157
+ scheduler2: exponentiallr
158
+ scheduler2_conf:
159
+ gamma: 0.999
160
+ generator_first: false
161
+ skip_discriminator_prob: 0.0
162
+ token_list:
163
+ - <blank>
164
+ - <unk>
165
+ - ' '
166
+ - T
167
+ - AH0
168
+ - N
169
+ - S
170
+ - R
171
+ - D
172
+ - L
173
+ - K
174
+ - IH1
175
+ - DH
176
+ - M
177
+ - Z
178
+ - EH1
179
+ - AH1
180
+ - AE1
181
+ - IH0
182
+ - ','
183
+ - W
184
+ - UW1
185
+ - AY1
186
+ - IY1
187
+ - .
188
+ - P
189
+ - V
190
+ - B
191
+ - AA1
192
+ - F
193
+ - ER0
194
+ - IY0
195
+ - EY1
196
+ - AO1
197
+ - OW1
198
+ - HH
199
+ - Y
200
+ - NG
201
+ - G
202
+ - SH
203
+ - AW1
204
+ - JH
205
+ - TH
206
+ - CH
207
+ - ER1
208
+ - UH1
209
+ - '?'
210
+ - OW0
211
+ - EH2
212
+ - IH2
213
+ - EY2
214
+ - AY2
215
+ - OW2
216
+ - AA2
217
+ - AE2
218
+ - UW0
219
+ - EH0
220
+ - OY1
221
+ - AH2
222
+ - AO2
223
+ - AE0
224
+ - ZH
225
+ - AY0
226
+ - UW2
227
+ - AA0
228
+ - IY2
229
+ - AO0
230
+ - '...'
231
+ - AW2
232
+ - '!'
233
+ - EY0
234
+ - UH2
235
+ - ER2
236
+ - ''''
237
+ - AW0
238
+ - UH0
239
+ - OY2
240
+ - OY0
241
+ - '... ...'
242
+ - . ...
243
+ - . .
244
+ - ..
245
+ - . . .
246
+ - . . . .
247
+ - . . . . .
248
+ - '... .'
249
+ - <sos/eos>
250
+ odim: null
251
+ model_conf: {}
252
+ use_preprocessor: true
253
+ token_type: phn
254
+ bpemodel: null
255
+ non_linguistic_symbols: null
256
+ cleaner: tacotron
257
+ g2p: g2p_en
258
+ feats_extract: linear_spectrogram
259
+ feats_extract_conf:
260
+ n_fft: 1024
261
+ hop_length: 256
262
+ win_length: null
263
+ normalize: null
264
+ normalize_conf: {}
265
+ tts: vits
266
+ tts_conf:
267
+ generator_type: vits_generator
268
+ generator_params:
269
+ hidden_channels: 256
270
+ spks: -1
271
+ spk_embed_dim: 192
272
+ global_channels: 256
273
+ segment_size: 32
274
+ text_encoder_attention_heads: 4
275
+ text_encoder_ffn_expand: 4
276
+ text_encoder_blocks: 8
277
+ text_encoder_positionwise_layer_type: conv1d
278
+ text_encoder_positionwise_conv_kernel_size: 3
279
+ text_encoder_positional_encoding_layer_type: rel_pos
280
+ text_encoder_self_attention_layer_type: rel_selfattn
281
+ text_encoder_activation_type: swish
282
+ text_encoder_normalize_before: true
283
+ text_encoder_dropout_rate: 0.1
284
+ text_encoder_positional_dropout_rate: 0.0
285
+ text_encoder_attention_dropout_rate: 0.1
286
+ use_macaron_style_in_text_encoder: true
287
+ use_conformer_conv_in_text_encoder: false
288
+ text_encoder_conformer_kernel_size: -1
289
+ decoder_kernel_size: 7
290
+ decoder_channels: 768
291
+ decoder_upsample_scales:
292
+ - 8
293
+ - 8
294
+ - 2
295
+ - 2
296
+ decoder_upsample_kernel_sizes:
297
+ - 16
298
+ - 16
299
+ - 4
300
+ - 4
301
+ decoder_resblock_kernel_sizes:
302
+ - 3
303
+ - 7
304
+ - 11
305
+ decoder_resblock_dilations:
306
+ - - 1
307
+ - 3
308
+ - 5
309
+ - - 1
310
+ - 3
311
+ - 5
312
+ - - 1
313
+ - 3
314
+ - 5
315
+ use_weight_norm_in_decoder: true
316
+ posterior_encoder_kernel_size: 5
317
+ posterior_encoder_layers: 16
318
+ posterior_encoder_stacks: 1
319
+ posterior_encoder_base_dilation: 1
320
+ posterior_encoder_dropout_rate: 0.0
321
+ use_weight_norm_in_posterior_encoder: true
322
+ flow_flows: 4
323
+ flow_kernel_size: 5
324
+ flow_base_dilation: 1
325
+ flow_layers: 4
326
+ flow_dropout_rate: 0.0
327
+ use_weight_norm_in_flow: true
328
+ use_only_mean_in_flow: true
329
+ stochastic_duration_predictor_kernel_size: 3
330
+ stochastic_duration_predictor_dropout_rate: 0.5
331
+ stochastic_duration_predictor_flows: 4
332
+ stochastic_duration_predictor_dds_conv_layers: 3
333
+ vocabs: 87
334
+ aux_channels: 513
335
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
336
+ discriminator_params:
337
+ scales: 1
338
+ scale_downsample_pooling: AvgPool1d
339
+ scale_downsample_pooling_params:
340
+ kernel_size: 4
341
+ stride: 2
342
+ padding: 2
343
+ scale_discriminator_params:
344
+ in_channels: 1
345
+ out_channels: 1
346
+ kernel_sizes:
347
+ - 15
348
+ - 41
349
+ - 5
350
+ - 3
351
+ channels: 128
352
+ max_downsample_channels: 1024
353
+ max_groups: 16
354
+ bias: true
355
+ downsample_scales:
356
+ - 2
357
+ - 2
358
+ - 4
359
+ - 4
360
+ - 1
361
+ nonlinear_activation: LeakyReLU
362
+ nonlinear_activation_params:
363
+ negative_slope: 0.1
364
+ use_weight_norm: true
365
+ use_spectral_norm: false
366
+ follow_official_norm: false
367
+ periods:
368
+ - 2
369
+ - 3
370
+ - 5
371
+ - 7
372
+ - 11
373
+ period_discriminator_params:
374
+ in_channels: 1
375
+ out_channels: 1
376
+ kernel_sizes:
377
+ - 5
378
+ - 3
379
+ channels: 32
380
+ downsample_scales:
381
+ - 3
382
+ - 3
383
+ - 3
384
+ - 3
385
+ - 1
386
+ max_downsample_channels: 1024
387
+ bias: true
388
+ nonlinear_activation: LeakyReLU
389
+ nonlinear_activation_params:
390
+ negative_slope: 0.1
391
+ use_weight_norm: true
392
+ use_spectral_norm: false
393
+ generator_adv_loss_params:
394
+ average_by_discriminators: false
395
+ loss_type: mse
396
+ discriminator_adv_loss_params:
397
+ average_by_discriminators: false
398
+ loss_type: mse
399
+ feat_match_loss_params:
400
+ average_by_discriminators: false
401
+ average_by_layers: false
402
+ include_final_outputs: true
403
+ mel_loss_params:
404
+ fs: 22050
405
+ n_fft: 1024
406
+ hop_length: 256
407
+ win_length: null
408
+ window: hann
409
+ n_mels: 80
410
+ fmin: 0
411
+ fmax: null
412
+ log_base: null
413
+ lambda_adv: 1.0
414
+ lambda_mel: 45.0
415
+ lambda_feat_match: 2.0
416
+ lambda_dur: 1.0
417
+ lambda_kl: 1.0
418
+ sampling_rate: 22050
419
+ cache_generator_outputs: true
420
+ plot_pred_mos: false
421
+ mos_pred_tool: utmos
422
+ pitch_extract: null
423
+ pitch_extract_conf: {}
424
+ pitch_normalize: null
425
+ pitch_normalize_conf: {}
426
+ energy_extract: null
427
+ energy_extract_conf: {}
428
+ energy_normalize: null
429
+ energy_normalize_conf: {}
430
+ required:
431
+ - output_dir
432
+ - token_list
433
+ version: '202509'
434
+ distributed: true
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_backward_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_fake_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_forward_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_optim_step_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_real_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/discriminator_train_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_adv_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_backward_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_dur_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_feat_match_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_forward_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_kl_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_mel_loss.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_optim_step_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/generator_train_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/iter_time.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/optim0_lr0.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/optim1_lr0.png ADDED
exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202511'
2
+ files:
3
+ model_file: exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/785epoch.pth
4
+ python: 3.11.14 (main, Oct 21 2025, 18:31:21) [GCC 11.2.0]
5
+ timestamp: 1774480619.969943
6
+ torch: 2.7.1+cu126
7
+ yaml_files:
8
+ train_config: exp/tts_train_xvector_vits_large_grad_clip_original_batch_size_raw_phn_tacotron_g2p_en/config.yaml