Ji-Hoon Kim committed on
Commit
d89344e
·
1 Parent(s): 4a23b41

Update model

Browse files
Files changed (26) hide show
  1. README.md +15 -17
  2. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth +3 -0
  3. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml +13 -15
  4. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
  5. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
  6. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
  7. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
  8. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
  9. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
  10. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
  11. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png +0 -0
  12. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
  13. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png +0 -0
  14. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png +0 -0
  15. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
  16. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png +0 -0
  17. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
  18. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png +0 -0
  19. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
  20. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
  21. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
  22. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
  23. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
  24. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
  25. exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
  26. meta.yaml +3 -3
README.md CHANGED
@@ -13,7 +13,7 @@ license: cc-by-4.0
13
 
14
  ### `jihoonk/expresso-vits-espnet2`
15
 
16
- This model was trained by Jihoon Kim using expresso recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
  ### Demo: How to use in ESPnet2
19
 
@@ -22,7 +22,7 @@ if you haven't done that already.
22
 
23
  ```bash
24
  cd espnet
25
- git checkout 1efdaa835178b0ce5034904e29f89f8fc7e0a358
26
  pip install -e .
27
  cd egs2/expresso/tts1
28
  ./run.sh --skip_data_prep false --skip_train true --download_model jihoonk/expresso-vits-espnet2
@@ -53,7 +53,7 @@ dist_world_size: 4
53
  dist_rank: 0
54
  local_rank: 0
55
  dist_master_addr: localhost
56
- dist_master_port: 37393
57
  dist_launcher: null
58
  multiprocessing_distributed: true
59
  unused_parameters: true
@@ -126,7 +126,7 @@ batch_type: numel
126
  valid_batch_type: null
127
  fold_length:
128
  - 150
129
- - 409600
130
  sort_in_batch: descending
131
  shuffle_within_batch: false
132
  sort_batch: descending
@@ -207,16 +207,16 @@ token_list:
207
  - EH1
208
  - W
209
  - DH
210
- - AE1
211
  - UW1
212
  - IH0
213
  - AH1
 
214
  - AA1
215
- - F
216
  - IY1
217
  - AY1
218
- - B
219
  - IY0
 
 
220
  - .
221
  - P
222
  - V
@@ -231,8 +231,8 @@ token_list:
231
  - NG
232
  - G
233
  - AW1
234
- - TH
235
  - '!'
 
236
  - SH
237
  - JH
238
  - UH1
@@ -279,8 +279,8 @@ cleaner: tacotron
279
  g2p: g2p_en_no_space
280
  feats_extract: linear_spectrogram
281
  feats_extract_conf:
282
- n_fft: 2048
283
- hop_length: 512
284
  win_length: null
285
  normalize: null
286
  normalize_conf: {}
@@ -314,13 +314,11 @@ tts_conf:
314
  - 8
315
  - 2
316
  - 2
317
- - 2
318
  decoder_upsample_kernel_sizes:
319
  - 16
320
  - 16
321
  - 4
322
  - 4
323
- - 4
324
  decoder_resblock_kernel_sizes:
325
  - 3
326
  - 7
@@ -354,7 +352,7 @@ tts_conf:
354
  stochastic_duration_predictor_flows: 4
355
  stochastic_duration_predictor_dds_conv_layers: 3
356
  vocabs: 78
357
- aux_channels: 1025
358
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
359
  discriminator_params:
360
  scales: 1
@@ -424,9 +422,9 @@ tts_conf:
424
  average_by_layers: false
425
  include_final_outputs: true
426
  mel_loss_params:
427
- fs: 48000
428
- n_fft: 2048
429
- hop_length: 512
430
  win_length: null
431
  window: hann
432
  n_mels: 80
@@ -438,7 +436,7 @@ tts_conf:
438
  lambda_feat_match: 2.0
439
  lambda_dur: 1.0
440
  lambda_kl: 1.0
441
- sampling_rate: 48000
442
  cache_generator_outputs: true
443
  plot_pred_mos: false
444
  mos_pred_tool: utmos
 
13
 
14
  ### `jihoonk/expresso-vits-espnet2`
15
 
16
+ This model was trained by Ji-Hoon Kim using the expresso recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
  ### Demo: How to use in ESPnet2
19
 
 
22
 
23
  ```bash
24
  cd espnet
25
+ git checkout 5f146d803dbd998af1f830017b6cf558f0e5ccb2
26
  pip install -e .
27
  cd egs2/expresso/tts1
28
  ./run.sh --skip_data_prep false --skip_train true --download_model jihoonk/expresso-vits-espnet2
 
53
  dist_rank: 0
54
  local_rank: 0
55
  dist_master_addr: localhost
56
+ dist_master_port: 57379
57
  dist_launcher: null
58
  multiprocessing_distributed: true
59
  unused_parameters: true
 
126
  valid_batch_type: null
127
  fold_length:
128
  - 150
129
+ - 204800
130
  sort_in_batch: descending
131
  shuffle_within_batch: false
132
  sort_batch: descending
 
207
  - EH1
208
  - W
209
  - DH
 
210
  - UW1
211
  - IH0
212
  - AH1
213
+ - AE1
214
  - AA1
 
215
  - IY1
216
  - AY1
 
217
  - IY0
218
+ - F
219
+ - B
220
  - .
221
  - P
222
  - V
 
231
  - NG
232
  - G
233
  - AW1
 
234
  - '!'
235
+ - TH
236
  - SH
237
  - JH
238
  - UH1
 
279
  g2p: g2p_en_no_space
280
  feats_extract: linear_spectrogram
281
  feats_extract_conf:
282
+ n_fft: 1024
283
+ hop_length: 256
284
  win_length: null
285
  normalize: null
286
  normalize_conf: {}
 
314
  - 8
315
  - 2
316
  - 2
 
317
  decoder_upsample_kernel_sizes:
318
  - 16
319
  - 16
320
  - 4
321
  - 4
 
322
  decoder_resblock_kernel_sizes:
323
  - 3
324
  - 7
 
352
  stochastic_duration_predictor_flows: 4
353
  stochastic_duration_predictor_dds_conv_layers: 3
354
  vocabs: 78
355
+ aux_channels: 513
356
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
357
  discriminator_params:
358
  scales: 1
 
422
  average_by_layers: false
423
  include_final_outputs: true
424
  mel_loss_params:
425
+ fs: 16000
426
+ n_fft: 1024
427
+ hop_length: 256
428
  win_length: null
429
  window: hann
430
  n_mels: 80
 
436
  lambda_feat_match: 2.0
437
  lambda_dur: 1.0
438
  lambda_kl: 1.0
439
+ sampling_rate: 16000
440
  cache_generator_outputs: true
441
  plot_pred_mos: false
442
  mos_pred_tool: utmos
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ad98adde0ec6462ab955669a413f82cabac09797de6808bc04fa1b407f010f7
3
+ size 385969038
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml CHANGED
@@ -16,7 +16,7 @@ dist_world_size: 4
16
  dist_rank: 0
17
  local_rank: 0
18
  dist_master_addr: localhost
19
- dist_master_port: 37393
20
  dist_launcher: null
21
  multiprocessing_distributed: true
22
  unused_parameters: true
@@ -89,7 +89,7 @@ batch_type: numel
89
  valid_batch_type: null
90
  fold_length:
91
  - 150
92
- - 409600
93
  sort_in_batch: descending
94
  shuffle_within_batch: false
95
  sort_batch: descending
@@ -170,16 +170,16 @@ token_list:
170
  - EH1
171
  - W
172
  - DH
173
- - AE1
174
  - UW1
175
  - IH0
176
  - AH1
 
177
  - AA1
178
- - F
179
  - IY1
180
  - AY1
181
- - B
182
  - IY0
 
 
183
  - .
184
  - P
185
  - V
@@ -194,8 +194,8 @@ token_list:
194
  - NG
195
  - G
196
  - AW1
197
- - TH
198
  - '!'
 
199
  - SH
200
  - JH
201
  - UH1
@@ -242,8 +242,8 @@ cleaner: tacotron
242
  g2p: g2p_en_no_space
243
  feats_extract: linear_spectrogram
244
  feats_extract_conf:
245
- n_fft: 2048
246
- hop_length: 512
247
  win_length: null
248
  normalize: null
249
  normalize_conf: {}
@@ -277,13 +277,11 @@ tts_conf:
277
  - 8
278
  - 2
279
  - 2
280
- - 2
281
  decoder_upsample_kernel_sizes:
282
  - 16
283
  - 16
284
  - 4
285
  - 4
286
- - 4
287
  decoder_resblock_kernel_sizes:
288
  - 3
289
  - 7
@@ -317,7 +315,7 @@ tts_conf:
317
  stochastic_duration_predictor_flows: 4
318
  stochastic_duration_predictor_dds_conv_layers: 3
319
  vocabs: 78
320
- aux_channels: 1025
321
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
322
  discriminator_params:
323
  scales: 1
@@ -387,9 +385,9 @@ tts_conf:
387
  average_by_layers: false
388
  include_final_outputs: true
389
  mel_loss_params:
390
- fs: 48000
391
- n_fft: 2048
392
- hop_length: 512
393
  win_length: null
394
  window: hann
395
  n_mels: 80
@@ -401,7 +399,7 @@ tts_conf:
401
  lambda_feat_match: 2.0
402
  lambda_dur: 1.0
403
  lambda_kl: 1.0
404
- sampling_rate: 48000
405
  cache_generator_outputs: true
406
  plot_pred_mos: false
407
  mos_pred_tool: utmos
 
16
  dist_rank: 0
17
  local_rank: 0
18
  dist_master_addr: localhost
19
+ dist_master_port: 57379
20
  dist_launcher: null
21
  multiprocessing_distributed: true
22
  unused_parameters: true
 
89
  valid_batch_type: null
90
  fold_length:
91
  - 150
92
+ - 204800
93
  sort_in_batch: descending
94
  shuffle_within_batch: false
95
  sort_batch: descending
 
170
  - EH1
171
  - W
172
  - DH
 
173
  - UW1
174
  - IH0
175
  - AH1
176
+ - AE1
177
  - AA1
 
178
  - IY1
179
  - AY1
 
180
  - IY0
181
+ - F
182
+ - B
183
  - .
184
  - P
185
  - V
 
194
  - NG
195
  - G
196
  - AW1
 
197
  - '!'
198
+ - TH
199
  - SH
200
  - JH
201
  - UH1
 
242
  g2p: g2p_en_no_space
243
  feats_extract: linear_spectrogram
244
  feats_extract_conf:
245
+ n_fft: 1024
246
+ hop_length: 256
247
  win_length: null
248
  normalize: null
249
  normalize_conf: {}
 
277
  - 8
278
  - 2
279
  - 2
 
280
  decoder_upsample_kernel_sizes:
281
  - 16
282
  - 16
283
  - 4
284
  - 4
 
285
  decoder_resblock_kernel_sizes:
286
  - 3
287
  - 7
 
315
  stochastic_duration_predictor_flows: 4
316
  stochastic_duration_predictor_dds_conv_layers: 3
317
  vocabs: 78
318
+ aux_channels: 513
319
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
320
  discriminator_params:
321
  scales: 1
 
385
  average_by_layers: false
386
  include_final_outputs: true
387
  mel_loss_params:
388
+ fs: 16000
389
+ n_fft: 1024
390
+ hop_length: 256
391
  win_length: null
392
  window: hann
393
  n_mels: 80
 
399
  lambda_feat_match: 2.0
400
  lambda_dur: 1.0
401
  lambda_kl: 1.0
402
+ sampling_rate: 16000
403
  cache_generator_outputs: true
404
  plot_pred_mos: false
405
  mos_pred_tool: utmos
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png CHANGED
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png CHANGED
meta.yaml CHANGED
@@ -1,8 +1,8 @@
1
  espnet: '202503'
2
  files:
3
- model_file: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth
4
- python: "3.9.22 | packaged by conda-forge | (main, Apr 14 2025, 23:35:59) \n[GCC 13.3.0]"
5
- timestamp: 1748636001.403026
6
  torch: 2.4.1+cu124
7
  yaml_files:
8
  train_config: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml
 
1
  espnet: '202503'
2
  files:
3
+ model_file: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth
4
+ python: 3.10.18 | packaged by conda-forge | (main, Jun 4 2025, 14:45:41) [GCC 13.3.0]
5
+ timestamp: 1752098916.664364
6
  torch: 2.4.1+cu124
7
  yaml_files:
8
  train_config: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml