Ji-Hoon Kim committed on Jul 9, 2025

Commit

d89344e

1 Parent(s): 4a23b41

Update model

Files changed (26) hide show

README.md +15 -17
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth +3 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml +13 -15
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png +0 -0
exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png +0 -0
meta.yaml +3 -3

README.md CHANGED Viewed

@@ -13,7 +13,7 @@ license: cc-by-4.0
 ### `jihoonk/expresso-vits-espnet2`
-This model was trained by Jihoon Kim using expresso recipe in [espnet](https://github.com/espnet/espnet/).
 ### Demo: How to use in ESPnet2
@@ -22,7 +22,7 @@ if you haven't done that already.
 ```bash
 cd espnet
-git checkout 1efdaa835178b0ce5034904e29f89f8fc7e0a358
 pip install -e .
 cd egs2/expresso/tts1
 ./run.sh --skip_data_prep false --skip_train true --download_model jihoonk/expresso-vits-espnet2
@@ -53,7 +53,7 @@ dist_world_size: 4
 dist_rank: 0
 local_rank: 0
 dist_master_addr: localhost
-dist_master_port: 37393
 dist_launcher: null
 multiprocessing_distributed: true
 unused_parameters: true
@@ -126,7 +126,7 @@ batch_type: numel
 valid_batch_type: null
 fold_length:
 - 150
-- 409600
 sort_in_batch: descending
 shuffle_within_batch: false
 sort_batch: descending
@@ -207,16 +207,16 @@ token_list:
 - EH1
 - W
 - DH
-- AE1
 - UW1
 - IH0
 - AH1
 - AA1
-- F
 - IY1
 - AY1
-- B
 - IY0
 - .
 - P
 - V
@@ -231,8 +231,8 @@ token_list:
 - NG
 - G
 - AW1
-- TH
 - '!'
 - SH
 - JH
 - UH1
@@ -279,8 +279,8 @@ cleaner: tacotron
 g2p: g2p_en_no_space
 feats_extract: linear_spectrogram
 feats_extract_conf:
-    n_fft: 2048
-    hop_length: 512
     win_length: null
 normalize: null
 normalize_conf: {}
@@ -314,13 +314,11 @@ tts_conf:
         - 8
         - 2
         - 2
-        - 2
         decoder_upsample_kernel_sizes:
         - 16
         - 16
         - 4
         - 4
-        - 4
         decoder_resblock_kernel_sizes:
         - 3
         - 7
@@ -354,7 +352,7 @@ tts_conf:
         stochastic_duration_predictor_flows: 4
         stochastic_duration_predictor_dds_conv_layers: 3
         vocabs: 78
-        aux_channels: 1025
     discriminator_type: hifigan_multi_scale_multi_period_discriminator
     discriminator_params:
         scales: 1
@@ -424,9 +422,9 @@ tts_conf:
         average_by_layers: false
         include_final_outputs: true
     mel_loss_params:
-        fs: 48000
-        n_fft: 2048
-        hop_length: 512
         win_length: null
         window: hann
         n_mels: 80
@@ -438,7 +436,7 @@ tts_conf:
     lambda_feat_match: 2.0
     lambda_dur: 1.0
     lambda_kl: 1.0
-    sampling_rate: 48000
     cache_generator_outputs: true
     plot_pred_mos: false
     mos_pred_tool: utmos

 ### `jihoonk/expresso-vits-espnet2`
+This model was trained by Ji-Hoon Kim using expresso recipe in [espnet](https://github.com/espnet/espnet/).
 ### Demo: How to use in ESPnet2
 ```bash
 cd espnet
+git checkout 5f146d803dbd998af1f830017b6cf558f0e5ccb2
 pip install -e .
 cd egs2/expresso/tts1
 ./run.sh --skip_data_prep false --skip_train true --download_model jihoonk/expresso-vits-espnet2
 dist_rank: 0
 local_rank: 0
 dist_master_addr: localhost
+dist_master_port: 57379
 dist_launcher: null
 multiprocessing_distributed: true
 unused_parameters: true
 valid_batch_type: null
 fold_length:
 - 150
+- 204800
 sort_in_batch: descending
 shuffle_within_batch: false
 sort_batch: descending
 - EH1
 - W
 - DH
 - UW1
 - IH0
 - AH1
+- AE1
 - AA1
 - IY1
 - AY1
 - IY0
+- F
+- B
 - .
 - P
 - V
 - NG
 - G
 - AW1
 - '!'
+- TH
 - SH
 - JH
 - UH1
 g2p: g2p_en_no_space
 feats_extract: linear_spectrogram
 feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
     win_length: null
 normalize: null
 normalize_conf: {}
         - 8
         - 2
         - 2
         decoder_upsample_kernel_sizes:
         - 16
         - 16
         - 4
         - 4
         decoder_resblock_kernel_sizes:
         - 3
         - 7
         stochastic_duration_predictor_flows: 4
         stochastic_duration_predictor_dds_conv_layers: 3
         vocabs: 78
+        aux_channels: 513
     discriminator_type: hifigan_multi_scale_multi_period_discriminator
     discriminator_params:
         scales: 1
         average_by_layers: false
         include_final_outputs: true
     mel_loss_params:
+        fs: 16000
+        n_fft: 1024
+        hop_length: 256
         win_length: null
         window: hann
         n_mels: 80
     lambda_feat_match: 2.0
     lambda_dur: 1.0
     lambda_kl: 1.0
+    sampling_rate: 16000
     cache_generator_outputs: true
     plot_pred_mos: false
     mos_pred_tool: utmos

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2ad98adde0ec6462ab955669a413f82cabac09797de6808bc04fa1b407f010f7
+size 385969038

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml CHANGED Viewed

@@ -16,7 +16,7 @@ dist_world_size: 4
 dist_rank: 0
 local_rank: 0
 dist_master_addr: localhost
-dist_master_port: 37393
 dist_launcher: null
 multiprocessing_distributed: true
 unused_parameters: true
@@ -89,7 +89,7 @@ batch_type: numel
 valid_batch_type: null
 fold_length:
 - 150
-- 409600
 sort_in_batch: descending
 shuffle_within_batch: false
 sort_batch: descending
@@ -170,16 +170,16 @@ token_list:
 - EH1
 - W
 - DH
-- AE1
 - UW1
 - IH0
 - AH1
 - AA1
-- F
 - IY1
 - AY1
-- B
 - IY0
 - .
 - P
 - V
@@ -194,8 +194,8 @@ token_list:
 - NG
 - G
 - AW1
-- TH
 - '!'
 - SH
 - JH
 - UH1
@@ -242,8 +242,8 @@ cleaner: tacotron
 g2p: g2p_en_no_space
 feats_extract: linear_spectrogram
 feats_extract_conf:
-    n_fft: 2048
-    hop_length: 512
     win_length: null
 normalize: null
 normalize_conf: {}
@@ -277,13 +277,11 @@ tts_conf:
         - 8
         - 2
         - 2
-        - 2
         decoder_upsample_kernel_sizes:
         - 16
         - 16
         - 4
         - 4
-        - 4
         decoder_resblock_kernel_sizes:
         - 3
         - 7
@@ -317,7 +315,7 @@ tts_conf:
         stochastic_duration_predictor_flows: 4
         stochastic_duration_predictor_dds_conv_layers: 3
         vocabs: 78
-        aux_channels: 1025
     discriminator_type: hifigan_multi_scale_multi_period_discriminator
     discriminator_params:
         scales: 1
@@ -387,9 +385,9 @@ tts_conf:
         average_by_layers: false
         include_final_outputs: true
     mel_loss_params:
-        fs: 48000
-        n_fft: 2048
-        hop_length: 512
         win_length: null
         window: hann
         n_mels: 80
@@ -401,7 +399,7 @@ tts_conf:
     lambda_feat_match: 2.0
     lambda_dur: 1.0
     lambda_kl: 1.0
-    sampling_rate: 48000
     cache_generator_outputs: true
     plot_pred_mos: false
     mos_pred_tool: utmos

 dist_rank: 0
 local_rank: 0
 dist_master_addr: localhost
+dist_master_port: 57379
 dist_launcher: null
 multiprocessing_distributed: true
 unused_parameters: true
 valid_batch_type: null
 fold_length:
 - 150
+- 204800
 sort_in_batch: descending
 shuffle_within_batch: false
 sort_batch: descending
 - EH1
 - W
 - DH
 - UW1
 - IH0
 - AH1
+- AE1
 - AA1
 - IY1
 - AY1
 - IY0
+- F
+- B
 - .
 - P
 - V
 - NG
 - G
 - AW1
 - '!'
+- TH
 - SH
 - JH
 - UH1
 g2p: g2p_en_no_space
 feats_extract: linear_spectrogram
 feats_extract_conf:
+    n_fft: 1024
+    hop_length: 256
     win_length: null
 normalize: null
 normalize_conf: {}
         - 8
         - 2
         - 2
         decoder_upsample_kernel_sizes:
         - 16
         - 16
         - 4
         - 4
         decoder_resblock_kernel_sizes:
         - 3
         - 7
         stochastic_duration_predictor_flows: 4
         stochastic_duration_predictor_dds_conv_layers: 3
         vocabs: 78
+        aux_channels: 513
     discriminator_type: hifigan_multi_scale_multi_period_discriminator
     discriminator_params:
         scales: 1
         average_by_layers: false
         include_final_outputs: true
     mel_loss_params:
+        fs: 16000
+        n_fft: 1024
+        hop_length: 256
         win_length: null
         window: hann
         n_mels: 80
     lambda_feat_match: 2.0
     lambda_dur: 1.0
     lambda_kl: 1.0
+    sampling_rate: 16000
     cache_generator_outputs: true
     plot_pred_mos: false
     mos_pred_tool: utmos

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_backward_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_fake_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_forward_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_optim_step_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_real_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/discriminator_train_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_adv_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_backward_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_dur_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_feat_match_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_forward_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_kl_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_mel_loss.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_optim_step_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/generator_train_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/gpu_max_cached_mem_GB.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim0_lr0.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/optim1_lr0.png CHANGED Viewed

exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/images/train_time.png CHANGED Viewed

meta.yaml CHANGED Viewed

@@ -1,8 +1,8 @@
 espnet: '202503'
 files:
-  model_file: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/train.total_count.ave_10best.pth
-python: "3.9.22 | packaged by conda-forge | (main, Apr 14 2025, 23:35:59) \n[GCC 13.3.0]"
-timestamp: 1748636001.403026
 torch: 2.4.1+cu124
 yaml_files:
   train_config: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml

 espnet: '202503'
 files:
+  model_file: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/500epoch.pth
+python: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:45:41) [GCC 13.3.0]
+timestamp: 1752098916.664364
 torch: 2.4.1+cu124
 yaml_files:
   train_config: exp/tts_train_full_band_multi_spk_vits_raw_phn_tacotron_g2p_en_no_space/config.yaml